* [PATCH 0/5] xfs: various log stuff...
@ 2021-01-28  4:41 Dave Chinner
  2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
                   ` (5 more replies)
  0 siblings, 6 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

Hi folks,

Quick patch dump for y'all. A couple of minor cleanups to the
log behaviour, a fix for the CIL throttle hang and a couple of
patches to rework the cache flushing that journal IO does to reduce
the number of cache flushes by a couple of orders of magnitude.

It all passes fstests with no regressions, there are no performance
regressions from fsmark, dbench and various fio workloads, and there
are some big gains even on fast storage.

Cheers,

Dave.



* [PATCH 1/5] xfs: log stripe roundoff is a property of the log
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
@ 2021-01-28  4:41 ` Dave Chinner
  2021-01-28 14:57   ` Brian Foster
  2021-01-28 21:25   ` Darrick J. Wong
  2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

We don't need to look at the xfs_mount and superblock every time we
need to do an iclog roundoff calculation. The property is fixed for
the life of the log, so store the roundoff in the log at mount time
and use that everywhere.

On a debug build:

$ size fs/xfs/xfs_log.o.*
   text	   data	    bss	    dec	    hex	filename
  27360	    560	      8	  27928	   6d18	fs/xfs/xfs_log.o.orig
  27219	    560	      8	  27787	   6c8b	fs/xfs/xfs_log.o.patched
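
For illustration, a condensed sketch of the change in the diff below:
the padding granularity is decided once at mount time, and every
former XLOG_BTOLSUNIT/XLOG_LSUNITTOB user collapses to a plain
roundup():

        /* mount time (xlog_alloc_log): fix the padding granularity once */
        if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
                log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
        else
                log->l_iclog_roundoff = BBSIZE;

        /* all iclog roundoff calculations then become */
        count = roundup(count_init, log->l_iclog_roundoff);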

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_log_format.h |  3 --
 fs/xfs/xfs_log.c               | 60 +++++++++++++++-------------------
 fs/xfs/xfs_log_priv.h          |  2 ++
 3 files changed, 28 insertions(+), 37 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8bd00da6d2a4..16587219549c 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t;
 #define XLOG_MIN_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
 #define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
 #define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
-                                 (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
 
 #define XLOG_HEADER_SIZE	512
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 58699881c100..c5f507c24577 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1400,6 +1400,12 @@ xlog_alloc_log(
 	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
 	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
 
+	/* roundoff padding for transaction data and one for commit record */
+	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+		log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
+	else
+		log->l_iclog_roundoff = BBSIZE;
+
 	xlog_grant_head_init(&log->l_reserve_head);
 	xlog_grant_head_init(&log->l_write_head);
 
@@ -1852,29 +1858,15 @@ xlog_calc_iclog_size(
 	uint32_t		*roundoff)
 {
 	uint32_t		count_init, count;
-	bool			use_lsunit;
-
-	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
-			log->l_mp->m_sb.sb_logsunit > 1;
 
 	/* Add for LR header */
 	count_init = log->l_iclog_hsize + iclog->ic_offset;
+	count = roundup(count_init, log->l_iclog_roundoff);
 
-	/* Round out the log write size */
-	if (use_lsunit) {
-		/* we have a v2 stripe unit to use */
-		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
-	} else {
-		count = BBTOB(BTOBB(count_init));
-	}
-
-	ASSERT(count >= count_init);
 	*roundoff = count - count_init;
 
-	if (use_lsunit)
-		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
-	else
-		ASSERT(*roundoff < BBTOB(1));
+	ASSERT(count >= count_init);
+	ASSERT(*roundoff < log->l_iclog_roundoff);
 	return count;
 }
 
@@ -3149,10 +3141,9 @@ xlog_state_switch_iclogs(
 	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
 
 	/* Round up to next log-sunit */
-	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
-	    log->l_mp->m_sb.sb_logsunit > 1) {
-		uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
-		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
+	if (log->l_iclog_roundoff > BBSIZE) {
+		log->l_curr_block = roundup(log->l_curr_block,
+						BTOBB(log->l_iclog_roundoff));
 	}
 
 	if (log->l_curr_block >= log->l_logBBsize) {
@@ -3404,12 +3395,11 @@ xfs_log_ticket_get(
  * Figure out the total log space unit (in bytes) that would be
  * required for a log ticket.
  */
-int
-xfs_log_calc_unit_res(
-	struct xfs_mount	*mp,
+static int
+xlog_calc_unit_res(
+	struct xlog		*log,
 	int			unit_bytes)
 {
-	struct xlog		*log = mp->m_log;
 	int			iclog_space;
 	uint			num_headers;
 
@@ -3485,18 +3475,20 @@ xfs_log_calc_unit_res(
 	/* for commit-rec LR header - note: padding will subsume the ophdr */
 	unit_bytes += log->l_iclog_hsize;
 
-	/* for roundoff padding for transaction data and one for commit record */
-	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
-		/* log su roundoff */
-		unit_bytes += 2 * mp->m_sb.sb_logsunit;
-	} else {
-		/* BB roundoff */
-		unit_bytes += 2 * BBSIZE;
-        }
+	/* roundoff padding for transaction data and one for commit record */
+	unit_bytes += log->l_iclog_roundoff;
 
 	return unit_bytes;
 }
 
+int
+xfs_log_calc_unit_res(
+	struct xfs_mount	*mp,
+	int			unit_bytes)
+{
+	return xlog_calc_unit_res(mp->m_log, unit_bytes);
+}
+
 /*
  * Allocate and initialise a new log ticket.
  */
@@ -3513,7 +3505,7 @@ xlog_ticket_alloc(
 
 	tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
 
-	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+	unit_res = xlog_calc_unit_res(log, unit_bytes);
 
 	atomic_set(&tic->t_ref, 1);
 	tic->t_task		= current;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1c6fdbf3d506..037950cf1061 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -436,6 +436,8 @@ struct xlog {
 #endif
 	/* log recovery lsn tracking (for buffer submission */
 	xfs_lsn_t		l_recovery_lsn;
+
+	uint32_t		l_iclog_roundoff;/* padding roundoff */
 };
 
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
-- 
2.28.0



* [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
  2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
@ 2021-01-28  4:41 ` Dave Chinner
  2021-01-28 15:07   ` Brian Foster
                     ` (2 more replies)
  2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
                   ` (3 subsequent siblings)
  5 siblings, 3 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

To allow for iclog IO device cache flush behaviour to be optimised,
we first need to separate out the commit record iclog IO from the
rest of the checkpoint so we can wait for the checkpoint IO to
complete before we issue the commit record.

This separation is only necessary if the commit record is being
written into a different iclog to the start of the checkpoint. If
the entire checkpoint and commit is in the one iclog, then they are
both covered by the one set of cache flush primitives on the iclog
and hence there is no need to separate them.

Otherwise, we need to wait for all the previous iclogs to complete
so they are ordered correctly and made stable by the REQ_PREFLUSH
that the commit record iclog IO issues. This guarantees that if a
reader sees the commit record in the journal, they will also see the
entire checkpoint that commit record closes off.

This also provides the guarantee that when the commit record IO
completes, we can safely unpin all the log items in the checkpoint
so they can be written back because the entire checkpoint is stable
in the journal.
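
For illustration, the push-side ordering in the xlog_cil_push_work()
hunk below condenses to this sketch; the wait is only needed when the
commit record iclog differs from the iclog the checkpoint started in:

        /*
         * Checkpoint spans multiple iclogs: wait for all previous iclog
         * IO to complete before submitting the commit record iclog.
         */
        if (ctx->start_lsn != commit_lsn)
                xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);

        xfs_log_release_iclog(commit_iclog);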

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c      | 47 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_log_cil.c  |  7 +++++++
 fs/xfs/xfs_log_priv.h |  2 ++
 3 files changed, 56 insertions(+)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c5f507c24577..c5e3da23961c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -808,6 +808,53 @@ xlog_wait_on_iclog(
 	return 0;
 }
 
+/*
+ * Wait on any iclogs that are still flushing in the range of start_lsn to
+ * the current iclog's lsn. The caller holds a reference to the iclog, but
+ * otherwise holds no log locks.
+ *
+ * We walk backwards through the iclogs to find the iclog with the highest lsn
+ * in the range that we need to wait for and then wait for it to complete.
+ * Completion ordering of iclog IOs ensures that all prior iclogs to this IO are
+ * complete by the time our candidate has completed.
+ */
+int
+xlog_wait_on_iclog_lsn(
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		start_lsn)
+{
+	struct xlog		*log = iclog->ic_log;
+	struct xlog_in_core	*prev;
+	int			error = -EIO;
+
+	spin_lock(&log->l_icloglock);
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto out_unlock;
+
+	error = 0;
+	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
+
+		/* Done if the lsn is before our start lsn */
+		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
+				start_lsn) < 0)
+			break;
+
+		/* Don't need to wait on completed, clean iclogs */
+		if (prev->ic_state == XLOG_STATE_DIRTY ||
+		    prev->ic_state == XLOG_STATE_ACTIVE) {
+			continue;
+		}
+
+		/* wait for completion on this iclog */
+		xlog_wait(&prev->ic_force_wait, &log->l_icloglock);
+		return 0;
+	}
+
+out_unlock:
+	spin_unlock(&log->l_icloglock);
+	return error;
+}
+
 /*
  * Write out an unmount record using the ticket provided. We have to account for
  * the data space used in the unmount ticket as this write is not done from a
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b0ef071b3cb5..c5cc1b7ad25e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -870,6 +870,13 @@ xlog_cil_push_work(
 	wake_up_all(&cil->xc_commit_wait);
 	spin_unlock(&cil->xc_push_lock);
 
+	/*
+	 * If the checkpoint spans multiple iclogs, wait for all previous
+	 * iclogs to complete before we submit the commit_iclog.
+	 */
+	if (ctx->start_lsn != commit_lsn)
+		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
+
 	/* release the hounds! */
 	xfs_log_release_iclog(commit_iclog);
 	return;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 037950cf1061..a7ac85aaff4e 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -584,6 +584,8 @@ xlog_wait(
 	remove_wait_queue(wq, &wait);
 }
 
+int xlog_wait_on_iclog_lsn(struct xlog_in_core *iclog, xfs_lsn_t start_lsn);
+
 /*
  * The LSN is valid so long as it is behind the current LSN. If it isn't, this
  * means that the next log record that includes this metadata could have a
-- 
2.28.0



* [PATCH 3/5] xfs: journal IO cache flush reductions
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
  2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
  2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
@ 2021-01-28  4:41 ` Dave Chinner
  2021-01-28 15:12   ` Brian Foster
                     ` (2 more replies)
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

From: Steve Lord <lord@sgi.com>

Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to
guarantee the ordering requirements the journal has w.r.t. metadata
writeback. The two ordering constraints are:

1. we cannot overwrite metadata in the journal until we guarantee
that the dirty metadata has been written back in place and is
stable.

2. we cannot write back dirty metadata until it has been written to
the journal and guaranteed to be stable (and hence recoverable) in
the journal.

The ordering guarantees of #1 are provided by REQ_PREFLUSH. This
causes the journal IO to issue a cache flush and wait for it to
complete before issuing the write IO to the journal. Hence all
completed metadata IO is guaranteed to be stable before the journal
overwrites the old metadata.

The ordering guarantees of #2 are provided by the REQ_FUA, which
ensures the journal writes do not complete until they are on stable
storage. Hence by the time the last journal IO in a checkpoint
completes, we know that the entire checkpoint is on stable storage
and we can unpin the dirty metadata and allow it to be written back.

This is the mechanism by which ordering was first implemented in XFS
way back in 2002 by this commit:

commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96
Author: Steve Lord <lord@sgi.com>
Date:   Fri May 24 14:30:21 2002 +0000

    Add support for drive write cache flushing - should the kernel
    have the infrastructure

A lot has changed since then, most notably we now use delayed
logging to checkpoint the filesystem to the journal rather than
write each individual transaction to the journal. Cache flushes on
journal IO are necessary when individual transactions are wholly
contained within a single iclog. However, CIL checkpoints are single
transactions that typically span hundreds to thousands of individual
journal writes, and so the requirements for device cache flushing
have changed.

That is, the ordering rules I state above apply to ordering of
atomic transactions recorded in the journal, not to the journal IO
itself. Hence we need to ensure metadata is stable before we start
writing a new transaction to the journal (guarantee #1), and we need
to ensure the entire transaction is stable in the journal before we
start metadata writeback (guarantee #2).

Hence we only need a REQ_PREFLUSH on the journal IO that starts a
new journal transaction to provide #1, and it is not needed on any
other journal IO done within the context of that journal transaction.

To ensure that the entire journal transaction is on stable storage
before we run the completion code that unpins all the dirty metadata
recorded in the journal transaction, the last write of the
transaction must also ensure that the entire journal transaction is
stable. We already know what IO that will be, thanks to the commit
record we explicitly write to complete the transaction. We can order
all the previous journal IO for this transaction by waiting for all
the previous iclogs containing the transaction data to complete
their IO, then issuing the commit record IO using REQ_PREFLUSH
| REQ_FUA. The preflush ensures all the previous journal IO is
stable before the commit record hits the log, and the REQ_FUA
ensures that the commit record is stable before completion is
signalled to the filesystem.

Hence using REQ_PREFLUSH on the first IO of a journal transaction,
and then ordering the journal IO before issuing the commit record
with REQ_PREFLUSH | REQ_FUA, we get all the same ordering guarantees
that we currently achieve by issuing all journal IO with cache
flushes.

As an optimisation, when the commit record lands in the same iclog
as the journal transaction starts, we don't need to wait for
anything and can simply issue the journal IO with REQ_PREFLUSH |
REQ_FUA as we do right now. This means that for fsync() heavy
workloads, the cache flush behaviour is completely unchanged and
there is no degradation in performance as a result of optimising the
multi-IO transaction case.

To further simplify the implementation, we also issue the initial IO
in a journal transaction with REQ_FUA. This ensures the journal is
dirtied by the first IO in a long running transaction as quickly as
possible. This helps ensure that log recovery will at least have a
transaction header for the incomplete transaction in the log similar
to the stable journal write behaviour we have now.
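
For illustration, the per-iclog flush decision this results in is
sketched below (condensed from the xlog_write_iclog() hunk in the
diff; XLOG_ICL_NEED_FLUSH is set on the first iclog of a checkpoint
and on the commit record iclog):

        iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
        if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
                /* checkpoint start or commit record: flush + FUA */
                iclog->ic_bio.bi_opf |= REQ_PREFLUSH | REQ_FUA;
                iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
        }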

The most notable sign that there is less IO latency on my test
machine (nvme SSDs) is that the "noiclogs" rate has dropped
substantially. This metric indicates that the CIL push is blocking
in xlog_get_iclog_space() waiting for iclog IO completion to occur.
With 8 iclogs of 256kB, the rate is approximately 1 noiclog event to
every 4 iclog writes. IOWs, every 4th call to xlog_get_iclog_space()
is blocking waiting for log IO. With the changes in this patch, this
drops to 1 noiclog event for every 100 iclog writes. Hence it is
clear that log IO is completing much faster than it was previously,
but it is also clear that for large iclog sizes, this isn't the
performance limiting factor on this hardware.

With smaller iclogs (32kB), however, there is a substantial
difference. With the cache flush modifications, the journal is now
running at over 4000 write IOPS, and the journal throughput is
largely identical to the 256kB iclogs and the noiclog event rate
stays low at about 1:50 iclog writes. The existing code tops out at
about 2500 IOPS as the number of cache flushes dominate performance
and latency. The noiclog event rate is about 1:4, and the
performance variance is quite large as the journal throughput can
fall to less than half the peak sustained rate when the cache flush
rate prevents metadata writeback from keeping up and the log runs
out of space and throttles reservations.

As a result:

	logbsize	fsmark create rate	rm -rf
before	32kb		152851+/-5.3e+04	5m28s
patched	32kb		221533+/-1.1e+04	5m24s

before	256kb		220239+/-6.2e+03	4m58s
patched	256kb		228286+/-9.2e+03	5m06s

The rm -rf times are included because I ran them, but the
differences are largely noise. This workload is largely metadata
read IO latency bound and the changes to the journal cache flushing
don't really make any noticeable difference to behaviour apart from
a reduction in noiclog events from background CIL pushing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c      | 34 ++++++++++++++++++++++------------
 fs/xfs/xfs_log_priv.h |  3 +++
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c5e3da23961c..8de93893e0e6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1804,8 +1804,7 @@ xlog_write_iclog(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog,
 	uint64_t		bno,
-	unsigned int		count,
-	bool			need_flush)
+	unsigned int		count)
 {
 	ASSERT(bno < log->l_logBBsize);
 
@@ -1843,10 +1842,11 @@ xlog_write_iclog(
 	 * writeback throttle from throttling log writes behind background
 	 * metadata writeback and causing priority inversions.
 	 */
-	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
-				REQ_IDLE | REQ_FUA;
-	if (need_flush)
-		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
+		iclog->ic_bio.bi_opf |= REQ_PREFLUSH | REQ_FUA;
+		iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
+	}
 
 	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
 		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1949,7 +1949,7 @@ xlog_sync(
 	unsigned int		roundoff;       /* roundoff to BB or stripe */
 	uint64_t		bno;
 	unsigned int		size;
-	bool			need_flush = true, split = false;
+	bool			split = false;
 
 	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 
@@ -2007,13 +2007,14 @@ xlog_sync(
 	 * synchronously here; for an internal log we can simply use the block
 	 * layer state machine for preflushes.
 	 */
-	if (log->l_targ != log->l_mp->m_ddev_targp || split) {
+	if (log->l_targ != log->l_mp->m_ddev_targp ||
+	    (split && (iclog->ic_flags & XLOG_ICL_NEED_FLUSH))) {
 		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
-		need_flush = false;
+		iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
 	}
 
 	xlog_verify_iclog(log, iclog, count);
-	xlog_write_iclog(log, iclog, bno, count, need_flush);
+	xlog_write_iclog(log, iclog, bno, count);
 }
 
 /*
@@ -2464,9 +2465,18 @@ xlog_write(
 		ASSERT(log_offset <= iclog->ic_size - 1);
 		ptr = iclog->ic_datap + log_offset;
 
-		/* start_lsn is the first lsn written to. That's all we need. */
-		if (!*start_lsn)
+		/*
+		 * Start_lsn is the first lsn written to. That's all the caller
+		 * needs to have returned. Setting it indicates the first iclog
+		 * of a new checkpoint or the commit record for a checkpoint, so
+		 * also mark the iclog as requiring a pre-flush to ensure all
+		 * metadata writeback or journal IO in the checkpoint is
+		 * correctly ordered against this new log write.
+		 */
+		if (!*start_lsn) {
 			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+		}
 
 		/*
 		 * This loop writes out as many regions as can fit in the amount
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index a7ac85aaff4e..9f1e627ccb74 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -133,6 +133,8 @@ enum xlog_iclog_state {
 
 #define XLOG_COVER_OPS		5
 
+#define XLOG_ICL_NEED_FLUSH     (1 << 0)        /* iclog needs REQ_PREFLUSH */
+
 /* Ticket reservation region accounting */ 
 #define XLOG_TIC_LEN_MAX	15
 
@@ -201,6 +203,7 @@ typedef struct xlog_in_core {
 	u32			ic_size;
 	u32			ic_offset;
 	enum xlog_iclog_state	ic_state;
+	unsigned int		ic_flags;
 	char			*ic_datap;	/* pointer to iclog data */
 
 	/* Callback structures need their own cacheline */
-- 
2.28.0



* [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
                   ` (2 preceding siblings ...)
  2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
@ 2021-01-28  4:41 ` Dave Chinner
  2021-01-28 16:53   ` Brian Foster
                     ` (3 more replies)
  2021-01-28  4:41 ` [PATCH 5/5] xfs: reduce buffer log item shadow allocations Dave Chinner
  2021-02-01 12:39 ` [PATCH 0/5] xfs: various log stuff Christoph Hellwig
  5 siblings, 4 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

A hang with tasks stuck on the CIL hard throttle was reported and
largely diagnosed by Donald Buczek, who discovered that it was a
result of the CIL context space usage decrementing in committed
transactions once the hard throttle limit had been hit and processes
were already blocked.  This resulted in the CIL push not waking up
those waiters because the CIL context was no longer over the hard
throttle limit.

The surprising aspect of this was the CIL space usage going
backwards regularly enough to trigger this situation. Assumptions
had been made in design that the relogging process would only
increase the size of the objects in the CIL, and so that space would
only increase.

This change and commit message fix the issue and document the
result of an audit of the triggers that can cause the CIL space to
go backwards, how large the backwards steps tend to be, the
frequency in which they occur, and what the impact on the CIL
accounting code is.

Even though the CIL ctx->space_used can go backwards, it will only
do so if the log item is already logged to the CIL and contains a
space reservation for its entire logged state. This is tracked by
the shadow buffer state on the log item. If the item is not
previously logged in the CIL it has no shadow buffer nor log vector,
and hence the entire size of the logged item copied to the log
vector is accounted to the CIL space usage. i.e.  it will always go
up in this case.

If the item has a log vector (i.e. already in the CIL) and the size
decreases, then the existing log vector will be overwritten and the
space usage will go down. This is the only condition where the space
usage reduces, and it can only occur when an item is already tracked
in the CIL. Hence we are safe from CIL space usage underruns as a
result of log items decreasing in size when they are relogged.

Typically this reduction in CIL usage occurs from metadata blocks
being freed, such as when a btree block merge occurs or a directory
entry/xattr entry is removed and the da-tree
is reduced in size. This generally results in a reduction in size of
around a single block in the CIL, but also tends to increase the
number of log vectors because the parent and sibling nodes in the
tree need to be updated when a btree block is removed. If a
multi-level merge occurs, then we see reduction in size of 2+
blocks, but again the log vector count goes up.

The other vector is inode fork size changes, which only log the
current size of the fork and ignore the previously logged size when
the fork is relogged. Hence if we are removing items from the inode
fork (dir/xattr removal in shortform, extent record removal in
extent form, etc), the relogged size of the inode fork can decrease.

No other log items can decrease in size either because they are a
fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging
an intent actually creates a new intent log item and doesn't relog
the old item at all.) Hence the only two vectors for CIL context
size reduction are relogging inode forks and marking buffers active
in the CIL as stale.

Long story short: the majority of the code does the right thing and
handles the reduction in log item size correctly, and only the CIL
hard throttle implementation is problematic and needs fixing. This
patch makes that fix, as well as adds comments in the log item code
that result in items shrinking in size when they are relogged as a
clear reminder that this can and does happen frequently.

The throttle fix is based upon the change Donald proposed, though it
goes further to ensure that once the throttle is activated, it
captures all tasks until the CIL push issues a wakeup, regardless of
whether the CIL space used has gone back under the throttle
threshold.

This ensures that we prevent tasks reducing the CIL slightly under
the throttle threshold and then making more changes that push it
well over the throttle limit. This is achieved by checking if the
throttle wait queue is already active as a condition of throttling.
Hence once we start throttling, we continue to apply the throttle
until the CIL context push wakes everything on the wait queue.

We can use waitqueue_active() for the waitqueue manipulations and
checks as they are all done under the ctx->xc_push_lock. Hence the
waitqueue has external serialisation and we can safely peek inside
the wait queue without holding the internal waitqueue locks.
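
For illustration, the throttle check this describes condenses to the
sketch below (from the xlog_cil_push_background() hunk in the diff;
the push lock is held here, so the lockless waitqueue_active() check
is safe):

        if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
            waitqueue_active(&cil->xc_push_wait)) {
                trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
                xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
        }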

Many thanks to Donald for his diagnostic and analysis work to
isolate the cause of this hang.

Reported-by: Donald Buczek <buczek@molgen.mpg.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf_item.c   | 37 ++++++++++++++++++-------------------
 fs/xfs/xfs_inode_item.c | 14 ++++++++++++++
 fs/xfs/xfs_log_cil.c    | 22 +++++++++++++++++-----
 3 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index dc0be2a639cc..17960b1ce5ef 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -56,14 +56,12 @@ xfs_buf_log_format_size(
 }
 
 /*
- * This returns the number of log iovecs needed to log the
- * given buf log item.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item segment.
  *
- * It calculates this as 1 iovec for the buf log format structure
- * and 1 for each stretch of non-contiguous chunks to be logged.
- * Contiguous chunks are logged in a single iovec.
- *
- * If the XFS_BLI_STALE flag has been set, then log nothing.
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
+ * in a single iovec.
  */
 STATIC void
 xfs_buf_item_size_segment(
@@ -119,11 +117,8 @@ xfs_buf_item_size_segment(
 }
 
 /*
- * This returns the number of log iovecs needed to log the given buf log item.
- *
- * It calculates this as 1 iovec for the buf log format structure and 1 for each
- * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
- * in a single iovec.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item.
  *
  * Discontiguous buffers need a format structure per region that is being
  * logged. This makes the changes in the buffer appear to log recovery as though
@@ -133,7 +128,11 @@ xfs_buf_item_size_segment(
  * what ends up on disk.
  *
  * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
- * format structures.
+ * format structures. If the item has previously been logged and has dirty
+ * regions, we do not relog them in stale buffers. This has the effect of
+ * reducing the size of the relogged item by the amount of dirty data tracked
+ * by the log item. This can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
  */
 STATIC void
 xfs_buf_item_size(
@@ -147,9 +146,9 @@ xfs_buf_item_size(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
-		 * The buffer is stale, so all we need to log
-		 * is the buf log format structure with the
-		 * cancel flag in it.
+		 * The buffer is stale, so all we need to log is the buf log
+		 * format structure with the cancel flag in it as we are never
+		 * going to replay the changes tracked in the log item.
 		 */
 		trace_xfs_buf_item_size_stale(bip);
 		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -164,9 +163,9 @@ xfs_buf_item_size(
 
 	if (bip->bli_flags & XFS_BLI_ORDERED) {
 		/*
-		 * The buffer has been logged just to order it.
-		 * It is not being included in the transaction
-		 * commit, so no vectors are used at all.
+		 * The buffer has been logged just to order it. It is not being
+		 * included in the transaction commit, so no vectors are used at
+		 * all.
 		 */
 		trace_xfs_buf_item_size_ordered(bip);
 		*nvecs = XFS_LOG_VEC_ORDERED;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 17e20a6d8b4e..6ff91e5bf3cd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
 	return container_of(lip, struct xfs_inode_log_item, ili_item);
 }
 
+/*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+ * region is determined by the current state, not the combination of the
+ * previously logged state + the current state. This is different relogging
+ * behaviour to most other log items which will retain the size of the
+ * previously logged changes when smaller regions are relogged.
+ *
+ * Hence operations that remove data from the inode fork (e.g. shortform
+ * dir/attr remove, extent form extent removal, etc), the size of the relogged
+ * inode gets -smaller- rather than stays the same size as the previously logged
+ * size and this can result in the committing transaction reducing the amount of
+ * space being consumed by the CIL.
+ */
 STATIC void
 xfs_inode_item_data_fork_size(
 	struct xfs_inode_log_item *iip,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c5cc1b7ad25e..daf1f3eb24a8 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -668,9 +668,14 @@ xlog_cil_push_work(
 	ASSERT(push_seq <= ctx->sequence);
 
 	/*
-	 * Wake up any background push waiters now this context is being pushed.
+	 * As we are about to switch to a new CIL, empty context, we no longer
+	 * need to throttle tasks on CIL space overruns. Wake any waiters that
+	 * the hard push throttle may have caught so they can start committing
+	 * to the new context. The ctx->xc_push_lock provides the serialisation
+	 * necessary for safely using the lockless waitqueue_active() check in
+	 * this context.
 	 */
-	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+	if (waitqueue_active(&cil->xc_push_wait))
 		wake_up_all(&cil->xc_push_wait);
 
 	/*
@@ -914,7 +919,7 @@ xlog_cil_push_background(
 	ASSERT(!list_empty(&cil->xc_cil));
 
 	/*
-	 * don't do a background push if we haven't used up all the
+	 * Don't do a background push if we haven't used up all the
 	 * space available yet.
 	 */
 	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
@@ -938,9 +943,16 @@ xlog_cil_push_background(
 
 	/*
 	 * If we are well over the space limit, throttle the work that is being
-	 * done until the push work on this context has begun.
+	 * done until the push work on this context has begun. Enforce the hard
+	 * throttle on all transaction commits once it has been activated, even
+	 * if the committing transactions have resulted in the space usage
+	 * dipping back down under the hard limit.
+	 *
+	 * The ctx->xc_push_lock provides the serialisation necessary for safely
+	 * using the lockless waitqueue_active() check in this context.
 	 */
-	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
+	    waitqueue_active(&cil->xc_push_wait)) {
 		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
 		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
 		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
-- 
2.28.0



* [PATCH 5/5] xfs: reduce buffer log item shadow allocations
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
                   ` (3 preceding siblings ...)
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
@ 2021-01-28  4:41 ` Dave Chinner
  2021-01-28 16:54   ` Brian Foster
  2021-02-02 12:01   ` Chandan Babu R
  2021-02-01 12:39 ` [PATCH 0/5] xfs: various log stuff Christoph Hellwig
  5 siblings, 2 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28  4:41 UTC (permalink / raw)
  To: linux-xfs

From: Dave Chinner <dchinner@redhat.com>

When we modify btrees repeatedly, we regularly increase the size of
the logged region by a single chunk at a time (per transaction
commit). This results in the CIL formatting code having to
reallocate the log vector buffer every time the buffer dirty region
grows. Hence over a typical 4kB btree buffer, we might grow the log
vector 4096/128 = 32x over a short period where we repeatedly add
or remove records to/from the buffer over a series of running
transaction. This means we are doing 32 memory allocations and frees
over this time during a performance critical path in the journal.

The amount of space tracked in the CIL for the object is calculated
during the ->iop_format() call for the buffer log item, but the
buffer memory allocated for it is calculated by the ->iop_size()
call. The size callout determines the size of the buffer, the format
call determines the space used in the buffer.

Hence we can oversize the buffer space required in the size
calculation without impacting the amount of space used and accounted
to the CIL for the changes being logged. This allows us to reduce
the number of allocations by rounding up the buffer size to allow
for future growth. This can save a substantial amount of CPU time in
this path:

-   46.52%     2.02%  [kernel]                  [k] xfs_log_commit_cil
   - 44.49% xfs_log_commit_cil
      - 30.78% _raw_spin_lock
         - 30.75% do_raw_spin_lock
              30.27% __pv_queued_spin_lock_slowpath

(oh, ouch!)
....
      - 1.05% kmem_alloc_large
         - 1.02% kmem_alloc
              0.94% __kmalloc

This overhead here is what this patch is aimed at. After:

      - 0.76% kmem_alloc_large
         - 0.75% kmem_alloc
              0.70% __kmalloc
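
For illustration, the change amounts to rounding up the byte count
returned from ->iop_size(), as in the sketch below (512 is the
granularity chosen in the diff that follows):

        /*
         * Oversize the log vector buffer so that repeated small growths
         * of the dirty region reuse the same allocation rather than
         * reallocating on every relog.
         */
        *nbytes = round_up(bytes, 512);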


Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_buf_item.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 17960b1ce5ef..0628a65d9c55 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -142,6 +142,7 @@ xfs_buf_item_size(
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	int			i;
+	int			bytes;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	if (bip->bli_flags & XFS_BLI_STALE) {
@@ -173,7 +174,7 @@ xfs_buf_item_size(
 	}
 
 	/*
-	 * the vector count is based on the number of buffer vectors we have
+	 * The vector count is based on the number of buffer vectors we have
 	 * dirty bits in. This will only be greater than one when we have a
 	 * compound buffer with more than one segment dirty. Hence for compound
 	 * buffers we need to track which segment the dirty bits correspond to,
@@ -181,10 +182,18 @@ xfs_buf_item_size(
 	 * count for the extra buf log format structure that will need to be
 	 * written.
 	 */
+	bytes = 0;
 	for (i = 0; i < bip->bli_format_count; i++) {
 		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
-					  nvecs, nbytes);
+					  nvecs, &bytes);
 	}
+
+	/*
+	 * Round up the buffer size required to minimise the number of memory
+	 * allocations that need to be done as this item grows when relogged by
+	 * repeated modifications.
+	 */
+	*nbytes = round_up(bytes, 512);
 	trace_xfs_buf_item_size(bip);
 }
 
-- 
2.28.0



* Re: [PATCH 1/5] xfs: log stripe roundoff is a property of the log
  2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
@ 2021-01-28 14:57   ` Brian Foster
  2021-01-28 20:59     ` Dave Chinner
  2021-01-28 21:25   ` Darrick J. Wong
  1 sibling, 1 reply; 29+ messages in thread
From: Brian Foster @ 2021-01-28 14:57 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:50PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> We don't need to look at the xfs_mount and superblock every time we
> need to do an iclog roundoff calculation. The property is fixed for
> the life of the log, so store the roundoff in the log at mount time
> and use that everywhere.
> 
> On a debug build:
> 
> $ size fs/xfs/xfs_log.o.*
>    text	   data	    bss	    dec	    hex	filename
>   27360	    560	      8	  27928	   6d18	fs/xfs/xfs_log.o.orig
>   27219	    560	      8	  27787	   6c8b	fs/xfs/xfs_log.o.patched
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_log_format.h |  3 --
>  fs/xfs/xfs_log.c               | 60 +++++++++++++++-------------------
>  fs/xfs/xfs_log_priv.h          |  2 ++
>  3 files changed, 28 insertions(+), 37 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 58699881c100..c5f507c24577 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -1400,6 +1400,12 @@ xlog_alloc_log(
>  	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
>  	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
>  
> +	/* roundoff padding for transaction data and one for commit record */

I don't follow this comment. What do you mean by "... and one for commit
record?"

> +	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
> +		log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
> +	else
> +		log->l_iclog_roundoff = BBSIZE;
> +
>  	xlog_grant_head_init(&log->l_reserve_head);
>  	xlog_grant_head_init(&log->l_write_head);
>  
...
> @@ -3485,18 +3475,20 @@ xfs_log_calc_unit_res(
>  	/* for commit-rec LR header - note: padding will subsume the ophdr */
>  	unit_bytes += log->l_iclog_hsize;
>  
> -	/* for roundoff padding for transaction data and one for commit record */
> -	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
> -		/* log su roundoff */
> -		unit_bytes += 2 * mp->m_sb.sb_logsunit;
> -	} else {
> -		/* BB roundoff */
> -		unit_bytes += 2 * BBSIZE;
> -        }
> +	/* roundoff padding for transaction data and one for commit record */
> +	unit_bytes += log->l_iclog_roundoff;
>  
>  	return unit_bytes;
>  }
>  
> +int
> +xfs_log_calc_unit_res(
> +	struct xfs_mount	*mp,
> +	int			unit_bytes)
> +{
> +	return xlog_calc_unit_res(mp->m_log, unit_bytes);
> +}
> +

Could this be moved to the header as an inline? Otherwise looks
reasonable.

Brian

>  /*
>   * Allocate and initialise a new log ticket.
>   */
> @@ -3513,7 +3505,7 @@ xlog_ticket_alloc(
>  
>  	tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
>  
> -	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
> +	unit_res = xlog_calc_unit_res(log, unit_bytes);
>  
>  	atomic_set(&tic->t_ref, 1);
>  	tic->t_task		= current;
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 1c6fdbf3d506..037950cf1061 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -436,6 +436,8 @@ struct xlog {
>  #endif
>  	/* log recovery lsn tracking (for buffer submission */
>  	xfs_lsn_t		l_recovery_lsn;
> +
> +	uint32_t		l_iclog_roundoff;/* padding roundoff */
>  };
>  
>  #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
> -- 
> 2.28.0
> 



* Re: [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
@ 2021-01-28 15:07   ` Brian Foster
  2021-01-28 21:22     ` Dave Chinner
  2021-01-30  9:13   ` Chandan Babu R
  2021-02-01 12:59   ` Christoph Hellwig
  2 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2021-01-28 15:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

FYI, I finally got to reading your prior log cache flushing thread
yesterday afternoon. I was planning to revisit that and probably reply
this morning after having some time to digest, but saw this and so will
reply here..

On Thu, Jan 28, 2021 at 03:41:51PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> To allow for iclog IO device cache flush behaviour to be optimised,
> we first need to separate out the commit record iclog IO from the
> rest of the checkpoint so we can wait for the checkpoint IO to
> complete before we issue the commit record.
> 
> This separate is only necessary if the commit record is being
> written into a different iclog to the start of the checkpoint. If
> the entire checkpoint and commit is in the one iclog, then they are
> both covered by the one set of cache flush primitives on the iclog
> and hence there is no need to separate them.
> 

I find the description here a bit vague.. we have to separate out the
commit record I/O, but only if it's already separate..? Glancing at the
code, I don't see any new "separation" happening, so it's not clear to
me what that really refers to. This looks more like we're introducing
serialization to provide checkpoint -> commit record ordering (when the
commit record happens to be in a separate iclog).

> Otherwise, we need to wait for all the previous iclogs to complete
> so they are ordered correctly and made stable by the REQ_PREFLUSH
> that the commit record iclog IO issues. This guarantees that if a
> reader sees the commit record in the journal, they will also see the
> entire checkpoint that commit record closes off.
> 
> This also provides the guarantee that when the commit record IO
> completes, we can safely unpin all the log items in the checkpoint
> so they can be written back because the entire checkpoint is stable
> in the journal.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log.c      | 47 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_log_cil.c  |  7 +++++++
>  fs/xfs/xfs_log_priv.h |  2 ++
>  3 files changed, 56 insertions(+)
> 
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index c5f507c24577..c5e3da23961c 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -808,6 +808,53 @@ xlog_wait_on_iclog(
>  	return 0;
>  }
>  
> +/*
> + * Wait on any iclogs that are still flushing in the range of start_lsn to
> + * the current iclog's lsn. The caller holds a reference to the iclog, but
> + * otherwise holds no log locks.
> + *
> + * We walk backwards through the iclogs to find the iclog with the highest lsn
> + * in the range that we need to wait for and then wait for it to complete.
> + * Completion ordering of iclog IOs ensures that all prior iclogs to this IO are
> + * complete by the time our candidate has completed.
> + */
> +int
> +xlog_wait_on_iclog_lsn(
> +	struct xlog_in_core	*iclog,
> +	xfs_lsn_t		start_lsn)
> +{
> +	struct xlog		*log = iclog->ic_log;
> +	struct xlog_in_core	*prev;
> +	int			error = -EIO;
> +
> +	spin_lock(&log->l_icloglock);
> +	if (XLOG_FORCED_SHUTDOWN(log))
> +		goto out_unlock;
> +
> +	error = 0;
> +	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
> +
> +		/* Done if the lsn is before our start lsn */
> +		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
> +				start_lsn) < 0)
> +			break;

Hmm.. that logic looks a bit dodgy when you consider that the iclog
header lsn is reset to zero on activation. I think it actually works as
intended because iclog completion reactivates iclogs in LSN order and
this loop walks in reverse order, but that is a very subtle connection
that might be useful to document.

> +
> +		/* Don't need to wait on completed, clean iclogs */
> +		if (prev->ic_state == XLOG_STATE_DIRTY ||
> +		    prev->ic_state == XLOG_STATE_ACTIVE) {
> +			continue;
> +		}
> +
> +		/* wait for completion on this iclog */
> +		xlog_wait(&prev->ic_force_wait, &log->l_icloglock);
> +		return 0;
> +	}
> +
> +out_unlock:
> +	spin_unlock(&log->l_icloglock);
> +	return error;
> +}
> +
>  /*
>   * Write out an unmount record using the ticket provided. We have to account for
>   * the data space used in the unmount ticket as this write is not done from a
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index b0ef071b3cb5..c5cc1b7ad25e 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -870,6 +870,13 @@ xlog_cil_push_work(
>  	wake_up_all(&cil->xc_commit_wait);
>  	spin_unlock(&cil->xc_push_lock);
>  
> +	/*
> +	 * If the checkpoint spans multiple iclogs, wait for all previous
> +	 * iclogs to complete before we submit the commit_iclog.
> +	 */
> +	if (ctx->start_lsn != commit_lsn)
> +		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
> +

This is an interesting serialization point because we don't necessarily
submit the iclog that holds the commit record. I actually think in most
cases the separate commit record won't land on disk until the next CIL
push causes a sync of the current head iclog (assuming it fills it up).
Granted, this is the last point we have context of the commit record
being written, so it makes sense from an opportunistic standpoint to
serialize here (just as we already block to ensure commit record
ordering across checkpoints). That said, with the aggressive batching
ability of the CIL I think blocking on prior pushes is potentially less
heavy handed than unconditionally blocking on all prior iclog I/O. I
wonder if this might be something to address if we're going to compound
this path with more serialization.

From a performance perspective, it seems like this makes the CIL push
_nearly_ synchronous by default. I.e., if we have a several hundred MB
CIL context, we're going to wait for all but the final iclog to complete
before we return. Of course we'll be waiting for much of that anyways to
reuse limited iclog space, but I have to think more about what that
means in general (maybe not much) and get through the subsequent
patches. In the meantime, have you put thought into any potential
negative performance impact from this that might or might not be offset
by subsequent flush optimizations?

Brian

>  	/* release the hounds! */
>  	xfs_log_release_iclog(commit_iclog);
>  	return;
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 037950cf1061..a7ac85aaff4e 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -584,6 +584,8 @@ xlog_wait(
>  	remove_wait_queue(wq, &wait);
>  }
>  
> +int xlog_wait_on_iclog_lsn(struct xlog_in_core *iclog, xfs_lsn_t start_lsn);
> +
>  /*
>   * The LSN is valid so long as it is behind the current LSN. If it isn't, this
>   * means that the next log record that includes this metadata could have a
> -- 
> 2.28.0
> 



* Re: [PATCH 3/5] xfs: journal IO cache flush reductions
  2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
@ 2021-01-28 15:12   ` Brian Foster
  2021-01-28 21:46     ` Dave Chinner
  2021-01-28 21:26   ` Dave Chinner
  2021-01-30 12:56   ` Chandan Babu R
  2 siblings, 1 reply; 29+ messages in thread
From: Brian Foster @ 2021-01-28 15:12 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:52PM +1100, Dave Chinner wrote:
> From: Steve Lord <lord@sgi.com>
> 
> Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to
> guarantee the ordering requirements the journal has w.r.t. metadata
> writeback. THe two ordering constraints are:
> 
> 1. we cannot overwrite metadata in the journal until we guarantee
> that the dirty metadata has been written back in place and is
> stable.
> 
> 2. we cannot write back dirty metadata until it has been written to
> the journal and guaranteed to be stable (and hence recoverable) in
> the journal.
> 
> THe ordering guarantees of #1 are provided by REQ_PREFLUSH. This
> causes the journal IO to issue a cache flush and wait for it to
> complete before issuing the write IO to the journal. Hence all
> completed metadata IO is guaranteed to be stable before the journal
> overwrites the old metadata.
> 
> THe ordering guarantees of #2 are provided by the REQ_FUA, which
> ensures the journal writes do not complete until they are on stable
> storage. Hence by the time the last journal IO in a checkpoint
> completes, we know that the entire checkpoint is on stable storage
> and we can unpin the dirty metadata and allow it to be written back.
> 
> This is the mechanism by which ordering was first implemented in XFS
> way back in 2002 by this commit:
> 
> commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96
> Author: Steve Lord <lord@sgi.com>
> Date:   Fri May 24 14:30:21 2002 +0000
> 
>     Add support for drive write cache flushing - should the kernel
>     have the infrastructure
> 
> A lot has changed since then, most notably we now use delayed
> logging to checkpoint the filesystem to the journal rather than
> write each individual transaction to the journal. Cache flushes on
> journal IO are necessary when individual transactions are wholly
> contained within a single iclog. However, CIL checkpoints are single
> transactions that typically span hundreds to thousands of individual
> journal writes, and so the requirements for device cache flushing
> have changed.
> 
> That is, the ordering rules I state above apply to ordering of
> atomic transactions recorded in the journal, not to the journal IO
> itself. Hence we need to ensure metadata is stable before we start
> writing a new transaction to the journal (guarantee #1), and we need
> to ensure the entire transaction is stable in the journal before we
> start metadata writeback (guarantee #2).
> 
> Hence we only need a REQ_PREFLUSH on the journal IO that starts a
> new journal transaction to provide #1, and it is not on any other
> journal IO done within the context of that journal transaction.
> 

Conceptually, if the preflush is for metadata completion -> log
submission ordering and the fua is for log completion to metadata
submission ordering, then it seems quite logical that the current flush
pattern could be widened to provide similar guarantees for the
checkpoint model implemented by delayed logging.

> To ensure that the entire journal transaction is on stable storage
> before we run the completion code that unpins all the dirty metadata
> recorded in the journal transaction, the last write of the
> transaction must also ensure that the entire journal transaction is
> stable. We already know what IO that will be, thanks to the commit
> record we explicitly write to complete the transaction. We can order
> all the previous journal IO for this transaction by waiting for all
> the previous iclogs containing the transaction data to complete
> their IO, then issuing the commit record IO using REQ_PREFLUSH
> | REQ_FUA. The preflush ensures all the previous journal IO is
> stable before the commit record hits the log, and the REQ_FUA
> ensures that the commit record is stable before completion is
> signalled to the filesystem.
> 
> Hence using REQ_PREFLUSH on the first IO of a journal transaction,
> and then ordering the journal IO before issuing the commit record
> with REQ_PREFLUSH | REQ_FUA, we get all the same ordering guarantees
> that we currently achieve by issuing all journal IO with cache
> flushes.
> 
> As an optimisation, when the commit record lands in the same iclog
> as the journal transaction starts, we don't need to wait for
> anything and can simply issue the journal IO with REQ_PREFLUSH |
> REQ_FUA as we do right now. This means that for fsync() heavy
> workloads, the cache flush behaviour is completely unchanged and
> there is no degradation in performance as a result of optimise the
> multi-IO transaction case.
> 
> To further simplify the implementation, we also issue the initial IO
> in a journal transaction with REQ_FUA. THis ensures the journal is
> dirtied by the first IO in a long running transaction as quickly as
> possible. This helps ensure that log recovery will at least have a
> transaction header for the incomplete transaction in the log similar
> to the stable journal write behaviour we have now.
> 
...
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log.c      | 34 ++++++++++++++++++++++------------
>  fs/xfs/xfs_log_priv.h |  3 +++
>  2 files changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index c5e3da23961c..8de93893e0e6 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
...
> @@ -2464,9 +2465,18 @@ xlog_write(
>  		ASSERT(log_offset <= iclog->ic_size - 1);
>  		ptr = iclog->ic_datap + log_offset;
>  
> -		/* start_lsn is the first lsn written to. That's all we need. */
> -		if (!*start_lsn)
> +		/*
> +		 * Start_lsn is the first lsn written to. That's all the caller
> +		 * needs to have returned. Setting it indicates the first iclog
> +		 * of a new checkpoint or the commit record for a checkpoint, so
> +		 * also mark the iclog as requiring a pre-flush to ensure all
> +		 * metadata writeback or journal IO in the checkpoint is
> +		 * correctly ordered against this new log write.
> +		 */
> +		if (!*start_lsn) {
>  			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
> +			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
> +		}

My understanding is that one of the reasons for the preflush per iclog
approach is that we don't have any submission -> completion ordering
guarantees across iclogs. This is why we explicitly order commit record
completions and whatnot, to ensure the important bits are ordered
correctly. The fact we implement that ordering ourselves suggests that
PREFLUSH|FUA itself do not provide such ordering, though that's not
something I've investigated.

In any event, if the purpose of the PREFLUSH is to ensure that metadata
in the targeted LSN range is committed to stable storage, and we have no
submission ordering guarantees across non-commit record iclogs, what
prevents a subsequent iclog from the same checkpoint from completing
before the first iclog with a PREFLUSH? If nothing, then what prevents
that subsequent iclog from overwriting a range of LSNs in the log that
have not been committed to stable storage (potentially leaving a
corruption vector in the event of a crash)? Wouldn't this now require
some form of start record completion ordering to ensure the integrity of
the flush?

Brian

>  
>  		/*
>  		 * This loop writes out as many regions as can fit in the amount
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index a7ac85aaff4e..9f1e627ccb74 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -133,6 +133,8 @@ enum xlog_iclog_state {
>  
>  #define XLOG_COVER_OPS		5
>  
> +#define XLOG_ICL_NEED_FLUSH     (1 << 0)        /* iclog needs REQ_PREFLUSH */
> +
>  /* Ticket reservation region accounting */ 
>  #define XLOG_TIC_LEN_MAX	15
>  
> @@ -201,6 +203,7 @@ typedef struct xlog_in_core {
>  	u32			ic_size;
>  	u32			ic_offset;
>  	enum xlog_iclog_state	ic_state;
> +	unsigned int		ic_flags;
>  	char			*ic_datap;	/* pointer to iclog data */
>  
>  	/* Callback structures need their own cacheline */
> -- 
> 2.28.0
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
@ 2021-01-28 16:53   ` Brian Foster
  2021-02-02  5:52   ` Chandan Babu R
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2021-01-28 16:53 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:53PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> A hang with tasks stuck on the CIL hard throttle was reported and
> largely diagnosed by Donald Buczek, who discovered that it was a
> result of the CIL context space usage decrementing in committed
> transactions once the hard throttle limit had been hit and processes
> were already blocked.  This resulted in the CIL push not waking up
> those waiters because the CIL context was no longer over the hard
> throttle limit.
> 
> The surprising aspect of this was the CIL space usage going
> backwards regularly enough to trigger this situation. Assumptions
> had been made in design that the relogging process would only
> increase the size of the objects in the CIL, and so that space would
> only increase.
> 
> This change and commit message fixes the issue and documents the
> result of an audit of the triggers that can cause the CIL space to
> go backwards, how large the backwards steps tend to be, the
> frequency in which they occur, and what the impact on the CIL
> accounting code is.
> 
> Even though the CIL ctx->space_used can go backwards, it will only
> do so if the log item is already logged to the CIL and contains a
> space reservation for its entire logged state. This is tracked by
> the shadow buffer state on the log item. If the item is not
> previously logged in the CIL it has no shadow buffer nor log vector,
> and hence the entire size of the logged item copied to the log
> vector is accounted to the CIL space usage. i.e.  it will always go
> up in this case.
> 
> If the item has a log vector (i.e. already in the CIL) and the size
> decreases, then the existing log vector will be overwritten and the
> space usage will go down. This is the only condition where the space
> usage reduces, and it can only occur when an item is already tracked
> in the CIL. Hence we are safe from CIL space usage underruns as a
> result of log items decreasing in size when they are relogged.
> 
> Typically this reduction in CIL usage occurs from metadata blocks
> being freed, such as when a btree block merge
> occurs or a directory entry/xattr entry is removed and the da-tree
> is reduced in size. This generally results in a reduction in size of
> around a single block in the CIL, but also tends to increase the
> number of log vectors because the parent and sibling nodes in the
> tree need to be updated when a btree block is removed. If a
> multi-level merge occurs, then we see reduction in size of 2+
> blocks, but again the log vector count goes up.
> 
> The other vector is inode fork size changes, which only log the
> current size of the fork and ignore the previously logged size when
> the fork is relogged. Hence if we are removing items from the inode
> fork (dir/xattr removal in shortform, extent record removal in
> extent form, etc) the relogged size of the inode fork can decrease.
> 
> No other log items can decrease in size either because they are a
> fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging
> an intent actually creates a new intent log item and doesn't relog
> the old item at all.) Hence the only two vectors for CIL context
> size reduction are relogging inode forks and marking buffers active
> in the CIL as stale.
> 
> Long story short: the majority of the code does the right thing and
> handles the reduction in log item size correctly, and only the CIL
> hard throttle implementation is problematic and needs fixing. This
> patch makes that fix, as well as adds comments in the log item code
> that result in items shrinking in size when they are relogged as a
> clear reminder that this can and does happen frequently.
> 
> The throttle fix is based upon the change Donald proposed, though it
> goes further to ensure that once the throttle is activated, it
> captures all tasks until the CIL push issues a wakeup, regardless of
> whether the CIL space used has gone back under the throttle
> threshold.
> 
> This ensures that we prevent tasks reducing the CIL slightly under
> the throttle threshold and then making more changes that push it
> well over the throttle limit. This is achieved by checking if the
> throttle wait queue is already active as a condition of throttling.
> Hence once we start throttling, we continue to apply the throttle
> until the CIL context push wakes everything on the wait queue.
> 
> We can use waitqueue_active() for the waitqueue manipulations and
> checks as they are all done under the ctx->xc_push_lock. Hence the
> waitqueue has external serialisation and we can safely peek inside
> the wait queue without holding the internal waitqueue locks.
> 
> Many thanks to Donald for his diagnostic and analysis work to
> isolate the cause of this hang.
> 
> Reported-by: Donald Buczek <buczek@molgen.mpg.de>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_buf_item.c   | 37 ++++++++++++++++++-------------------
>  fs/xfs/xfs_inode_item.c | 14 ++++++++++++++
>  fs/xfs/xfs_log_cil.c    | 22 +++++++++++++++++-----
>  3 files changed, 49 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index dc0be2a639cc..17960b1ce5ef 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -56,14 +56,12 @@ xfs_buf_log_format_size(
>  }
>  
>  /*
> - * This returns the number of log iovecs needed to log the
> - * given buf log item.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item segment.
>   *
> - * It calculates this as 1 iovec for the buf log format structure
> - * and 1 for each stretch of non-contiguous chunks to be logged.
> - * Contiguous chunks are logged in a single iovec.
> - *
> - * If the XFS_BLI_STALE flag has been set, then log nothing.
> + * It calculates this as 1 iovec for the buf log format structure and 1 for each
> + * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> + * in a single iovec.
>   */
>  STATIC void
>  xfs_buf_item_size_segment(
> @@ -119,11 +117,8 @@ xfs_buf_item_size_segment(
>  }
>  
>  /*
> - * This returns the number of log iovecs needed to log the given buf log item.
> - *
> - * It calculates this as 1 iovec for the buf log format structure and 1 for each
> - * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> - * in a single iovec.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item.
>   *
>   * Discontiguous buffers need a format structure per region that is being
>   * logged. This makes the changes in the buffer appear to log recovery as though
> @@ -133,7 +128,11 @@ xfs_buf_item_size_segment(
>   * what ends up on disk.
>   *
>   * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
> - * format structures.
> + * format structures. If the item has previously been logged and has dirty
> + * regions, we do not relog them in stale buffers. This has the effect of
> + * reducing the size of the relogged item by the amount of dirty data tracked
> + * by the log item. This can result in the committing transaction reducing the
> + * amount of space being consumed by the CIL.
>   */
>  STATIC void
>  xfs_buf_item_size(
> @@ -147,9 +146,9 @@ xfs_buf_item_size(
>  	ASSERT(atomic_read(&bip->bli_refcount) > 0);
>  	if (bip->bli_flags & XFS_BLI_STALE) {
>  		/*
> -		 * The buffer is stale, so all we need to log
> -		 * is the buf log format structure with the
> -		 * cancel flag in it.
> +		 * The buffer is stale, so all we need to log is the buf log
> +		 * format structure with the cancel flag in it as we are never
> +		 * going to replay the changes tracked in the log item.
>  		 */
>  		trace_xfs_buf_item_size_stale(bip);
>  		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
> @@ -164,9 +163,9 @@ xfs_buf_item_size(
>  
>  	if (bip->bli_flags & XFS_BLI_ORDERED) {
>  		/*
> -		 * The buffer has been logged just to order it.
> -		 * It is not being included in the transaction
> -		 * commit, so no vectors are used at all.
> +		 * The buffer has been logged just to order it. It is not being
> +		 * included in the transaction commit, so no vectors are used at
> +		 * all.
>  		 */
>  		trace_xfs_buf_item_size_ordered(bip);
>  		*nvecs = XFS_LOG_VEC_ORDERED;
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 17e20a6d8b4e..6ff91e5bf3cd 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
>  	return container_of(lip, struct xfs_inode_log_item, ili_item);
>  }
>  
> +/*
> + * The logged size of an inode fork is always the current size of the inode
> + * fork. This means that when an inode fork is relogged, the size of the logged
> + * region is determined by the current state, not the combination of the
> + * previously logged state + the current state. This is different relogging
> + * behaviour to most other log items which will retain the size of the
> + * previously logged changes when smaller regions are relogged.
> + *
> + * Hence operations that remove data from the inode fork (e.g. shortform

"Hence for ..." ?

> + * dir/attr remove, extent form extent removal, etc), the size of the relogged
> + * inode gets -smaller- rather than stays the same size as the previously logged
> + * size and this can result in the committing transaction reducing the amount of
> + * space being consumed by the CIL.
> + */
>  STATIC void
>  xfs_inode_item_data_fork_size(
>  	struct xfs_inode_log_item *iip,
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index c5cc1b7ad25e..daf1f3eb24a8 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -668,9 +668,14 @@ xlog_cil_push_work(
>  	ASSERT(push_seq <= ctx->sequence);
>  
>  	/*
> -	 * Wake up any background push waiters now this context is being pushed.
> +	 * As we are about to switch to a new CILi, empty context, we no longer

s/CILi, empty context/CIL empty context/ ?

Nits aside, the rest LGTM:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +	 * need to throttle tasks on CIL space overruns. Wake any waiters that
> +	 * the hard push throttle may have caught so they can start committing
> +	 * to the new context. The ctx->xc_push_lock provides the serialisation
> +	 * necessary for safely using the lockless waitqueue_active() check in
> +	 * this context.
>  	 */
> -	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
> +	if (waitqueue_active(&cil->xc_push_wait))
>  		wake_up_all(&cil->xc_push_wait);
>  
>  	/*
> @@ -914,7 +919,7 @@ xlog_cil_push_background(
>  	ASSERT(!list_empty(&cil->xc_cil));
>  
>  	/*
> -	 * don't do a background push if we haven't used up all the
> +	 * Don't do a background push if we haven't used up all the
>  	 * space available yet.
>  	 */
>  	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
> @@ -938,9 +943,16 @@ xlog_cil_push_background(
>  
>  	/*
>  	 * If we are well over the space limit, throttle the work that is being
> -	 * done until the push work on this context has begun.
> +	 * done until the push work on this context has begun. Enforce the hard
> +	 * throttle on all transaction commits once it has been activated, even
> +	 * if the committing transactions have resulted in the space usage
> +	 * dipping back down under the hard limit.
> +	 *
> +	 * The ctx->xc_push_lock provides the serialisation necessary for safely
> +	 * using the lockless waitqueue_active() check in this context.
>  	 */
> -	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> +	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
> +	    waitqueue_active(&cil->xc_push_wait)) {
>  		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
>  		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
>  		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
> -- 
> 2.28.0
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/5] xfs: reduce buffer log item shadow allocations
  2021-01-28  4:41 ` [PATCH 5/5] xfs: reduce buffer log item shadow allocations Dave Chinner
@ 2021-01-28 16:54   ` Brian Foster
  2021-01-28 21:58     ` Dave Chinner
  2021-02-02 12:01   ` Chandan Babu R
  1 sibling, 1 reply; 29+ messages in thread
From: Brian Foster @ 2021-01-28 16:54 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:54PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> When we modify btrees repeatedly, we regularly increase the size of
> the logged region by a single chunk at a time (per transaction
> commit). This results in the CIL formatting code having to
> reallocate the log vector buffer every time the buffer dirty region
> grows. Hence over a typical 4kB btree buffer, we might grow the log
> vector 4096/128 = 32x over a short period where we repeatedly add
> or remove records to/from the buffer over a series of running
> transaction. This means we are doing 32 memory allocations and frees
> over this time during a performance critical path in the journal.
> 
> The amount of space tracked in the CIL for the object is calculated
> during the ->iop_format() call for the buffer log item, but the
> buffer memory allocated for it is calculated by the ->iop_size()
> call. The size callout determines the size of the buffer, the format
> call determines the space used in the buffer.
> 
> Hence we can oversize the buffer space required in the size
> calculation without impacting the amount of space used and accounted
> to the CIL for the changes being logged. This allows us to reduce
> the number of allocations by rounding up the buffer size to allow
> for future growth. This can save a substantial amount of CPU time in
> this path:
> 
> -   46.52%     2.02%  [kernel]                  [k] xfs_log_commit_cil
>    - 44.49% xfs_log_commit_cil
>       - 30.78% _raw_spin_lock
>          - 30.75% do_raw_spin_lock
>               30.27% __pv_queued_spin_lock_slowpath
> 
> (oh, ouch!)
> ....
>       - 1.05% kmem_alloc_large
>          - 1.02% kmem_alloc
>               0.94% __kmalloc
> 
> This overhead here is what this patch is aimed at. After:
> 
>       - 0.76% kmem_alloc_large
>          - 0.75% kmem_alloc
>               0.70% __kmalloc
> 
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_buf_item.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 17960b1ce5ef..0628a65d9c55 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
...
> @@ -181,10 +182,18 @@ xfs_buf_item_size(
>  	 * count for the extra buf log format structure that will need to be
>  	 * written.
>  	 */
> +	bytes = 0;
>  	for (i = 0; i < bip->bli_format_count; i++) {
>  		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
> -					  nvecs, nbytes);
> +					  nvecs, &bytes);
>  	}
> +
> +	/*
> +	 * Round up the buffer size required to minimise the number of memory
> +	 * allocations that need to be done as this item grows when relogged by
> +	 * repeated modifications.
> +	 */
> +	*nbytes = round_up(bytes, 512);

If nbytes starts out as zero anyways, what's the need for the new
variable? Otherwise looks reasonable.

Brian

>  	trace_xfs_buf_item_size(bip);
>  }
>  
> -- 
> 2.28.0
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] xfs: log stripe roundoff is a property of the log
  2021-01-28 14:57   ` Brian Foster
@ 2021-01-28 20:59     ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 20:59 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 09:57:03AM -0500, Brian Foster wrote:
> On Thu, Jan 28, 2021 at 03:41:50PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > We don't need to look at the xfs_mount and superblock every time we
> > need to do an iclog roundoff calculation. The property is fixed for
> > the life of the log, so store the roundoff in the log at mount time
> > and use that everywhere.
> > 
> > On a debug build:
> > 
> > $ size fs/xfs/xfs_log.o.*
> >    text	   data	    bss	    dec	    hex	filename
> >   27360	    560	      8	  27928	   6d18	fs/xfs/xfs_log.o.orig
> >   27219	    560	      8	  27787	   6c8b	fs/xfs/xfs_log.o.patched
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/libxfs/xfs_log_format.h |  3 --
> >  fs/xfs/xfs_log.c               | 60 +++++++++++++++-------------------
> >  fs/xfs/xfs_log_priv.h          |  2 ++
> >  3 files changed, 28 insertions(+), 37 deletions(-)
> > 
> ...
> > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > index 58699881c100..c5f507c24577 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> > @@ -1400,6 +1400,12 @@ xlog_alloc_log(
> >  	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
> >  	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
> >  
> > +	/* roundoff padding for transaction data and one for commit record */
> 
> I don't follow this comment. What do you mean by "... and one for commit
> record?"

I just copied the comment from the code I lifted it from. I will
remove it.

> > +int
> > +xfs_log_calc_unit_res(
> > +	struct xfs_mount	*mp,
> > +	int			unit_bytes)
> > +{
> > +	return xlog_calc_unit_res(mp->m_log, unit_bytes);
> > +}
> > +
> 
> Could this be moved to the header as an inline? Otherwise looks
> reasonable.

Not without making everything that includes the header file this is
added to first include all the log header files.

We really need to clean up the mess that are external log interfaces
xfs_log_...(mp...) vs internal  log interfaces xlog_...(log...) and
the random distribution of such functions through xfs_log.c,
xfs_log_recover.c, xfs_mount.c, xfs_log.h, xfs_log_priv.h,
xfs_trans.h, and xfs_log_format.h....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-28 15:07   ` Brian Foster
@ 2021-01-28 21:22     ` Dave Chinner
       [not found]       ` <20210129145851.GB2660974@bfoster>
  0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 21:22 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 10:07:41AM -0500, Brian Foster wrote:
> FYI, I finally got to reading your prior log cache flushing thread
> yesterday afternoon. I was planning to revisit that and probably reply
> this morning after having some time to digest, but saw this and so will
> reply here..
> 
> On Thu, Jan 28, 2021 at 03:41:51PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > To allow for iclog IO device cache flush behaviour to be optimised,
> > we first need to separate out the commit record iclog IO from the
> > rest of the checkpoint so we can wait for the checkpoint IO to
> > complete before we issue the commit record.
> > 
> > This separation is only necessary if the commit record is being
> > written into a different iclog to the start of the checkpoint. If
> > the entire checkpoint and commit is in the one iclog, then they are
> > both covered by the one set of cache flush primitives on the iclog
> > and hence there is no need to separate them.
> > 
> 
> I find the description here a bit vague.. we have to separate out the
> commit record I/O, but only if it's already separate..?

Yes, because if the commit record is in a different iclog to the
start of the checkpoint, we have to ensure that the start of the
checkpoint has been written and completed before we issue a
PREFLUSH, otherwise the cache flush is not guaranteed to capture
that IO. i.e. the block layer cache flushing mechanisms do not order
against IOs in flight, only completed IOs. Hence if the commit
record is in a different iclog, we have to wait for the other iclogs
to complete before flushing the cache. If the checkpoint is wholly
contained within a single iclog, then we don't have to wait for
anything, the single journal IO has all the cache flushes we need.


> Glancing at the
> code, I don't see any new "separation" happening, so it's not clear to
> me what that really refers to. This looks more like we're introducing
> serialization to provide checkpoint -> commit record ordering (when the
> commit record happens to be in a separate iclog).

Yes, that's exactly the separation that is being done by the call to
xlog_wait_on_iclog_lsn() from the CIL push code once we hold a
reference to the commit iclog...

> 
> > Otherwise, we need to wait for all the previous iclogs to complete
> > so they are ordered correctly and made stable by the REQ_PREFLUSH
> > that the commit record iclog IO issues. This guarantees that if a
> > reader sees the commit record in the journal, they will also see the
> > entire checkpoint that commit record closes off.
> > 
> > This also provides the guarantee that when the commit record IO
> > completes, we can safely unpin all the log items in the checkpoint
> > so they can be written back because the entire checkpoint is stable
> > in the journal.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_log.c      | 47 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_log_cil.c  |  7 +++++++
> >  fs/xfs/xfs_log_priv.h |  2 ++
> >  3 files changed, 56 insertions(+)
> > 
> > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > index c5f507c24577..c5e3da23961c 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> > @@ -808,6 +808,53 @@ xlog_wait_on_iclog(
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Wait on any iclogs that are still flushing in the range of start_lsn to
> > + * the current iclog's lsn. The caller holds a reference to the iclog, but
> > + * otherwise holds no log locks.
> > + *
> > + * We walk backwards through the iclogs to find the iclog with the highest lsn
> > + * in the range that we need to wait for and then wait for it to complete.
> > + * Completion ordering of iclog IOs ensures that all prior iclogs to this IO are
> > + * complete by the time our candidate has completed.
> > + */
> > +int
> > +xlog_wait_on_iclog_lsn(
> > +	struct xlog_in_core	*iclog,
> > +	xfs_lsn_t		start_lsn)
> > +{
> > +	struct xlog		*log = iclog->ic_log;
> > +	struct xlog_in_core	*prev;
> > +	int			error = -EIO;
> > +
> > +	spin_lock(&log->l_icloglock);
> > +	if (XLOG_FORCED_SHUTDOWN(log))
> > +		goto out_unlock;
> > +
> > +	error = 0;
> > +	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
> > +
> > +		/* Done if the lsn is before our start lsn */
> > +		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
> > +				start_lsn) < 0)
> > +			break;
> 
> Hmm.. that logic looks a bit dodgy when you consider that the iclog
> header lsn is reset to zero on activation. I think it actually works as
> intended because iclog completion reactivates iclogs in LSN order and
> this loop walks in reverse order, but that is a very subtle connection
> that might be useful to document.

It is documented in the comment above the function "We walk
backwards through the iclogs to find the iclog....". A newly
activated iclog will have an LSN of zero, and that means there are
no prior iclogs in the list in the range we need to flush because of
the completion ordering guarantees we have for iclog IO (i.e. they
are always completed in ring order, not IO completion order).

> > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > index b0ef071b3cb5..c5cc1b7ad25e 100644
> > --- a/fs/xfs/xfs_log_cil.c
> > +++ b/fs/xfs/xfs_log_cil.c
> > @@ -870,6 +870,13 @@ xlog_cil_push_work(
> >  	wake_up_all(&cil->xc_commit_wait);
> >  	spin_unlock(&cil->xc_push_lock);
> >  
> > +	/*
> > +	 * If the checkpoint spans multiple iclogs, wait for all previous
> > +	 * iclogs to complete before we submit the commit_iclog.
> > +	 */
> > +	if (ctx->start_lsn != commit_lsn)
> > +		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
> > +
> 
> This is an interesting serialization point because we don't necessarily
> submit the iclog that holds the commit record. I actually think in most
> cases the separate commit record won't land on disk until the next CIL
> push causes a sync of the current head iclog (assuming it fills it up).

Which is exactly what happens now. It's not a big deal, because if
we are running a sync transaction, the log force will push it to
disk and wait for it. CIL pushes are not synchronous and never have
been - the log force is what creates the appearance of synchronous
behaviour to the caller...

> Granted, this is the last point we have context of the commit record
> being written, so it makes sense from an opportunistic standpoint to
> serialize here (just as we already block to ensure commit record
> ordering across checkpoints). That said, with the aggressive batching
> ability of the CIL I think blocking on prior pushes is potentially less
> heavy handed than unconditionally blocking on all prior iclog I/O. I
> wonder if this might be something to address if we're going to compound
> this path with more serialization.

This adds ordering to a CIL push, not serialisation across pushes.
The CIL pipeline at this point is running fully asynchronously to
user operations and other pushes. The only serialisation point with user
operations is a log force as we've already ordered the commit record
in the iclogs against other CIL flushes that haven't completed.

i.e. there's nothing here to serialise against other CIL operations
in progress - pushes or commits - it's just internal push IO ordering
that we are waiting on here.

> From a performance perspective, it seems like this makes the CIL push
> _nearly_ synchronous by default.

Nope, not at all. There is nothing waiting on the CIL push to
complete, except maybe a log force waiting for the iclogs to be
synced to disk and completed.

> I.e., if we have a several hundred MB
> CIL context,

Not possible. A 2GB log has a hard throttle limit now of 64MB.

> we're going to wait for all but the final iclog to complete
> before we return. Of course we'll be waiting for much of that anyways to
> reuse limited iclog space, but I have to think more about what that
> means in general (maybe not much) and get through the subsequent
> patches. In the meantime, have you put thought into any potential
> negative performance impact from this that might or might not be offset
> by subsequent flush optimizations?

This isn't a stand-alone change - it's only separated out from the
cache flush changes to make them easier to see. I do not expect
there to be any performance degradation due to this change because
of the fact that the CIL will only block things waiting on a
synchronous log force at this point. In the case where there are
lots of sync log forces, checkpoints will be small and single
iclogs, so no change of behaviour. For checkpoints that are large,
the reduction in IO latency and the ability for the block layer to
now merge adjacent journal IO because they don't require FUA or
cache flushes more than makes up for any additional latency the
ordering might introduce.

Cheers,

Dave.

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] xfs: log stripe roundoff is a property of the log
  2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
  2021-01-28 14:57   ` Brian Foster
@ 2021-01-28 21:25   ` Darrick J. Wong
  2021-01-28 22:00     ` Dave Chinner
  1 sibling, 1 reply; 29+ messages in thread
From: Darrick J. Wong @ 2021-01-28 21:25 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:50PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> We don't need to look at the xfs_mount and superblock every time we
> need to do an iclog roundoff calculation. The property is fixed for
> the life of the log, so store the roundoff in the log at mount time
> and use that everywhere.
> 
> On a debug build:
> 
> $ size fs/xfs/xfs_log.o.*
>    text	   data	    bss	    dec	    hex	filename
>   27360	    560	      8	  27928	   6d18	fs/xfs/xfs_log.o.orig
>   27219	    560	      8	  27787	   6c8b	fs/xfs/xfs_log.o.patched
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_log_format.h |  3 --
>  fs/xfs/xfs_log.c               | 60 +++++++++++++++-------------------
>  fs/xfs/xfs_log_priv.h          |  2 ++
>  3 files changed, 28 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index 8bd00da6d2a4..16587219549c 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t;
>  #define XLOG_MIN_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
>  #define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
>  #define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
> -#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
> -                                 (log)->l_mp->m_sb.sb_logsunit)
> -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
>  
>  #define XLOG_HEADER_SIZE	512
>  
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 58699881c100..c5f507c24577 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -1400,6 +1400,12 @@ xlog_alloc_log(
>  	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
>  	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
>  
> +	/* roundoff padding for transaction data and one for commit record */
> +	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
> +		log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
> +	else
> +		log->l_iclog_roundoff = BBSIZE;
> +
>  	xlog_grant_head_init(&log->l_reserve_head);
>  	xlog_grant_head_init(&log->l_write_head);
>  
> @@ -1852,29 +1858,15 @@ xlog_calc_iclog_size(
>  	uint32_t		*roundoff)
>  {
>  	uint32_t		count_init, count;
> -	bool			use_lsunit;
> -
> -	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
> -			log->l_mp->m_sb.sb_logsunit > 1;
>  
>  	/* Add for LR header */
>  	count_init = log->l_iclog_hsize + iclog->ic_offset;
> +	count = roundup(count_init, log->l_iclog_roundoff);
>  
> -	/* Round out the log write size */
> -	if (use_lsunit) {
> -		/* we have a v2 stripe unit to use */
> -		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
> -	} else {
> -		count = BBTOB(BTOBB(count_init));
> -	}
> -
> -	ASSERT(count >= count_init);
>  	*roundoff = count - count_init;
>  
> -	if (use_lsunit)
> -		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
> -	else
> -		ASSERT(*roundoff < BBTOB(1));
> +	ASSERT(count >= count_init);
> +	ASSERT(*roundoff < log->l_iclog_roundoff);
>  	return count;
>  }
>  
> @@ -3149,10 +3141,9 @@ xlog_state_switch_iclogs(
>  	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
>  
>  	/* Round up to next log-sunit */
> -	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
> -	    log->l_mp->m_sb.sb_logsunit > 1) {
> -		uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
> -		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
> +	if (log->l_iclog_roundoff > BBSIZE) {
> +		log->l_curr_block = roundup(log->l_curr_block,
> +						BTOBB(log->l_iclog_roundoff));
>  	}
>  
>  	if (log->l_curr_block >= log->l_logBBsize) {
> @@ -3404,12 +3395,11 @@ xfs_log_ticket_get(
>   * Figure out the total log space unit (in bytes) that would be
>   * required for a log ticket.
>   */
> -int
> -xfs_log_calc_unit_res(
> -	struct xfs_mount	*mp,
> +static int
> +xlog_calc_unit_res(
> +	struct xlog		*log,
>  	int			unit_bytes)
>  {
> -	struct xlog		*log = mp->m_log;
>  	int			iclog_space;
>  	uint			num_headers;
>  
> @@ -3485,18 +3475,20 @@ xfs_log_calc_unit_res(
>  	/* for commit-rec LR header - note: padding will subsume the ophdr */
>  	unit_bytes += log->l_iclog_hsize;
>  
> -	/* for roundoff padding for transaction data and one for commit record */
> -	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
> -		/* log su roundoff */
> -		unit_bytes += 2 * mp->m_sb.sb_logsunit;
> -	} else {
> -		/* BB roundoff */
> -		unit_bytes += 2 * BBSIZE;
> -        }
> +	/* roundoff padding for transaction data and one for commit record */
> +	unit_bytes += log->l_iclog_roundoff;

I don't understand why the "2 *" disappears here.  It's not a part of
the roundoff computation when we allocate the log, so AFAICT it's not
just buried elsewhere?

Was the old code saying that it added the roundoff factor twice because
we needed to do so once for the transaction data and the second time for
the commit record?

--D

>  
>  	return unit_bytes;
>  }
>  
> +int
> +xfs_log_calc_unit_res(
> +	struct xfs_mount	*mp,
> +	int			unit_bytes)
> +{
> +	return xlog_calc_unit_res(mp->m_log, unit_bytes);
> +}
> +
>  /*
>   * Allocate and initialise a new log ticket.
>   */
> @@ -3513,7 +3505,7 @@ xlog_ticket_alloc(
>  
>  	tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
>  
> -	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
> +	unit_res = xlog_calc_unit_res(log, unit_bytes);
>  
>  	atomic_set(&tic->t_ref, 1);
>  	tic->t_task		= current;
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 1c6fdbf3d506..037950cf1061 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -436,6 +436,8 @@ struct xlog {
>  #endif
>  	/* log recovery lsn tracking (for buffer submission */
>  	xfs_lsn_t		l_recovery_lsn;
> +
> +	uint32_t		l_iclog_roundoff;/* padding roundoff */
>  };
>  
>  #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
> -- 
> 2.28.0
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] xfs: journal IO cache flush reductions
  2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
  2021-01-28 15:12   ` Brian Foster
@ 2021-01-28 21:26   ` Dave Chinner
  2021-01-30 12:56   ` Chandan Babu R
  2 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 21:26 UTC (permalink / raw)
  To: linux-xfs

On Thu, Jan 28, 2021 at 03:41:52PM +1100, Dave Chinner wrote:
> From: Steve Lord <lord@sgi.com>
> 

Oh, I just noticed that git mangled this commit message because I
quoted another commit in the commit message. i.e. it turned this:

> commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96
> Author: Steve Lord <lord@sgi.com>
> Date:   Fri May 24 14:30:21 2002 +0000

into a "from" directive to say this is who authored the patch.

Ok, my guilt header has "from dchinner@redhat.com" in it because my
local repos are owned by "david@fromorbit.com", so something in the
toolchain has mangled this. It'll need a resend...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] xfs: journal IO cache flush reductions
  2021-01-28 15:12   ` Brian Foster
@ 2021-01-28 21:46     ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 21:46 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 10:12:05AM -0500, Brian Foster wrote:
> On Thu, Jan 28, 2021 at 03:41:52PM +1100, Dave Chinner wrote:
> ...
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_log.c      | 34 ++++++++++++++++++++++------------
> >  fs/xfs/xfs_log_priv.h |  3 +++
> >  2 files changed, 25 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> > index c5e3da23961c..8de93893e0e6 100644
> > --- a/fs/xfs/xfs_log.c
> > +++ b/fs/xfs/xfs_log.c
> ...
> > @@ -2464,9 +2465,18 @@ xlog_write(
> >  		ASSERT(log_offset <= iclog->ic_size - 1);
> >  		ptr = iclog->ic_datap + log_offset;
> >  
> > -		/* start_lsn is the first lsn written to. That's all we need. */
> > -		if (!*start_lsn)
> > +		/*
> > +		 * Start_lsn is the first lsn written to. That's all the caller
> > +		 * needs to have returned. Setting it indicates the first iclog
> > +		 * of a new checkpoint or the commit record for a checkpoint, so
> > +		 * also mark the iclog as requiring a pre-flush to ensure all
> > +		 * metadata writeback or journal IO in the checkpoint is
> > +		 * correctly ordered against this new log write.
> > +		 */
> > +		if (!*start_lsn) {
> >  			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
> > +			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
> > +		}
> 
> My understanding is that one of the reasons for the preflush per iclog
> approach is that we don't have any submission -> completion ordering
> guarantees across iclogs. This is why we explicitly order commit record
> completions and whatnot, to ensure the important bits are ordered
> correctly. The fact we implement that ordering ourselves suggests that
> PREFLUSH|FUA itself do not provide such ordering, though that's not
> something I've investigated.

PREFLUSH provides ordering between completed IOs and the IO to be
submitted. It does not provide any ordering guarantees against IO
currently in flight, so the application needs to wait for the IOs it
needs to order against to complete before issuing an IO with
PREFLUSH.

i.e. PREFLUSH provides a "many" completion to "single" submission
ordering guarantee on stable storage.

REQ_FUA only guarantees that when the write IO completes, it is on
stable storage. It does not provide ordering guarantees against any
IO in flight, nor IOs submitted while it is in flight. Once it
completes, however, it is guaranteed that any later IO submission
will hit stable storage after that IO.

i.e. REQ_FUA provides a "single" completion to "many" submission
ordering guarantee on stable storage.
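
To make that distinction concrete, here is a minimal sketch of how the
two flags map onto a journal write bio. This is illustrative only (not
the real xlog_write_iclog() submission path), and the helper name and
parameters are made up for the example:

#include <linux/bio.h>
#include <linux/blk_types.h>

/*
 * Illustrative only: show which request flag expresses which ordering
 * guarantee for a journal write. Not the real iclog submission code.
 */
static void example_submit_journal_bio(
	struct bio	*bio,
	bool		first_in_checkpoint,
	bool		commit_record)
{
	bio->bi_opf = REQ_OP_WRITE;

	/*
	 * REQ_PREFLUSH: everything that has already *completed* before
	 * this bio is submitted is on stable storage before this bio
	 * is written.
	 */
	if (first_in_checkpoint || commit_record)
		bio->bi_opf |= REQ_PREFLUSH;

	/*
	 * REQ_FUA: this bio itself is on stable storage when it
	 * completes, so anything submitted after its completion is
	 * ordered behind it.
	 */
	if (commit_record)
		bio->bi_opf |= REQ_FUA;

	submit_bio(bio);
}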

> In any event, if the purpose fo the PREFLUSH is to ensure that metadata
> in the targeted LSN range is committed to stable storage, and we have no
> submission ordering guarantees across non-commit record iclogs, what
> prevents a subsequent iclog from the same checkpoint from completing
> before the first iclog with a PREFLUSH?

Fair point. I suspect that we should just do an explicit cache flush
before we start the checkpoint, and then we don't have to worry
about REQ_PREFLUSH for the first iclog in the checkpoint at all.

Actually, I wonder if we can pipeline that - submit an async cache
flush bio as soon as we enter the push work, then once we're ready
to call xlog_write() having pulled the hundreds of thousands of log
vectors off the CIL, we wait on the cache flush bio to complete.
This gets around the first iclog in a long checkpoint requiring
cache flushing or FUA. It also means that if there is a single
iclog for the checkpoint, we only need a FUA write as the cache
flush has already been done...
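
Roughly, that pipelining could look like the sketch below. This is a
hand-wavy illustration only; the structure and helper names
(cil_push_flush_*) are made up, and it assumes an empty
REQ_OP_WRITE | REQ_PREFLUSH bio is used as the standalone cache flush:

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk_types.h>
#include <linux/completion.h>

/* Hypothetical per-push state carrying the early cache flush bio. */
struct cil_push_flush {
	struct bio		bio;
	struct completion	done;
};

static void cil_push_flush_end_io(
	struct bio		*bio)
{
	struct cil_push_flush	*pf = container_of(bio,
					struct cil_push_flush, bio);

	complete(&pf->done);
}

/* Kick off an empty cache flush as soon as the push work starts. */
static void cil_push_flush_submit(
	struct cil_push_flush	*pf,
	struct block_device	*bdev)
{
	bio_init(&pf->bio, NULL, 0);
	bio_set_dev(&pf->bio, bdev);
	pf->bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	pf->bio.bi_end_io = cil_push_flush_end_io;
	init_completion(&pf->done);
	submit_bio(&pf->bio);
}

/*
 * ...format the CIL log vectors, then wait here just before calling
 * xlog_write() so the flush covers everything completed beforehand.
 */
static void cil_push_flush_wait(
	struct cil_push_flush	*pf)
{
	wait_for_completion(&pf->done);
}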

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/5] xfs: reduce buffer log item shadow allocations
  2021-01-28 16:54   ` Brian Foster
@ 2021-01-28 21:58     ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 21:58 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 11:54:35AM -0500, Brian Foster wrote:
> On Thu, Jan 28, 2021 at 03:41:54PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > When we modify btrees repeatedly, we regularly increase the size of
> > the logged region by a single chunk at a time (per transaction
> > commit). This results in the CIL formatting code having to
> > reallocate the log vector buffer every time the buffer dirty region
> > grows. Hence over a typical 4kB btree buffer, we might grow the log
> > vector 4096/128 = 32x over a short period where we repeatedly add
> > or remove records to/from the buffer over a series of running
> > transaction. This means we are doing 32 memory allocations and frees
> > over this time during a performance critical path in the journal.
> > 
> > The amount of space tracked in the CIL for the object is calculated
> > during the ->iop_format() call for the buffer log item, but the
> > buffer memory allocated for it is calculated by the ->iop_size()
> > call. The size callout determines the size of the buffer, the format
> > call determines the space used in the buffer.
> > 
> > Hence we can oversize the buffer space required in the size
> > calculation without impacting the amount of space used and accounted
> > to the CIL for the changes being logged. This allows us to reduce
> > the number of allocations by rounding up the buffer size to allow
> > for future growth. This can save a substantial amount of CPU time in
> > this path:
> > 
> > -   46.52%     2.02%  [kernel]                  [k] xfs_log_commit_cil
> >    - 44.49% xfs_log_commit_cil
> >       - 30.78% _raw_spin_lock
> >          - 30.75% do_raw_spin_lock
> >               30.27% __pv_queued_spin_lock_slowpath
> > 
> > (oh, ouch!)
> > ....
> >       - 1.05% kmem_alloc_large
> >          - 1.02% kmem_alloc
> >               0.94% __kmalloc
> > 
> > This overhead here is what this patch is aimed at. After:
> > 
> >       - 0.76% kmem_alloc_large
> >          - 0.75% kmem_alloc
> >               0.70% __kmalloc
> > 
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_buf_item.c | 13 +++++++++++--
> >  1 file changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> > index 17960b1ce5ef..0628a65d9c55 100644
> > --- a/fs/xfs/xfs_buf_item.c
> > +++ b/fs/xfs/xfs_buf_item.c
> ...
> > @@ -181,10 +182,18 @@ xfs_buf_item_size(
> >  	 * count for the extra buf log format structure that will need to be
> >  	 * written.
> >  	 */
> > +	bytes = 0;
> >  	for (i = 0; i < bip->bli_format_count; i++) {
> >  		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
> > -					  nvecs, nbytes);
> > +					  nvecs, &bytes);
> >  	}
> > +
> > +	/*
> > +	 * Round up the buffer size required to minimise the number of memory
> > +	 * allocations that need to be done as this item grows when relogged by
> > +	 * repeated modifications.
> > +	 */
> > +	*nbytes = round_up(bytes, 512);
> 
> If nbytes starts out as zero anyways, what's the need for the new
> variable? Otherwise looks reasonable.

Personal preference. It makes the code clear that we are returning
just the size of this item, not blindly adding it to an external
variable.

Just another example of an API wart that is a hold over from ancient
code from before the days of delayed logging. ->iop_size is always
called with nvecs = nbytes = 0, so they only ever return the
size/vecs the item will use. The ancient code passed running count
variables to these functions, not "always initialised to zero".
varaibles. We should really clean that up across the entire
interface at some point...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/5] xfs: log stripe roundoff is a property of the log
  2021-01-28 21:25   ` Darrick J. Wong
@ 2021-01-28 22:00     ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-01-28 22:00 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 01:25:11PM -0800, Darrick J. Wong wrote:
> On Thu, Jan 28, 2021 at 03:41:50PM +1100, Dave Chinner wrote:
> > @@ -3404,12 +3395,11 @@ xfs_log_ticket_get(
> >   * Figure out the total log space unit (in bytes) that would be
> >   * required for a log ticket.
> >   */
> > -int
> > -xfs_log_calc_unit_res(
> > -	struct xfs_mount	*mp,
> > +static int
> > +xlog_calc_unit_res(
> > +	struct xlog		*log,
> >  	int			unit_bytes)
> >  {
> > -	struct xlog		*log = mp->m_log;
> >  	int			iclog_space;
> >  	uint			num_headers;
> >  
> > @@ -3485,18 +3475,20 @@ xfs_log_calc_unit_res(
> >  	/* for commit-rec LR header - note: padding will subsume the ophdr */
> >  	unit_bytes += log->l_iclog_hsize;
> >  
> > -	/* for roundoff padding for transaction data and one for commit record */
> > -	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
> > -		/* log su roundoff */
> > -		unit_bytes += 2 * mp->m_sb.sb_logsunit;
> > -	} else {
> > -		/* BB roundoff */
> > -		unit_bytes += 2 * BBSIZE;
> > -        }
> > +	/* roundoff padding for transaction data and one for commit record */
> > +	unit_bytes += log->l_iclog_roundoff;
> 
> I don't understand why the "2 *" disappears here.  It's not a part of
> the roundoff computation when we allocate the log, so AFAICT it's not
> just buried elsewhere?
> 
> Was the old code saying that it added the roundoff factor twice because
> we needed to do so once for the transaction data and the second time for
> the commit record?

Just a bug. I originally copied this entire chunk into the log
init code (hence the comment), then I found another place it could
be used but it didn't need the "* 2" value. So I changed the init
site, forgetting to put it back here.

Will fix and resend.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/5] xfs: separate CIL commit record IO
       [not found]       ` <20210129145851.GB2660974@bfoster>
@ 2021-01-29 22:25         ` Dave Chinner
  2021-02-01 16:07           ` Brian Foster
  0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2021-01-29 22:25 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs

On Fri, Jan 29, 2021 at 09:58:51AM -0500, Brian Foster wrote:
> On Fri, Jan 29, 2021 at 08:22:21AM +1100, Dave Chinner wrote:
> > On Thu, Jan 28, 2021 at 10:07:41AM -0500, Brian Foster wrote:
> > > FYI, I finally got to reading your prior log cache flushing thread
> > > yesterday afternoon. I was planning to revisit that and probably reply
> > > this morning after having some time to digest, but saw this and so will
> > > reply here..
> > > 
> > > On Thu, Jan 28, 2021 at 03:41:51PM +1100, Dave Chinner wrote:
> > > > From: Dave Chinner <dchinner@redhat.com>
> > > > 
> > > > To allow for iclog IO device cache flush behaviour to be optimised,
> > > > we first need to separate out the commit record iclog IO from the
> > > > rest of the checkpoint so we can wait for the checkpoint IO to
> > > > complete before we issue the commit record.
> > > > 
> > > > This separation is only necessary if the commit record is being
> > > > written into a different iclog to the start of the checkpoint. If
> > > > the entire checkpoint and commit is in the one iclog, then they are
> > > > both covered by the one set of cache flush primitives on the iclog
> > > > and hence there is no need to separate them.
> > > > 
> > > 
> > > I find the description here a bit vague.. we have to separate out the
> > > commit record I/O, but only if it's already separate..?
> > 
> > Yes, because if the commit record is in a different iclog to the
> > start of the checkpoint, we have to ensure that the start of the
> > checkpoint has been written and completed before we issue a
> > PREFLUSH, otherwise the cache flush is not guaranteed to capture
> > that IO. i.e. the block layer cache flushing mechanisms do not order
> > against IOs in flight, only completed IOs. Hence if the commit
> > record is in a different iclog, we have to wait for the other iclogs
> > to complete before flushign the cache. If the checkpoint is wholly
> > contained within a single iclog, then we don't have to wait for
> > anything, the single journal IO has all the cache flushes we need.
> > 
> 
> Yeah, I get the functional behavior. I think the commit log would do
> better to refer to serializing or ordering the commit record (when split
> or separate) as opposed to separating it.

Except we already have "ordering the commit record" functionality
implemented in the CIL push code, where it explicitly orders the
commit record for the push against the commit records of other
pushes in progress.

So I don't think naming it "ordering the commit record" improves the
situation because it introduces ambiguity into what that means. It's
also not serialisation...

> > > > +int
> > > > +xlog_wait_on_iclog_lsn(
> > > > +	struct xlog_in_core	*iclog,
> > > > +	xfs_lsn_t		start_lsn)
> > > > +{
> > > > +	struct xlog		*log = iclog->ic_log;
> > > > +	struct xlog_in_core	*prev;
> > > > +	int			error = -EIO;
> > > > +
> > > > +	spin_lock(&log->l_icloglock);
> > > > +	if (XLOG_FORCED_SHUTDOWN(log))
> > > > +		goto out_unlock;
> > > > +
> > > > +	error = 0;
> > > > +	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
> > > > +
> > > > +		/* Done if the lsn is before our start lsn */
> > > > +		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
> > > > +				start_lsn) < 0)
> > > > +			break;
> > > 
> > > Hmm.. that logic looks a bit dodgy when you consider that the iclog
> > > header lsn is reset to zero on activation. I think it actually works as
> > > intended because iclog completion reactivates iclogs in LSN order and
> > > this loop walks in reverse order, but that is a very subtle connection
> > > that might be useful to document.
> > 
> > It is documented in the comment above the function "We walk
> > backwards through the iclogs to find the iclog....". A newly
> > activated iclog will have an LSN of zero, and that means there are
> > no prior iclogs in the list in the range we need to flush because of
> > the completion ordering guarantees we have for iclog IO (i.e. they
> > are always completed in ring order, not IO completion order).
> > 
> 
> Can you update the comment with that sentence to be more explicit about
> the zero LSN case?


> 
> > > > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > > > index b0ef071b3cb5..c5cc1b7ad25e 100644
> > > > --- a/fs/xfs/xfs_log_cil.c
> > > > +++ b/fs/xfs/xfs_log_cil.c
> > > > @@ -870,6 +870,13 @@ xlog_cil_push_work(
> > > >  	wake_up_all(&cil->xc_commit_wait);
> > > >  	spin_unlock(&cil->xc_push_lock);
> > > >  
> > > > +	/*
> > > > +	 * If the checkpoint spans multiple iclogs, wait for all previous
> > > > +	 * iclogs to complete before we submit the commit_iclog.
> > > > +	 */
> > > > +	if (ctx->start_lsn != commit_lsn)
> > > > +		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
> > > > +
> > > 
> > > This is an interesting serialization point because we don't necessarily
> > > submit the iclog that holds the commit record. I actually think in most
> > > cases the separate commit record won't land on disk until the next CIL
> > > push causes a sync of the current head iclog (assuming it fills it up).
> > 
> > Which is exactly what happens now. It's not a big deal, because if
> > we are running a sync transaction, the log force will push it to
> > disk and wait for it. CIL pushes are not synchronous and never have
> > been - the log force is what creates the appearance of synchronous
> > behaviour to the caller...
> > 
> 
> Right, but otherwise the CIL push blocks on previous checkpoints (and
> now the majority of the current checkpoint) only to write the commit
> record into an iclog and probably not submit it until the next CIL
> checkpoint.

That's exactly the same behaviour we have now. Nothing guarantees
that the iclog containing a commit record is written immediately.
So this simply isn't a concern because we are not changing
behaviour.

> I agree it's probably not a big deal. The same behavior
> likely repeats for subsequent CIL pushes, so it may not even be
> noticeable. This is just an observation that we're compounding more
> serialization

This does not add serialisation!

Serialisation means you do something that will stall *other tasks*
because they have to wait for the current task to finish before they
can proceed. Adding an _ordering delay_ into an unserialised task
does not create new serialisation points, it just lengthens the
runtime of that task.

> on another bit of a historical impedance mismatch between
> delayed logging and the underlying layer. We've potentially just wrote
> out a large context, yet none of the attached items can be processed
> until another checkpoint occurs because the callbacks that do AIL
> insertion and whatnot don't run until the iclog with the commit record
> hits the disk.

Yup, classic two phase commit algorithm behaviour right there. We
already have this occurring and it most definitely is not a problem
that we need to solve. Indeed, this behaviour is the prime reason
for log covering to exist...

> This isn't necessarily a commentary on this patch, but it would be nice
> if we could start thinking about handling that situation better;
> particularly if we've just blocked waiting for the majority of that
> checkpoint I/O to complete.

The thing is, we are already blocking waiting for iclog IO to
complete. We're doing it at a rate of at least 1 in every 4 iclogs
we need to fill in long running checkpoints. So the CIL push that
needs to write a thousand iclogs before the commit record is written
has already blocked on iclog IO completion at least 250 times.

In comparison, with the cache flush changes and this commit record
separation, I'm seeing the iclog block rate drop to 1 in 100 iclogs
written. The fact is that cache flushing is -far more expensive- and
results in far more blocking and IO completion waiting in the CIL
push than this code that orders the commit record by waiting for
iclog completion to occur.

CIL checkpoint submission is far more IO efficient and *much lower
latency* with this single software level ordering delay than using
hardware cache flushes to guarantee ordering. To focus on "this
new ordering delay might add latency" completely misses the bigger
picture that we just replaced 2-3 orders of magnitude of IO blocking
delays with a single ordering delay.

That's what's important here - it's not that we've added a single
ordering delay into the checkpoint, but that we've removed
*hundreds* of IO blocking delays in the checkpoint writing process.

> That means there's an increased chance that
> the next iclog in the ring may be active. Perhaps we could introduce
> some logic to switch out the commit record iclog before the CIL push
> returns in those particular cases.  For example, switch out if the
> current checkpoint hit multiple iclogs..? Or hit multiple iclogs and the
> next iclog in the ring is clean..?

We could just call xlog_state_switch_iclogs() to mark it WANT_SYNC,
but then we can't aggregate more changes into it and fill it up. If
someone starts waiting on that iclog (i.e. a log force), then it
immediately gets marked WANT_SYNC and submitted to disk when it is
released. But if there is no-one waiting on it, then we largely
don't care if an asynchronous checkpoint is committed immediately,
at the start of the next checkpoint, or at worst, within 30s when
the log worker next kicks a log force....

It's just not necessary to ensure the commit record hits the disk
with minimal latency...
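
For reference, "switching out" the commit record iclog at the end of
the push would only be a few lines. This is a sketch of the option
being discussed, not proposed code - it assumes
xlog_state_switch_iclogs() were visible to the CIL code (it is
currently static to xfs_log.c) and, as noted, it defeats aggregating
further changes into that iclog:

	/*
	 * Sketch only: mark the commit record iclog WANT_SYNC so it is
	 * submitted as soon as the CIL push drops its reference.
	 */
	spin_lock(&log->l_icloglock);
	if (commit_iclog->ic_state == XLOG_STATE_ACTIVE)
		xlog_state_switch_iclogs(log, commit_iclog, 0);
	spin_unlock(&log->l_icloglock);
	xfs_log_release_iclog(commit_iclog);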

> > > From a performance perspective, it seems like this makes the CIL push
> > > _nearly_ synchronous by default.
> > 
> > Nope, not at all. There is nothing waiting on the CIL push to
> > complete, except maybe a log force waiting for the iclogs to be
> > synced to disk and completed.
> > 
> 
> Right, that's sort of the point... we have various places that issue
> async log forces. These async log forces push the CIL, which
> historically only ever blocks on log I/O by virtue of waiting on iclog
> space. IOW, there is a bit of runway between an async CIL push blocking
> on I/O or not provided by the iclog mechanism.
> 
> With this patch, a CIL push (and thus an async log force) now does not
> return until all but the commit record iclog have been submitted for I/O
> and completed.

Same as right now. I have not changed anything.

> The log force will then submit the commit record log, but
> then not wait on it. This is obviously not a functional problem since a
> CIL push was never guaranteed to not block at all (iclogs are a limited
> resource), but there's clearly a change in behavior here worth reasoning
> about for async forces..

No, there is no change in behaviour at all. The log force in both
cases submits the commit record.

> > > I.e., if we have a several hundred MB
> > > CIL context,
> > 
> > Not possible. A 2GB log has a hard throttle limit now of 64MB.
> > 
> 
> Well, I don't think the throttle guarantees an upper bound on the
> context size, but that's a different topic.

Sure, it's a bit lazy. But once you capture the user tasks that run
commits on the throttle, the size stops growing.

> That was just an example of
> a largish sized checkpoint. 64MB is still plenty enough to require
> multiple passes through the set of iclogs. Regardless, I think the
> before and after change is not significant given that a 64MB checkpoint
> already requires internal blocking to reuse iclog space as the
> checkpoint flushes anyways. So it's really just (l_iclog_bufs - 1) of
> additional I/O we're waiting for in the worst case.

The metrics that prove this assertion wrong are in the commit
message of the next patch. i.e. I measured how many "noiclog" events
are occurring during CIL checkpoints. I mentioned that above already
- reducing the IO blocking rate from hundreds of events per
checkpoint down to a small handful is a major win....
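
(For anyone wanting to reproduce the measurement: the noiclog counter
is exported in the "log" line of /proc/fs/xfs/stat. A rough userspace
reader follows; the field order is an assumption taken from the stats
code, not something defined by this series.)

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/fs/xfs/stat", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			unsigned long writes, blocks, noiclogs, force, sleep;

			/* log <writes> <blocks> <noiclogs> <force> <force_sleep> */
			if (sscanf(line, "log %lu %lu %lu %lu %lu", &writes,
				   &blocks, &noiclogs, &force, &sleep) == 5) {
				printf("iclog writes %lu, noiclog waits %lu\n",
				       writes, noiclogs);
				break;
			}
		}
		fclose(f);
		return 0;
	}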

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
  2021-01-28 15:07   ` Brian Foster
@ 2021-01-30  9:13   ` Chandan Babu R
  2021-02-01 12:59   ` Christoph Hellwig
  2 siblings, 0 replies; 29+ messages in thread
From: Chandan Babu R @ 2021-01-30  9:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On 28 Jan 2021 at 10:11, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> To allow for iclog IO device cache flush behaviour to be optimised,
> we first need to separate out the commit record iclog IO from the
> rest of the checkpoint so we can wait for the checkpoint IO to
> complete before we issue the commit record.
>
> This separate is only necessary if the commit record is being
> written into a different iclog to the start of the checkpoint. If
> the entire checkpoint and commit is in the one iclog, then they are
> both covered by the one set of cache flush primitives on the iclog
> and hence there is no need to separate them.
>
> Otherwise, we need to wait for all the previous iclogs to complete
> so they are ordered correctly and made stable by the REQ_PREFLUSH
> that the commit record iclog IO issues. This guarantees that if a
> reader sees the commit record in the journal, they will also see the
> entire checkpoint that commit record closes off.
>
> This also provides the guarantee that when the commit record IO
> completes, we can safely unpin all the log items in the checkpoint
> so they can be written back because the entire checkpoint is stable
> in the journal.
>

W.r.t correctness of the changes in this patch,

Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log.c      | 47 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_log_cil.c  |  7 +++++++
>  fs/xfs/xfs_log_priv.h |  2 ++
>  3 files changed, 56 insertions(+)
>
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index c5f507c24577..c5e3da23961c 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -808,6 +808,53 @@ xlog_wait_on_iclog(
>  	return 0;
>  }
>  
> +/*
> + * Wait on any iclogs that are still flushing in the range of start_lsn to
> + * the current iclog's lsn. The caller holds a reference to the iclog, but
> + * otherwise holds no log locks.
> + *
> + * We walk backwards through the iclogs to find the iclog with the highest lsn
> + * in the range that we need to wait for and then wait for it to complete.
> + * Completion ordering of iclog IOs ensures that all prior iclogs to this IO are
> + * complete by the time our candidate has completed.
> + */
> +int
> +xlog_wait_on_iclog_lsn(
> +	struct xlog_in_core	*iclog,
> +	xfs_lsn_t		start_lsn)
> +{
> +	struct xlog		*log = iclog->ic_log;
> +	struct xlog_in_core	*prev;
> +	int			error = -EIO;
> +
> +	spin_lock(&log->l_icloglock);
> +	if (XLOG_FORCED_SHUTDOWN(log))
> +		goto out_unlock;
> +
> +	error = 0;
> +	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
> +
> +		/* Done if the lsn is before our start lsn */
> +		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
> +				start_lsn) < 0)
> +			break;
> +
> +		/* Don't need to wait on completed, clean iclogs */
> +		if (prev->ic_state == XLOG_STATE_DIRTY ||
> +		    prev->ic_state == XLOG_STATE_ACTIVE) {
> +			continue;
> +		}
> +
> +		/* wait for completion on this iclog */
> +		xlog_wait(&prev->ic_force_wait, &log->l_icloglock);
> +		return 0;
> +	}
> +
> +out_unlock:
> +	spin_unlock(&log->l_icloglock);
> +	return error;
> +}
> +
>  /*
>   * Write out an unmount record using the ticket provided. We have to account for
>   * the data space used in the unmount ticket as this write is not done from a
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index b0ef071b3cb5..c5cc1b7ad25e 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -870,6 +870,13 @@ xlog_cil_push_work(
>  	wake_up_all(&cil->xc_commit_wait);
>  	spin_unlock(&cil->xc_push_lock);
>  
> +	/*
> +	 * If the checkpoint spans multiple iclogs, wait for all previous
> +	 * iclogs to complete before we submit the commit_iclog.
> +	 */
> +	if (ctx->start_lsn != commit_lsn)
> +		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
> +
>  	/* release the hounds! */
>  	xfs_log_release_iclog(commit_iclog);
>  	return;
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 037950cf1061..a7ac85aaff4e 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -584,6 +584,8 @@ xlog_wait(
>  	remove_wait_queue(wq, &wait);
>  }
>  
> +int xlog_wait_on_iclog_lsn(struct xlog_in_core *iclog, xfs_lsn_t start_lsn);
> +
>  /*
>   * The LSN is valid so long as it is behind the current LSN. If it isn't, this
>   * means that the next log record that includes this metadata could have a


-- 
chandan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] xfs: journal IO cache flush reductions
  2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
  2021-01-28 15:12   ` Brian Foster
  2021-01-28 21:26   ` Dave Chinner
@ 2021-01-30 12:56   ` Chandan Babu R
  2 siblings, 0 replies; 29+ messages in thread
From: Chandan Babu R @ 2021-01-30 12:56 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On 28 Jan 2021 at 10:11, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Currently every journal IO is issued as REQ_PREFLUSH | REQ_FUA to
> guarantee the ordering requirements the journal has w.r.t. metadata
> writeback. The two ordering constraints are:
>
> 1. we cannot overwrite metadata in the journal until we guarantee
> that the dirty metadata has been written back in place and is
> stable.
>
> 2. we cannot write back dirty metadata until it has been written to
> the journal and guaranteed to be stable (and hence recoverable) in
> the journal.
>
> The ordering guarantees of #1 are provided by REQ_PREFLUSH. This
> causes the journal IO to issue a cache flush and wait for it to
> complete before issuing the write IO to the journal. Hence all
> completed metadata IO is guaranteed to be stable before the journal
> overwrites the old metadata.
>
> The ordering guarantees of #2 are provided by the REQ_FUA, which
> ensures the journal writes do not complete until they are on stable
> storage. Hence by the time the last journal IO in a checkpoint
> completes, we know that the entire checkpoint is on stable storage
> and we can unpin the dirty metadata and allow it to be written back.
>
> This is the mechanism by which ordering was first implemented in XFS
> way back in 2002 by this commit:
>
> commit 95d97c36e5155075ba2eb22b17562cfcc53fcf96
> Author: Steve Lord <lord@sgi.com>
> Date:   Fri May 24 14:30:21 2002 +0000
>
>     Add support for drive write cache flushing - should the kernel
>     have the infrastructure
>
> A lot has changed since then, most notably we now use delayed
> logging to checkpoint the filesystem to the journal rather than
> write each individual transaction to the journal. Cache flushes on
> journal IO are necessary when individual transactions are wholly
> contained within a single iclog. However, CIL checkpoints are single
> transactions that typically span hundreds to thousands of individual
> journal writes, and so the requirements for device cache flushing
> have changed.
>
> That is, the ordering rules I state above apply to ordering of
> atomic transactions recorded in the journal, not to the journal IO
> itself. Hence we need to ensure metadata is stable before we start
> writing a new transaction to the journal (guarantee #1), and we need
> to ensure the entire transaction is stable in the journal before we
> start metadata writeback (guarantee #2).
>
> Hence we only need a REQ_PREFLUSH on the journal IO that starts a
> new journal transaction to provide #1, and it is not needed on any other
> journal IO done within the context of that journal transaction.
>
> To ensure that the entire journal transaction is on stable storage
> before we run the completion code that unpins all the dirty metadata
> recorded in the journal transaction, the last write of the
> transaction must also ensure that the entire journal transaction is
> stable. We already know what IO that will be, thanks to the commit
> record we explicitly write to complete the transaction. We can order
> all the previous journal IO for this transaction by waiting for all
> the previous iclogs containing the transaction data to complete
> their IO, then issuing the commit record IO using REQ_PREFLUSH
> | REQ_FUA. The preflush ensures all the previous journal IO is
> stable before the commit record hits the log, and the REQ_FUA
> ensures that the commit record is stable before completion is
> signalled to the filesystem.
>
> Hence using REQ_PREFLUSH on the first IO of a journal transaction,
> and then ordering the journal IO before issuing the commit record
> with REQ_PREFLUSH | REQ_FUA, we get all the same ordering guarantees
> that we currently achieve by issuing all journal IO with cache
> flushes.
>
> As an optimisation, when the commit record lands in the same iclog
> as the journal transaction starts, we don't need to wait for
> anything and can simply issue the journal IO with REQ_PREFLUSH |
> REQ_FUA as we do right now. This means that for fsync() heavy
> workloads, the cache flush behaviour is completely unchanged and
> there is no degradation in performance as a result of optimising the
> multi-IO transaction case.
>
> To further simplify the implementation, we also issue the initial IO
> in a journal transaction with REQ_FUA. This ensures the journal is
> dirtied by the first IO in a long running transaction as quickly as
> possible. This helps ensure that log recovery will at least have a
> transaction header for the incomplete transaction in the log similar
> to the stable journal write behaviour we have now.
>
> The most notable sign that there is less IO latency on my test
> machine (nvme SSDs) is that the "noiclogs" rate has dropped
> substantially. This metric indicates that the CIL push is blocking
> in xlog_get_iclog_space() waiting for iclog IO completion to occur.
> With 8 iclogs of 256kB, the rate is approximately 1 noiclog event to
> every 4 iclog writes. IOWs, every 4th call to xlog_get_iclog_space()
> is blocking waiting for log IO. With the changes in this patch, this
> drops to 1 noiclog event for every 100 iclog writes. Hence it is
> clear that log IO is completing much faster than it was previously,
> but it is also clear that for large iclog sizes, this isn't the
> performance limiting factor on this hardware.
>
> With smaller iclogs (32kB), however, there is a substantial
> difference. With the cache flush modifications, the journal is now
> running at over 4000 write IOPS, and the journal throughput is
> largely identical to the 256kB iclogs and the noiclog event rate
> stays low at about 1:50 iclog writes. The existing code tops out at
> about 2500 IOPS as the number of cache flushes dominates performance
> and latency. The noiclog event rate is about 1:4, and the
> performance variance is quite large as the journal throughput can
> fall to less than half the peak sustained rate when the cache flush
> rate prevents metadata writeback from keeping up and the log runs
> out of space and throttles reservations.
>
> As a result:
>
> 	logbsize	fsmark create rate	rm -rf
> before	32kb		152851+/-5.3e+04	5m28s
> patched	32kb		221533+/-1.1e+04	5m24s
>
> before	256kb		220239+/-6.2e+03	4m58s
> patched	256kb		228286+/-9.2e+03	5m06s
>
> The rm -rf times are included because I ran them, but the
> differences are largely noise. This workload is largely metadata
> read IO latency bound and the changes to the journal cache flushing
> doesn't really make any noticeable difference to behaviour apart from
> a reduction in noiclog events from background CIL pushing.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log.c      | 34 ++++++++++++++++++++++------------
>  fs/xfs/xfs_log_priv.h |  3 +++
>  2 files changed, 25 insertions(+), 12 deletions(-)
>
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index c5e3da23961c..8de93893e0e6 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -1804,8 +1804,7 @@ xlog_write_iclog(
>  	struct xlog		*log,
>  	struct xlog_in_core	*iclog,
>  	uint64_t		bno,
> -	unsigned int		count,
> -	bool			need_flush)
> +	unsigned int		count)
>  {
>  	ASSERT(bno < log->l_logBBsize);
>
> @@ -1843,10 +1842,11 @@ xlog_write_iclog(
>  	 * writeback throttle from throttling log writes behind background
>  	 * metadata writeback and causing priority inversions.
>  	 */
> -	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
> -				REQ_IDLE | REQ_FUA;
> -	if (need_flush)
> -		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
> +	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
> +	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
> +		iclog->ic_bio.bi_opf |= REQ_PREFLUSH | REQ_FUA;
> +		iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
> +	}
>
>  	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
>  		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
> @@ -1949,7 +1949,7 @@ xlog_sync(
>  	unsigned int		roundoff;       /* roundoff to BB or stripe */
>  	uint64_t		bno;
>  	unsigned int		size;
> -	bool			need_flush = true, split = false;
> +	bool			split = false;
>
>  	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
>
> @@ -2007,13 +2007,14 @@ xlog_sync(
>  	 * synchronously here; for an internal log we can simply use the block
>  	 * layer state machine for preflushes.
>  	 */
> -	if (log->l_targ != log->l_mp->m_ddev_targp || split) {
> +	if (log->l_targ != log->l_mp->m_ddev_targp ||
> +	    (split && (iclog->ic_flags & XLOG_ICL_NEED_FLUSH))) {
>  		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
> -		need_flush = false;
> +		iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
>  	}

When the log is stored on an external device and the iclog above contains
the commit record of a multi-IO journal transaction, the "if" statement's
condition evaluates to true. This causes the data device's cache to be
flushed and the XLOG_ICL_NEED_FLUSH flag to be removed from iclog->ic_flags.

The call to xlog_write_iclog() below will then submit a bio that does not
have the "REQ_PREFLUSH | REQ_FUA" flags set. Hence constraint #2 is violated,
since the log device's cache has not been flushed by the time the filesystem
is informed that the IO has completed.
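
One possible way to preserve constraint #2 here (a sketch only, not
something posted in this series) would be to keep XLOG_ICL_NEED_FLUSH
set when the log sits on an external device, so the commit record bio
still carries REQ_PREFLUSH | REQ_FUA to the log device, while the
explicit flush of the data device continues to provide constraint #1:

	if (log->l_targ != log->l_mp->m_ddev_targp) {
		/* external log: only the data device needs the explicit flush */
		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
	} else if (split && (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)) {
		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
		iclog->ic_flags &= ~XLOG_ICL_NEED_FLUSH;
	}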


>
>  	xlog_verify_iclog(log, iclog, count);
> -	xlog_write_iclog(log, iclog, bno, count, need_flush);
> +	xlog_write_iclog(log, iclog, bno, count);
>  }
>
>  /*
> @@ -2464,9 +2465,18 @@ xlog_write(
>  		ASSERT(log_offset <= iclog->ic_size - 1);
>  		ptr = iclog->ic_datap + log_offset;
>
> -		/* start_lsn is the first lsn written to. That's all we need. */
> -		if (!*start_lsn)
> +		/*
> +		 * Start_lsn is the first lsn written to. That's all the caller
> +		 * needs to have returned. Setting it indicates the first iclog
> +		 * of a new checkpoint or the commit record for a checkpoint, so
> +		 * also mark the iclog as requiring a pre-flush to ensure all
> +		 * metadata writeback or journal IO in the checkpoint is
> +		 * correctly ordered against this new log write.
> +		 */
> +		if (!*start_lsn) {
>  			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
> +			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
> +		}
>
>  		/*
>  		 * This loop writes out as many regions as can fit in the amount
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index a7ac85aaff4e..9f1e627ccb74 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -133,6 +133,8 @@ enum xlog_iclog_state {
>
>  #define XLOG_COVER_OPS		5
>
> +#define XLOG_ICL_NEED_FLUSH     (1 << 0)        /* iclog needs REQ_PREFLUSH */
> +
>  /* Ticket reservation region accounting */
>  #define XLOG_TIC_LEN_MAX	15
>
> @@ -201,6 +203,7 @@ typedef struct xlog_in_core {
>  	u32			ic_size;
>  	u32			ic_offset;
>  	enum xlog_iclog_state	ic_state;
> +	unsigned int		ic_flags;
>  	char			*ic_datap;	/* pointer to iclog data */
>
>  	/* Callback structures need their own cacheline */


--
chandan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 0/5] xfs: various log stuff...
  2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
                   ` (4 preceding siblings ...)
  2021-01-28  4:41 ` [PATCH 5/5] xfs: reduce buffer log item shadow allocations Dave Chinner
@ 2021-02-01 12:39 ` Christoph Hellwig
  2021-02-03 21:20   ` Dave Chinner
  5 siblings, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2021-02-01 12:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:49PM +1100, Dave Chinner wrote:
> Hi folks,
> 
> Quick patch dump for y'all. A couple of minor cleanups to the
> log behaviour, a fix for the CIL throttle hang and a couple of
> patches to rework the cache flushing that journal IO does to reduce
> the number of cache flushes by a couple of orders of magnitude.
> 
> All passes fstests with no regressions, no performance regressions
> from fsmark, dbench and various fio workloads, some big gains even
> on fast storage.

Can you elaborate on the big gains?  Workloads for one, but also
what kind of storage.  For less FUA/flush to matter the device needs
to have a write cache, which none of the really fast SSDs even has.

So I'd only really expect gains from that on consumer grade SSDs and
hard drives.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
  2021-01-28 15:07   ` Brian Foster
  2021-01-30  9:13   ` Chandan Babu R
@ 2021-02-01 12:59   ` Christoph Hellwig
  2 siblings, 0 replies; 29+ messages in thread
From: Christoph Hellwig @ 2021-02-01 12:59 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Thu, Jan 28, 2021 at 03:41:51PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> To allow for iclog IO device cache flush behaviour to be optimised,
> we first need to separate out the commit record iclog IO from the
> rest of the checkpoint so we can wait for the checkpoint IO to
> complete before we issue the commit record.
> 
> This separate is only necessary if the commit record is being

s/separate/separation/g

> written into a different iclog to the start of the checkpoint. If
> the entire checkpoint and commit is in the one iclog, then they are
> both covered by the one set of cache flush primitives on the iclog
> and hence there is no need to separate them.
> 
> Otherwise, we need to wait for all the previous iclogs to complete
> so they are ordered correctly and made stable by the REQ_PREFLUSH
> that the commit record iclog IO issues. This guarantees that if a
> reader sees the commit record in the journal, they will also see the
> entire checkpoint that commit record closes off.
> 
> This also provides the guarantee that when the commit record IO
> completes, we can safely unpin all the log items in the checkpoint
> so they can be written back because the entire checkpoint is stable
> in the journal.

I'm a little worried about the direction for devices without a volatile
write cache like all high-end enterprise SSDs, arrays and hard drives,
where we now introduce another synchronization point without any gains
from the reduction in FUA/flush traffic that is a no-op there.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/5] xfs: separate CIL commit record IO
  2021-01-29 22:25         ` Dave Chinner
@ 2021-02-01 16:07           ` Brian Foster
  0 siblings, 0 replies; 29+ messages in thread
From: Brian Foster @ 2021-02-01 16:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On Sat, Jan 30, 2021 at 09:25:59AM +1100, Dave Chinner wrote:
> On Fri, Jan 29, 2021 at 09:58:51AM -0500, Brian Foster wrote:
> > On Fri, Jan 29, 2021 at 08:22:21AM +1100, Dave Chinner wrote:
> > > On Thu, Jan 28, 2021 at 10:07:41AM -0500, Brian Foster wrote:
> > > > FYI, I finally got to reading your prior log cache flushing thread
> > > > yesterday afternoon. I was planning to revisit that and probably reply
> > > > this morning after having some time to digest, but saw this and so will
> > > > reply here..
> > > > 
> > > > On Thu, Jan 28, 2021 at 03:41:51PM +1100, Dave Chinner wrote:
> > > > > From: Dave Chinner <dchinner@redhat.com>
> > > > > 
> > > > > To allow for iclog IO device cache flush behaviour to be optimised,
> > > > > we first need to separate out the commit record iclog IO from the
> > > > > rest of the checkpoint so we can wait for the checkpoint IO to
> > > > > complete before we issue the commit record.
> > > > > 
> > > > > This separate is only necessary if the commit record is being
> > > > > written into a different iclog to the start of the checkpoint. If
> > > > > the entire checkpoint and commit is in the one iclog, then they are
> > > > > both covered by the one set of cache flush primitives on the iclog
> > > > > and hence there is no need to separate them.
> > > > > 
> > > > 
> > > > I find the description here a bit vague.. we have to separate out the
> > > > commit record I/O, but only if it's already separate..?
> > > 
> > > Yes, because if the commit record is in a different iclog to the
> > > start of the checkpoint, we have to ensure that the start of the
> > > checkpoint has been written and completed before we issue a
> > > PREFLUSH, otherwise the cache flush is not guaranteed to capture
> > > that IO. i.e. the block layer cache flushing mechanisms do not order
> > > against IOs in flight, only completed IOs. Hence if the commit
> > > record is in a different iclog, we have to wait for the other iclogs
> > > to complete before flushing the cache. If the checkpoint is wholly
> > > contained within a single iclog, then we don't have to wait for
> > > anything, the single journal IO has all the cache flushes we need.
> > > 
> > 
> > Yeah, I get the functional behavior. I think the commit log would do
> > better to refer to serializing or ordering the commit record (when split
> > or separate) as opposed to separating it.
> 
> Except we already have "ordering the commit record" functionality
> implemented in the CIL push code, where it explicitly orders the
> commit record for the push against the commit records of other
> pushes in progress.
> 
> So I don't think naming it "ordering the commit record" improves the
> situation because it introduces ambiguity into what that means. It's
> also not serialisation...
> 

I'm not sure we can get much more vague than "separate commit record
IO." :P We can fairly easily distinguish between ordering of independent
commit records and ordering a particular commit record against log I/O
of its associated checkpoint. For example:

  xfs: block commit record iclog submission on previous iclog completion

... but feel free to come up with something better, use the term
ordering instead of blocking, etc.

> > > > > +int
> > > > > +xlog_wait_on_iclog_lsn(
> > > > > +	struct xlog_in_core	*iclog,
> > > > > +	xfs_lsn_t		start_lsn)
> > > > > +{
> > > > > +	struct xlog		*log = iclog->ic_log;
> > > > > +	struct xlog_in_core	*prev;
> > > > > +	int			error = -EIO;
> > > > > +
> > > > > +	spin_lock(&log->l_icloglock);
> > > > > +	if (XLOG_FORCED_SHUTDOWN(log))
> > > > > +		goto out_unlock;
> > > > > +
> > > > > +	error = 0;
> > > > > +	for (prev = iclog->ic_prev; prev != iclog; prev = prev->ic_prev) {
> > > > > +
> > > > > +		/* Done if the lsn is before our start lsn */
> > > > > +		if (XFS_LSN_CMP(be64_to_cpu(prev->ic_header.h_lsn),
> > > > > +				start_lsn) < 0)
> > > > > +			break;
> > > > 
> > > > Hmm.. that logic looks a bit dodgy when you consider that the iclog
> > > > header lsn is reset to zero on activation. I think it actually works as
> > > > intended because iclog completion reactivates iclogs in LSN order and
> > > > this loop walks in reverse order, but that is a very subtle connection
> > > > that might be useful to document.
> > > 
> > > It is documented in the comment above the function "We walk
> > > backwards through the iclogs to find the iclog....". A newly
> > > activated iclog will have an LSN of zero, and that means there are
> > > no prior iclogs in the list in the range we need to flush because of
> > > the completion ordering guarantees we have for iclog IO (i.e. they
> > > are always completed in ring order, not IO completion order).
> > > 
> > 
> > Can you update the comment with that sentence to be more explicit about
> > the zero LSN case?
> 
> 
> > 
> > > > > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > > > > index b0ef071b3cb5..c5cc1b7ad25e 100644
> > > > > --- a/fs/xfs/xfs_log_cil.c
> > > > > +++ b/fs/xfs/xfs_log_cil.c
> > > > > @@ -870,6 +870,13 @@ xlog_cil_push_work(
> > > > >  	wake_up_all(&cil->xc_commit_wait);
> > > > >  	spin_unlock(&cil->xc_push_lock);
> > > > >  
> > > > > +	/*
> > > > > +	 * If the checkpoint spans multiple iclogs, wait for all previous
> > > > > +	 * iclogs to complete before we submit the commit_iclog.
> > > > > +	 */
> > > > > +	if (ctx->start_lsn != commit_lsn)
> > > > > +		xlog_wait_on_iclog_lsn(commit_iclog, ctx->start_lsn);
> > > > > +
> > > > 
> > > > This is an interesting serialization point because we don't necessarily
> > > > submit the iclog that holds the commit record. I actually think in most
> > > > cases the separate commit record won't land on disk until the next CIL
> > > > push causes a sync of the current head iclog (assuming it fills it up).
> > > 
> > > Which is exactly what happens now. It's not a big deal, because if
> > > we are running a sync transaction, the log force will push it to
> > > disk and wait for it. CIL pushes are not synchronous and never have
> > > been - the log force is what creates the appearance of synchronous
> > > behaviour to the caller...
> > > 
> > 
> > Right, but otherwise the CIL push blocks on previous checkpoints (and
> > now the majority of the current checkpoint) only to write the commit
> > record into an iclog and probably not submit it until the next CIL
> > checkpoint.
> 
> That's exactly the same behaviour we have now. Nothing guarantees
> that the iclog containing a commit record is written immediately.
> So this simply isn't a concern because we are not changing
> behaviour.
> 
> > I agree it's probably not a big deal. The same behavior
> > likely repeats for subsequent CIL pushes, so it may not even be
> > noticeable. This is just an observation that we're compounding more
> > serialization
> 
> This does not add serialisation!
> 
> Serialisation means you do something that will stall *other tasks*
> because they have to wait for the current task to finish before they
> can proceed. Adding an _ordering delay_ into an unserialised task
> does not create new serialisation points, it just lengthens the
> runtime of that task.
> 

Perhaps serialization is not the right term... I/O synchronization
perhaps? I don't know, but calling it an ordering delay is certainly not
comprehensive because it has functional ramifications..

> > on another bit of a historical impedance mismatch between
> > delayed logging and the underlying layer. We've potentially just wrote
> > out a large context, yet none of the attached items can be processed
> > until another checkpoint occurs because the callbacks that do AIL
> > insertion and whatnot don't run until the iclog with the commit record
> > hits the disk.
> 
> Yup, classic two phase commit algorithm behaviour right there. We
> already have this occurring and it most definitely is not a problem
> that we need to solve. Indeed, this behaviour is the prime reason
> for log covering to exist...
> 
> > This isn't necessarily a commentary on this patch, but it would be nice
> > if we could start thinking about handling that situation better;
> > particularly if we've just blocked waiting for the majority of that
> > checkpoint I/O to complete.
> 
> The thing is, we are already blocking waiting for iclog IO to
> complete. We're doing it at a rate of at least 1 in every 4 iclogs
> we need to fill in long running checkpoints. So the CIL push that
> needs to write a thousand iclogs before the commit record is written
> has already blocked on iclog IO completion at least 250 times.
> 
> In comparison, with the cache flush changes and this commit record
> separation, I'm seeing the iclog block rate drop to 1 in 100 iclogs
> written. The fact is that cache flushing is -far more expensive- and
> results in far more blocking and IO completion waiting in the CIL
> push than this code that orders the commit record by waiting for
> iclog completion to occur.
> 

What exactly is "iclog block rate?" Does the calculation of the new rate
factor in the implicit iclog blocking we now have by default?

> CIL checkpoint submission is far more IO efficient and *much lower
> latency* with this single software level ordering delay than using
> hardware cache flushes to guarantee ordering. To focus on "this
> new ordering delay might add latency" completely misses the bigger
> picture that we just replaced 2-3 orders of magnitude of IO blocking
> delays with a single ordering delay.
> 
> That's what's important here - it's not that we've added a single
> ordering delay into the checkpoint, but that we've removed
> *hundreds* of IO blocking delays in the checkpoint writing process.
> 

Ok. I think Christoph raised a good point around storage without
volatile caches, but otherwise I find the performance tradeoff analysis
reasonable for the applicable case. Thanks.

> > That means there's an increased chance that
> > the next iclog in the ring may be active. Perhaps we could introduce
> > some logic to switch out the commit record iclog before the CIL push
> > returns in those particular cases.  For example, switch out if the
> > current checkpoint hit multiple iclogs..? Or hit multiple iclogs and the
> > next iclog in the ring is clean..?
> 
> We could just call xlog_state_switch_iclogs() to mark it WANT_SYNC,
> but then we can't aggregate more changes into it and fill it up. If
> someone starts waiting on that iclog (i.e. a log force), then it
> immediately gets marked WANT_SYNC and submitted to disk when it is
> released. But if there is no-one waiting on it, then we largely
> don't care if an asynchronous checkpoint is committed immediately,
> at the start of the next checkpoint, or at worst, within 30s when
> the log worker next kicks a log force....
> 
> It's just not necessary to ensure the commit record hits the disk
> with minimal latency...
> 
> > > > From a performance perspective, it seems like this makes the CIL push
> > > > _nearly_ synchronous by default.
> > > 
> > > Nope, not at all. There is nothing waiting on the CIL push to
> > > complete, except maybe a log force waiting for the iclogs to be
> > > synced to disk and completed.
> > > 
> > 
> > Right, that's sort of the point... we have various places that issue
> > async log forces. These async log forces push the CIL, which
> > historically only ever blocks on log I/O by virtue of waiting on iclog
> > space. IOW, there is a bit of runway between an async CIL push blocking
> > on I/O or not provided by the iclog mechanism.
> > 
> > With this patch, a CIL push (and thus an async log force) now does not
> > return until all but the commit record iclog have been submitted for I/O
> > and completed.
> 
> Same as right now. I have not changed anything.
> 
> > The log force will then submit the commit record log, but
> > then not wait on it. This is obviously not a functional problem since a
> > CIL push was never guaranteed to not block at all (iclogs are a limited
> > resource), but there's clearly a change in behavior here worth reasoning
> > about for async forces..
> 
> No, there is no change in behaviour at all. The log force in both
> cases submits the commit record.
> 

But it also blocks where it didn't before. I'm not sure why I can't seem
to get this point across. Having thought more about this, here is a more
concrete example of the behavior change wrt async log forces:

- Various operations populate the CIL, eventually triggers background
  CIL push.
- Background CIL push writes out the majority of checkpoint A, writes
  the commit record of checkpoint A into the current head iclog and
  returns without submitting it for I/O.
- More transactions commit and some task issues an async log force (i.e.
  xfs_log_force(.., 0);):
	- xlog_cil_push_work() submits the iclog with the commit record
	  for checkpoint A as it writes out checkpoint B.
	- xlog_wait_on_iclog_lsn() waits on ->ic_force_wait of the
	  commit record iclog for checkpoint A.
  ...

At this point, we've blocked async log force on completion processing of
a previous checkpoint because ->ic_force_wait isn't woken until the
iclog is cleaned, which doesn't occur until completion callbacks
attached to the iclog complete. This means we've technically just
changed the locking semantics for async log forces as callers must be
cognizant of locks used in the log I/O completion path.

Is this a problem? I don't know. Perhaps not, but we've had similar
issues in the past[1] in cases where an async log force unexpectedly
blocked on callback processing. The example cited was a different
waitqueue associated with the CIL context as well as being a shutdown
case that I think is no longer relevant due to more recent inode flush
reworks, etc., but it's just an example that raises concern.

So unless I'm missing something in the sequence above, I think this
requires more audit and analysis than we currently have for this
particular patch. I.e., we still have oddball cases like how
xfs_buf_lock() of a stale buffer invokes an async log force (while
potentially holding other locked items/buffers in a transaction) while
similarly an unpin of a staled buffer can cycle the buffer lock on the
log I/O completion side. Is that safe or a rathole of landmines (now and
for future code)? Perhaps a more robust option is to create a new
waitqueue specifically for the purpose of blocking iclog I/O so
historical async force rules/behavior is preserved. (We do have
->ic_write_wait, though that doesn't appear to provide ordering..).

Brian

[1] 545aa41f5cba ("xfs: wake commit waiters on CIL abort before log item abort")

> > > > I.e., if we have a several hundred MB
> > > > CIL context,
> > > 
> > > Not possible. A 2GB log has a hard throttle limit now of 64MB.
> > > 
> > 
> > Well, I don't think the throttle guarantees an upper bound on the
> > context size, but that's a different topic.
> 
> Sure, it's a bit lazy. But once you capture the user tasks that run
> commits on the throttle, the size stops growing.
> 
> > That was just an example of
> > a largish sized checkpoint. 64MB is still plenty enough to require
> > multiple passes through the set of iclogs. Regardless, I think the
> > before and after change is not significant given that a 64MB checkpoint
> > already requires internal blocking to reuse iclog space as the
> > checkpoint flushes anyways. So it's really just (l_iclog_bufs - 1) of
> > additional I/O we're waiting for in the worst case.
> 
> The metrics that prove this assertion wrong are in the commit
> message of the next patch. i.e. I measured how many "noiclog" events
> are occurring during CIL checkpoints. I mentioned that above already
> - reducing the IO blocking rate from hundreds of events per
> checkpoint down to a small handful is a major win....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
  2021-01-28 16:53   ` Brian Foster
@ 2021-02-02  5:52   ` Chandan Babu R
  2021-02-17 11:33   ` Paul Menzel
  2021-02-17 21:06   ` Donald Buczek
  3 siblings, 0 replies; 29+ messages in thread
From: Chandan Babu R @ 2021-02-02  5:52 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On 28 Jan 2021 at 10:11, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> A hang with tasks stuck on the CIL hard throttle was reported and
> largely diagnosed by Donald Buczek, who discovered that it was a
> result of the CIL context space usage decrementing in committed
> transactions once the hard throttle limit had been hit and processes
> were already blocked.  This resulted in the CIL push not waking up
> those waiters because the CIL context was no longer over the hard
> throttle limit.
>
> The surprising aspect of this was the CIL space usage going
> backwards regularly enough to trigger this situation. Assumptions
> had been made in design that the relogging process would only
> increase the size of the objects in the CIL, and so that space would
> only increase.
>
> This change and commit message fixes the issue and documents the
> result of an audit of the triggers that can cause the CIL space to
> go backwards, how large the backwards steps tend to be, the
> frequency in which they occur, and what the impact on the CIL
> accounting code is.
>
> Even though the CIL ctx->space_used can go backwards, it will only
> do so if the log item is already logged to the CIL and contains a
> space reservation for its entire logged state. This is tracked by
> the shadow buffer state on the log item. If the item is not
> previously logged in the CIL it has no shadow buffer nor log vector,
> and hence the entire size of the logged item copied to the log
> vector is accounted to the CIL space usage. i.e.  it will always go
> up in this case.
>
> If the item has a log vector (i.e. already in the CIL) and the size
> decreases, then the existing log vector will be overwritten and the
> space usage will go down. This is the only condition where the space
> usage reduces, and it can only occur when an item is already tracked
> in the CIL. Hence we are safe from CIL space usage underruns as a
> result of log items decreasing in size when they are relogged.
>
> Typically this reduction in CIL usage occurs from metadata blocks
> being freed, such as when a btree block merge
> occurs or a directory entry/xattr entry is removed and the da-tree
> is reduced in size. This generally results in a reduction in size of
> around a single block in the CIL, but also tends to increase the
> number of log vectors because the parent and sibling nodes in the
> tree needs to be updated when a btree block is removed. If a
> multi-level merge occurs, then we see reduction in size of 2+
> blocks, but again the log vector count goes up.
>
> The other vector is inode fork size changes, which only log the
> current size of the fork and ignore the previously logged size when
> the fork is relogged. Hence if we are removing items from the inode
> fork (dir/xattr removal in shortform, extent record removal in
> extent form, etc) the relogged size of the inode fork can decrease.
>
> No other log items can decrease in size either because they are a
> fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging
> an intent actually creates a new intent log item and doesn't relog
> the old item at all.) Hence the only two vectors for CIL context
> size reduction are relogging inode forks and marking buffers active
> in the CIL as stale.
>
> Long story short: the majority of the code does the right thing and
> handles the reduction in log item size correctly, and only the CIL
> hard throttle implementation is problematic and needs fixing. This
> patch makes that fix, as well as adds comments in the log item code
> that result in items shrinking in size when they are relogged as a
> clear reminder that this can and does happen frequently.
>
> The throttle fix is based upon the change Donald proposed, though it
> goes further to ensure that once the throttle is activated, it
> captures all tasks until the CIL push issues a wakeup, regardless of
> whether the CIL space used has gone back under the throttle
> threshold.
>
> This ensures that we prevent tasks reducing the CIL slightly under
> the throttle threshold and then making more changes that push it
> well over the throttle limit. This is achieved by checking if the
> throttle wait queue is already active as a condition of throttling.
> Hence once we start throttling, we continue to apply the throttle
> until the CIL context push wakes everything on the wait queue.
>
> We can use waitqueue_active() for the waitqueue manipulations and
> checks as they are all done under the ctx->xc_push_lock. Hence the
> waitqueue has external serialisation and we can safely peek inside
> the wait queue without holding the internal waitqueue locks.
>
> Many thanks to Donald for his diagnostic and analysis work to
> isolate the cause of this hang.
>

W.r.t correctness of the changes in this patch,

Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>

> Reported-by: Donald Buczek <buczek@molgen.mpg.de>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_buf_item.c   | 37 ++++++++++++++++++-------------------
>  fs/xfs/xfs_inode_item.c | 14 ++++++++++++++
>  fs/xfs/xfs_log_cil.c    | 22 +++++++++++++++++-----
>  3 files changed, 49 insertions(+), 24 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index dc0be2a639cc..17960b1ce5ef 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -56,14 +56,12 @@ xfs_buf_log_format_size(
>  }
>  
>  /*
> - * This returns the number of log iovecs needed to log the
> - * given buf log item.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item segment.
>   *
> - * It calculates this as 1 iovec for the buf log format structure
> - * and 1 for each stretch of non-contiguous chunks to be logged.
> - * Contiguous chunks are logged in a single iovec.
> - *
> - * If the XFS_BLI_STALE flag has been set, then log nothing.
> + * It calculates this as 1 iovec for the buf log format structure and 1 for each
> + * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> + * in a single iovec.
>   */
>  STATIC void
>  xfs_buf_item_size_segment(
> @@ -119,11 +117,8 @@ xfs_buf_item_size_segment(
>  }
>  
>  /*
> - * This returns the number of log iovecs needed to log the given buf log item.
> - *
> - * It calculates this as 1 iovec for the buf log format structure and 1 for each
> - * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> - * in a single iovec.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item.
>   *
>   * Discontiguous buffers need a format structure per region that is being
>   * logged. This makes the changes in the buffer appear to log recovery as though
> @@ -133,7 +128,11 @@ xfs_buf_item_size_segment(
>   * what ends up on disk.
>   *
>   * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
> - * format structures.
> + * format structures. If the item has previously been logged and has dirty
> + * regions, we do not relog them in stale buffers. This has the effect of
> + * reducing the size of the relogged item by the amount of dirty data tracked
> + * by the log item. This can result in the committing transaction reducing the
> + * amount of space being consumed by the CIL.
>   */
>  STATIC void
>  xfs_buf_item_size(
> @@ -147,9 +146,9 @@ xfs_buf_item_size(
>  	ASSERT(atomic_read(&bip->bli_refcount) > 0);
>  	if (bip->bli_flags & XFS_BLI_STALE) {
>  		/*
> -		 * The buffer is stale, so all we need to log
> -		 * is the buf log format structure with the
> -		 * cancel flag in it.
> +		 * The buffer is stale, so all we need to log is the buf log
> +		 * format structure with the cancel flag in it as we are never
> +		 * going to replay the changes tracked in the log item.
>  		 */
>  		trace_xfs_buf_item_size_stale(bip);
>  		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
> @@ -164,9 +163,9 @@ xfs_buf_item_size(
>  
>  	if (bip->bli_flags & XFS_BLI_ORDERED) {
>  		/*
> -		 * The buffer has been logged just to order it.
> -		 * It is not being included in the transaction
> -		 * commit, so no vectors are used at all.
> +		 * The buffer has been logged just to order it. It is not being
> +		 * included in the transaction commit, so no vectors are used at
> +		 * all.
>  		 */
>  		trace_xfs_buf_item_size_ordered(bip);
>  		*nvecs = XFS_LOG_VEC_ORDERED;
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 17e20a6d8b4e..6ff91e5bf3cd 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
>  	return container_of(lip, struct xfs_inode_log_item, ili_item);
>  }
>  
> +/*
> + * The logged size of an inode fork is always the current size of the inode
> + * fork. This means that when an inode fork is relogged, the size of the logged
> + * region is determined by the current state, not the combination of the
> + * previously logged state + the current state. This is different relogging
> + * behaviour to most other log items which will retain the size of the
> + * previously logged changes when smaller regions are relogged.
> + *
> + * Hence operations that remove data from the inode fork (e.g. shortform
> + * dir/attr remove, extent form extent removal, etc), the size of the relogged
> + * inode gets -smaller- rather than stays the same size as the previously logged
> + * size and this can result in the committing transaction reducing the amount of
> + * space being consumed by the CIL.
> + */
>  STATIC void
>  xfs_inode_item_data_fork_size(
>  	struct xfs_inode_log_item *iip,
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index c5cc1b7ad25e..daf1f3eb24a8 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -668,9 +668,14 @@ xlog_cil_push_work(
>  	ASSERT(push_seq <= ctx->sequence);
>  
>  	/*
> -	 * Wake up any background push waiters now this context is being pushed.
> +	 * As we are about to switch to a new, empty CIL context, we no longer
> +	 * need to throttle tasks on CIL space overruns. Wake any waiters that
> +	 * the hard push throttle may have caught so they can start committing
> +	 * to the new context. The ctx->xc_push_lock provides the serialisation
> +	 * necessary for safely using the lockless waitqueue_active() check in
> +	 * this context.
>  	 */
> -	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
> +	if (waitqueue_active(&cil->xc_push_wait))
>  		wake_up_all(&cil->xc_push_wait);
>  
>  	/*
> @@ -914,7 +919,7 @@ xlog_cil_push_background(
>  	ASSERT(!list_empty(&cil->xc_cil));
>  
>  	/*
> -	 * don't do a background push if we haven't used up all the
> +	 * Don't do a background push if we haven't used up all the
>  	 * space available yet.
>  	 */
>  	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
> @@ -938,9 +943,16 @@ xlog_cil_push_background(
>  
>  	/*
>  	 * If we are well over the space limit, throttle the work that is being
> -	 * done until the push work on this context has begun.
> +	 * done until the push work on this context has begun. Enforce the hard
> +	 * throttle on all transaction commits once it has been activated, even
> +	 * if the committing transactions have resulted in the space usage
> +	 * dipping back down under the hard limit.
> +	 *
> +	 * The ctx->xc_push_lock provides the serialisation necessary for safely
> +	 * using the lockless waitqueue_active() check in this context.
>  	 */
> -	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> +	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
> +	    waitqueue_active(&cil->xc_push_wait)) {
>  		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
>  		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
>  		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);


-- 
chandan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/5] xfs: reduce buffer log item shadow allocations
  2021-01-28  4:41 ` [PATCH 5/5] xfs: reduce buffer log item shadow allocations Dave Chinner
  2021-01-28 16:54   ` Brian Foster
@ 2021-02-02 12:01   ` Chandan Babu R
  1 sibling, 0 replies; 29+ messages in thread
From: Chandan Babu R @ 2021-02-02 12:01 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs

On 28 Jan 2021 at 10:11, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> When we modify btrees repeatedly, we regularly increase the size of
> the logged region by a single chunk at a time (per transaction
> commit). This results in the CIL formatting code having to
> reallocate the log vector buffer every time the buffer dirty region
> grows. Hence over a typical 4kB btree buffer, we might grow the log
> vector 4096/128 = 32x over a short period where we repeatedly add
> or remove records to/from the buffer over a series of running
> transactions. This means we are doing 32 memory allocations and frees
> over this time during a performance critical path in the journal.
>
> The amount of space tracked in the CIL for the object is calculated
> during the ->iop_format() call for the buffer log item, but the
> buffer memory allocated for it is calculated by the ->iop_size()
> call. The size callout determines the size of the buffer, the format
> call determines the space used in the buffer.
>
> Hence we can oversize the buffer space required in the size
> calculation without impacting the amount of space used and accounted
> to the CIL for the changes being logged. This allows us to reduce
> the number of allocations by rounding up the buffer size to allow
> for future growth. This can save a substantial amount of CPU time in
> this path:
>
> -   46.52%     2.02%  [kernel]                  [k] xfs_log_commit_cil
>    - 44.49% xfs_log_commit_cil
>       - 30.78% _raw_spin_lock
>          - 30.75% do_raw_spin_lock
>               30.27% __pv_queued_spin_lock_slowpath
>
> (oh, ouch!)
> ....
>       - 1.05% kmem_alloc_large
>          - 1.02% kmem_alloc
>               0.94% __kmalloc
>
> This overhead here is what this patch is aimed at. After:
>
>       - 0.76% kmem_alloc_large                                                                                                                                      ▒
>          - 0.75% kmem_alloc                                                                                                                                         ▒
>               0.70% __kmalloc                                                                                                                                       ▒

Apart from the trailing whitespace above,

The changes look good to me.

Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_buf_item.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 17960b1ce5ef..0628a65d9c55 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -142,6 +142,7 @@ xfs_buf_item_size(
>  {
>  	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
>  	int			i;
> +	int			bytes;
>  
>  	ASSERT(atomic_read(&bip->bli_refcount) > 0);
>  	if (bip->bli_flags & XFS_BLI_STALE) {
> @@ -173,7 +174,7 @@ xfs_buf_item_size(
>  	}
>  
>  	/*
> -	 * the vector count is based on the number of buffer vectors we have
> +	 * The vector count is based on the number of buffer vectors we have
>  	 * dirty bits in. This will only be greater than one when we have a
>  	 * compound buffer with more than one segment dirty. Hence for compound
>  	 * buffers we need to track which segment the dirty bits correspond to,
> @@ -181,10 +182,18 @@ xfs_buf_item_size(
>  	 * count for the extra buf log format structure that will need to be
>  	 * written.
>  	 */
> +	bytes = 0;
>  	for (i = 0; i < bip->bli_format_count; i++) {
>  		xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
> -					  nvecs, nbytes);
> +					  nvecs, &bytes);
>  	}
> +
> +	/*
> +	 * Round up the buffer size required to minimise the number of memory
> +	 * allocations that need to be done as this item grows when relogged by
> +	 * repeated modifications.
> +	 */
> +	*nbytes = round_up(bytes, 512);
>  	trace_xfs_buf_item_size(bip);
>  }
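
As a back-of-the-envelope illustration of why the round up helps (a
userspace sketch with hypothetical numbers: a 4kB buffer dirtied 128
bytes at a time per relog; nothing here is measured from the patch):

	#include <stdio.h>

	#define ROUND_UP(x, y)	(((x) + (y) - 1) / (y) * (y))

	int main(void)
	{
		unsigned int exact = 0, rounded = 0;
		unsigned int allocs_exact = 0, allocs_rounded = 0;

		for (unsigned int bytes = 128; bytes <= 4096; bytes += 128) {
			if (bytes > exact) {		/* grows on every relog */
				exact = bytes;
				allocs_exact++;
			}
			if (ROUND_UP(bytes, 512) > rounded) {	/* every 4th relog */
				rounded = ROUND_UP(bytes, 512);
				allocs_rounded++;
			}
		}
		/* prints: reallocations: exact=32 rounded=8 */
		printf("reallocations: exact=%u rounded=%u\n",
		       allocs_exact, allocs_rounded);
		return 0;
	}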


-- 
chandan

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 0/5] xfs: various log stuff...
  2021-02-01 12:39 ` [PATCH 0/5] xfs: various log stuff Christoph Hellwig
@ 2021-02-03 21:20   ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2021-02-03 21:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Mon, Feb 01, 2021 at 12:39:43PM +0000, Christoph Hellwig wrote:
> On Thu, Jan 28, 2021 at 03:41:49PM +1100, Dave Chinner wrote:
> > Hi folks,
> > 
> > Quick patch dump for y'all. A couple of minor cleanups to the
> > log behaviour, a fix for the CIL throttle hang and a couple of
> > patches to rework the cache flushing that journal IO does to reduce
> > the number of cache flushes by a couple of orders of magnitude.
> > 
> > All passes fstests with no regressions, no performance regressions
> > from fsmark, dbench and various fio workloads, some big gains even
> > on fast storage.
> 
> Can you elaborate on the big gains?

See the commit messages. dbench simulates fileserver behaviour with
extremely frequent fsync/->commit_metadata flush points and that
shows gains at high client counts when logbsize=32k. fsmark is a
highly concurrent metadata modification workload designed to push
the journal to its performance and scalability limits, etc, and
that shows 25% gains on logbsize=32k, bringing it up to the same
performance as logbsize=256k on the test machine.

> Workloads for one, but also
> what kind of storage.  For less FUA/flush to matter the device needs
> to have a write cache, which none of the really fast SSDs even has.

The gains are occurring on devices that have volatile caches. 
But that doesn't mean devices that have volatile caches are slow,
just that they can be faster with a better cache flushing strategy.

And yes, as you would expect, I don't see any change in behaviour on
data center SSDs that have no volatile caches because the block
layer elides cache flushes for them anyway.
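
For anyone wanting to check which case a particular device falls into,
the block layer exposes the cache setting it bases that decision on. A
rough sketch, assuming the usual /sys/block/<dev>/queue/write_cache
sysfs attribute is present on the running kernel:

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], buf[32] = "";
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
		return 1;
	}
	/*
	 * "write back"    => volatile cache, flush/FUA will be issued
	 * "write through" => the block layer elides them
	 */
	snprintf(path, sizeof(path),
		 "/sys/block/%s/queue/write_cache", argv[1]);
	f = fopen(path, "r");
	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("%s: %s", argv[1], buf);
	return 0;
}

The data centre SSDs above should report "write through", which matches
not seeing any behaviour change on them.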

But, really, device performance improvements aren't the
motivation for this. The real motivation is removing orders of
magnitude of flush points from the software layers below the
filesystem. Stuff like software RAID, thin provisioning and other
functionality that must obey the flush/fua IOs they receive
regardless of whether the underlying hardware needs them or not.

Avoiding flush/fua for the journal IO means that RAID5/6 can cache
partial stripe writes from the XFS journal rather than having to
flush the partial stripe update for every journal IO. dm-thin
doesn't need to commit open transactions and flush all the dirty
data over newly allocated regions on every journal IO to a device
pool (i.e. cache flushes from one thinp device in a pool cause all
other thinp devices in the pool to stall new allocations until the
flush/fua is done). And so on.

There's no question at all that reducing the number of flush/fua
triggers is a good thing to be doing, regardless of the storage or
workloads that I've done validation testing on. The fact that a
decent performance SSD (120k randr IOPS, 60k randw IOPS) shows a
25% increase in performance for a journal IO bound workload
indicates just how much default configurations can be bound by
journal cache flushes...

> So I'd only really expect gains from that on consumer grade SSDs and
> hard drives.

Sure, but those are exactly the devices we have always optimised
cache flushing for....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
  2021-01-28 16:53   ` Brian Foster
  2021-02-02  5:52   ` Chandan Babu R
@ 2021-02-17 11:33   ` Paul Menzel
  2021-02-17 21:06   ` Donald Buczek
  3 siblings, 0 replies; 29+ messages in thread
From: Paul Menzel @ 2021-02-17 11:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, Donald Buczek, it+linux-xfs

[+cc donald, it+linux-xfs@]


Dear Dave,


Thank you for coming up with a patch, which we unfortunately missed, as 
we are not subscribed to linux-xfs@vger.kernel.org.


Am 28.01.21 um 05:41 schrieb Dave Chinner:
> From: Dave Chinner <dchinner@redhat.com>
> 
> A hang with tasks stuck on the CIL hard throttle was reported and
> largely diagnosed by Donald Buczek, who discovered that it was a
> result of the CIL context space usage decrementing in committed
> transactions once the hard throttle limit had been hit and processes
> were already blocked.  This resulted in the CIL push not waking up
> those waiters because the CIL context was no longer over the hard
> throttle limit.
> 
> The surprising aspect of this was the CIL space usage going
> backwards regularly enough to trigger this situation. Assumptions
> had been made in design that the relogging process would only
> increase the size of the objects in the CIL, and so that space would
> only increase.
> 
> This change and commit message fixes the issue and documents the
> result of an audit of the triggers that can cause the CIL space to
> go backwards, how large the backwards steps tend to be, the
> frequency in which they occur, and what the impact on the CIL
> accounting code is.
> 
> Even though the CIL ctx->space_used can go backwards, it will only
> do so if the log item is already logged to the CIL and contains a
> space reservation for it's entire logged state. This is tracked by
> the shadow buffer state on the log item. If the item is not
> previously logged in the CIL it has no shadow buffer nor log vector,
> and hence the entire size of the logged item copied to the log
> vector is accounted to the CIL space usage. i.e.  it will always go
> up in this case.
> 
> If the item has a log vector (i.e. already in the CIL) and the size
> decreases, then the existing log vector will be overwritten and the
> space usage will go down. This is the only condition where the space
> usage reduces, and it can only occur when an item is already tracked
> in the CIL. Hence we are safe from CIL space usage underruns as a
> result of log items decreasing in size when they are relogged.
> 
> Typically this reduction in CIL usage occurs from metadta blocks
> being free, such as when a btree block merge
> occurs or a directory enter/xattr entry is removed and the da-tree
> is reduced in size. This generally results in a reduction in size of
> around a single block in the CIL, but also tends to increase the
> number of log vectors because the parent and sibling nodes in the
> tree needs to be updated when a btree block is removed. If a
> multi-level merge occurs, then we see reduction in size of 2+
> blocks, but again the log vector count goes up.
> 
> The other vector is inode fork size changes, which only log the
> current size of the fork and ignore the previously logged size when
> the fork is relogged. Hence if we are removing items from the inode
> fork (dir/xattr removal in shortform, extent record removal in
> extent form, etc) the relogged size of the inode for can decrease.
> 
> No other log items can decrease in size either because they are a
> fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging
> an intent actually creates a new intent log item and doesn't relog
> the old item at all.) Hence the only two vectors for CIL context
> size reduction are relogging inode forks and marking buffers active
> in the CIL as stale.
> 
> Long story short: the majority of the code does the right thing and
> handles the reduction in log item size correctly, and only the CIL
> hard throttle implementation is problematic and needs fixing. This
> patch makes that fix, as well as adds comments in the log item code
> that result in items shrinking in size when they are relogged as a
> clear reminder that this can and does happen frequently.
> 
> The throttle fix is based upon the change Donald proposed, though it
> goes further to ensure that once the throttle is activated, it
> captures all tasks until the CIL push issues a wakeup, regardless of
> whether the CIL space used has gone back under the throttle
> threshold.
> 
> This ensures that we prevent tasks reducing the CIL slightly under
> the throttle threshold and then making more changes that push it
> well over the throttle limit. This is acheived by checking if the

achieved

> throttle wait queue is already active as a condition of throttling.
> Hence once we start throttling, we continue to apply the throttle
> until the CIL context push wakes everything on the wait queue.
> 
> We can use waitqueue_active() for the waitqueue manipulations and
> checks as they are all done under the ctx->xc_push_lock. Hence the
> waitqueue has external serialisation and we can safely peek inside
> the wait queue without holding the internal waitqueue locks.
> 
> Many thanks to Donald for his diagnostic and analysis work to
> isolate the cause of this hang.
> 
> Reported-by: Donald Buczek <buczek@molgen.mpg.de>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

No idea how the XFS subsystem does it, but maybe add a Fixes: tag, so 
the stable series and distributions can pick it up more easily?

> ---
>   fs/xfs/xfs_buf_item.c   | 37 ++++++++++++++++++-------------------
>   fs/xfs/xfs_inode_item.c | 14 ++++++++++++++
>   fs/xfs/xfs_log_cil.c    | 22 +++++++++++++++++-----
>   3 files changed, 49 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index dc0be2a639cc..17960b1ce5ef 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -56,14 +56,12 @@ xfs_buf_log_format_size(
>   }
>   
>   /*
> - * This returns the number of log iovecs needed to log the
> - * given buf log item.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item segment.
>    *
> - * It calculates this as 1 iovec for the buf log format structure
> - * and 1 for each stretch of non-contiguous chunks to be logged.
> - * Contiguous chunks are logged in a single iovec.
> - *
> - * If the XFS_BLI_STALE flag has been set, then log nothing.
> + * It calculates this as 1 iovec for the buf log format structure and 1 for each
> + * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> + * in a single iovec.
>    */
>   STATIC void
>   xfs_buf_item_size_segment(
> @@ -119,11 +117,8 @@ xfs_buf_item_size_segment(
>   }
>   
>   /*
> - * This returns the number of log iovecs needed to log the given buf log item.
> - *
> - * It calculates this as 1 iovec for the buf log format structure and 1 for each
> - * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> - * in a single iovec.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item.
>    *
>    * Discontiguous buffers need a format structure per region that is being
>    * logged. This makes the changes in the buffer appear to log recovery as though
> @@ -133,7 +128,11 @@ xfs_buf_item_size_segment(
>    * what ends up on disk.
>    *
>    * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
> - * format structures.
> + * format structures. If the item has previously been logged and has dirty
> + * regions, we do not relog them in stale buffers. This has the effect of
> + * reducing the size of the relogged item by the amount of dirty data tracked
> + * by the log item. This can result in the committing transaction reducing the
> + * amount of space being consumed by the CIL.
>    */
>   STATIC void
>   xfs_buf_item_size(
> @@ -147,9 +146,9 @@ xfs_buf_item_size(
>   	ASSERT(atomic_read(&bip->bli_refcount) > 0);
>   	if (bip->bli_flags & XFS_BLI_STALE) {
>   		/*
> -		 * The buffer is stale, so all we need to log
> -		 * is the buf log format structure with the
> -		 * cancel flag in it.
> +		 * The buffer is stale, so all we need to log is the buf log
> +		 * format structure with the cancel flag in it as we are never
> +		 * going to replay the changes tracked in the log item.
>   		 */
>   		trace_xfs_buf_item_size_stale(bip);
>   		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
> @@ -164,9 +163,9 @@ xfs_buf_item_size(
>   
>   	if (bip->bli_flags & XFS_BLI_ORDERED) {
>   		/*
> -		 * The buffer has been logged just to order it.
> -		 * It is not being included in the transaction
> -		 * commit, so no vectors are used at all.
> +		 * The buffer has been logged just to order it. It is not being
> +		 * included in the transaction commit, so no vectors are used at
> +		 * all.
>   		 */
>   		trace_xfs_buf_item_size_ordered(bip);
>   		*nvecs = XFS_LOG_VEC_ORDERED;
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 17e20a6d8b4e..6ff91e5bf3cd 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
>   	return container_of(lip, struct xfs_inode_log_item, ili_item);
>   }
>   
> +/*
> + * The logged size of an inode fork is always the current size of the inode
> + * fork. This means that when an inode fork is relogged, the size of the logged
> + * region is determined by the current state, not the combination of the
> + * previously logged state + the current state. This is different relogging
> + * behaviour to most other log items which will retain the size of the
> + * previously logged changes when smaller regions are relogged.
> + *
> + * Hence operations that remove data from the inode fork (e.g. shortform
> + * dir/attr remove, extent form extent removal, etc), the size of the relogged
> + * inode gets -smaller- rather than stays the same size as the previously logged
> + * size and this can result in the committing transaction reducing the amount of
> + * space being consumed by the CIL.
> + */
>   STATIC void
>   xfs_inode_item_data_fork_size(
>   	struct xfs_inode_log_item *iip,
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index c5cc1b7ad25e..daf1f3eb24a8 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -668,9 +668,14 @@ xlog_cil_push_work(
>   	ASSERT(push_seq <= ctx->sequence);
>   
>   	/*
> -	 * Wake up any background push waiters now this context is being pushed.
> +	 * As we are about to switch to a new CILi, empty context, we no longer

I couldn’t find CILi anywhere else in the Linux tree. Do you mean CIL?

> +	 * need to throttle tasks on CIL space overruns. Wake any waiters that
> +	 * the hard push throttle may have caught so they can start committing
> +	 * to the new context. The ctx->xc_push_lock provides the serialisation
> +	 * necessary for safely using the lockless waitqueue_active() check in
> +	 * this context.
>   	 */
> -	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
> +	if (waitqueue_active(&cil->xc_push_wait))
>   		wake_up_all(&cil->xc_push_wait);
>   
>   	/*
> @@ -914,7 +919,7 @@ xlog_cil_push_background(
>   	ASSERT(!list_empty(&cil->xc_cil));
>   
>   	/*
> -	 * don't do a background push if we haven't used up all the
> +	 * Don't do a background push if we haven't used up all the
>   	 * space available yet.
>   	 */
>   	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
> @@ -938,9 +943,16 @@ xlog_cil_push_background(
>   
>   	/*
>   	 * If we are well over the space limit, throttle the work that is being
> -	 * done until the push work on this context has begun.
> +	 * done until the push work on this context has begun. Enforce the hard
> +	 * throttle on all transaction commits once it has been activated, even
> +	 * if the committing transactions have resulted in the space usage
> +	 * dipping back down under the hard limit.
> +	 *
> +	 * The ctx->xc_push_lock provides the serialisation necessary for safely
> +	 * using the lockless waitqueue_active() check in this context.
>   	 */
> -	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> +	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
> +	    waitqueue_active(&cil->xc_push_wait)) {
>   		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
>   		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
>   		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
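
For anyone puzzling over why the lockless check is safe, here is a
stripped-down sketch of the pattern (not the XFS code itself; the names
are invented, standing in for ctx->xc_push_lock and cil->xc_push_wait):

#include <linux/wait.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

static DEFINE_SPINLOCK(push_lock);		/* stands in for xc_push_lock */
static DECLARE_WAIT_QUEUE_HEAD(push_wait);	/* stands in for xc_push_wait */

/* throttled committer: always queues itself under the lock */
static void throttle_commit(void)
{
	DEFINE_WAIT(wait);

	spin_lock(&push_lock);
	prepare_to_wait(&push_wait, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&push_lock);
	schedule();
	finish_wait(&push_wait, &wait);
}

/* push work: the waitqueue_active() peek is serialised by the same lock */
static void push_wakeup(void)
{
	spin_lock(&push_lock);
	if (waitqueue_active(&push_wait))
		wake_up_all(&push_wait);
	spin_unlock(&push_lock);
}

Because both sides take push_lock, a waiter can never slip onto the
queue between the waitqueue_active() check and the wake_up_all(); that
is the external serialisation the comment refers to.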


Kind regards,

Paul

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards
  2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
                     ` (2 preceding siblings ...)
  2021-02-17 11:33   ` Paul Menzel
@ 2021-02-17 21:06   ` Donald Buczek
  3 siblings, 0 replies; 29+ messages in thread
From: Donald Buczek @ 2021-02-17 21:06 UTC (permalink / raw)
  To: Dave Chinner, linux-xfs

On 28.01.21 05:41, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> A hang with tasks stuck on the CIL hard throttle was reported and
> largely diagnosed by Donald Buczek, who discovered that it was a
> result of the CIL context space usage decrementing in committed
> transactions once the hard throttle limit had been hit and processes
> were already blocked.  This resulted in the CIL push not waking up
> those waiters because the CIL context was no longer over the hard
> throttle limit.
> 
> The surprising aspect of this was the CIL space usage going
> backwards regularly enough to trigger this situation. Assumptions
> had been made in design that the relogging process would only
> increase the size of the objects in the CIL, and so that space would
> only increase.
> 
> This change and commit message fixes the issue and documents the
> result of an audit of the triggers that can cause the CIL space to
> go backwards, how large the backwards steps tend to be, the
> frequency in which they occur, and what the impact on the CIL
> accounting code is.
> 
> Even though the CIL ctx->space_used can go backwards, it will only
> do so if the log item is already logged to the CIL and contains a
> space reservation for it's entire logged state. This is tracked by
> the shadow buffer state on the log item. If the item is not
> previously logged in the CIL it has no shadow buffer nor log vector,
> and hence the entire size of the logged item copied to the log
> vector is accounted to the CIL space usage. i.e.  it will always go
> up in this case.
> 
> If the item has a log vector (i.e. already in the CIL) and the size
> decreases, then the existing log vector will be overwritten and the
> space usage will go down. This is the only condition where the space
> usage reduces, and it can only occur when an item is already tracked
> in the CIL. Hence we are safe from CIL space usage underruns as a
> result of log items decreasing in size when they are relogged.
> 
> Typically this reduction in CIL usage occurs from metadta blocks
> being free, such as when a btree block merge
> occurs or a directory enter/xattr entry is removed and the da-tree
> is reduced in size. This generally results in a reduction in size of
> around a single block in the CIL, but also tends to increase the
> number of log vectors because the parent and sibling nodes in the
> tree needs to be updated when a btree block is removed. If a
> multi-level merge occurs, then we see reduction in size of 2+
> blocks, but again the log vector count goes up.
> 
> The other vector is inode fork size changes, which only log the
> current size of the fork and ignore the previously logged size when
> the fork is relogged. Hence if we are removing items from the inode
> fork (dir/xattr removal in shortform, extent record removal in
> extent form, etc) the relogged size of the inode for can decrease.
> 
> No other log items can decrease in size either because they are a
> fixed size (e.g. dquots) or they cannot be relogged (e.g. relogging
> an intent actually creates a new intent log item and doesn't relog
> the old item at all.) Hence the only two vectors for CIL context
> size reduction are relogging inode forks and marking buffers active
> in the CIL as stale.
> 
> Long story short: the majority of the code does the right thing and
> handles the reduction in log item size correctly, and only the CIL
> hard throttle implementation is problematic and needs fixing. This
> patch makes that fix, as well as adds comments in the log item code
> that result in items shrinking in size when they are relogged as a
> clear reminder that this can and does happen frequently.
> 
> The throttle fix is based upon the change Donald proposed, though it
> goes further to ensure that once the throttle is activated, it
> captures all tasks until the CIL push issues a wakeup, regardless of
> whether the CIL space used has gone back under the throttle
> threshold.
> 
> This ensures that we prevent tasks reducing the CIL slightly under
> the throttle threshold and then making more changes that push it
> well over the throttle limit. This is acheived by checking if the
> throttle wait queue is already active as a condition of throttling.
> Hence once we start throttling, we continue to apply the throttle
> until the CIL context push wakes everything on the wait queue.
> 
> We can use waitqueue_active() for the waitqueue manipulations and
> checks as they are all done under the ctx->xc_push_lock. Hence the
> waitqueue has external serialisation and we can safely peek inside
> the wait queue without holding the internal waitqueue locks.
> 
> Many thanks to Donald for his diagnostic and analysis work to
> isolate the cause of this hang.
> 
> Reported-by: Donald Buczek <buczek@molgen.mpg.de>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Tested-by: Donald Buczek <buczek@molgen.mpg.de>

... for many hours and repeated several times using a configuration which otherwise triggered the issue in typically 10, max 60 minutes.

Great, thanks!

   Donald

> ---
>   fs/xfs/xfs_buf_item.c   | 37 ++++++++++++++++++-------------------
>   fs/xfs/xfs_inode_item.c | 14 ++++++++++++++
>   fs/xfs/xfs_log_cil.c    | 22 +++++++++++++++++-----
>   3 files changed, 49 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index dc0be2a639cc..17960b1ce5ef 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -56,14 +56,12 @@ xfs_buf_log_format_size(
>   }
>   
>   /*
> - * This returns the number of log iovecs needed to log the
> - * given buf log item.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item segment.
>    *
> - * It calculates this as 1 iovec for the buf log format structure
> - * and 1 for each stretch of non-contiguous chunks to be logged.
> - * Contiguous chunks are logged in a single iovec.
> - *
> - * If the XFS_BLI_STALE flag has been set, then log nothing.
> + * It calculates this as 1 iovec for the buf log format structure and 1 for each
> + * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> + * in a single iovec.
>    */
>   STATIC void
>   xfs_buf_item_size_segment(
> @@ -119,11 +117,8 @@ xfs_buf_item_size_segment(
>   }
>   
>   /*
> - * This returns the number of log iovecs needed to log the given buf log item.
> - *
> - * It calculates this as 1 iovec for the buf log format structure and 1 for each
> - * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
> - * in a single iovec.
> + * Return the number of log iovecs and space needed to log the given buf log
> + * item.
>    *
>    * Discontiguous buffers need a format structure per region that is being
>    * logged. This makes the changes in the buffer appear to log recovery as though
> @@ -133,7 +128,11 @@ xfs_buf_item_size_segment(
>    * what ends up on disk.
>    *
>    * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
> - * format structures.
> + * format structures. If the item has previously been logged and has dirty
> + * regions, we do not relog them in stale buffers. This has the effect of
> + * reducing the size of the relogged item by the amount of dirty data tracked
> + * by the log item. This can result in the committing transaction reducing the
> + * amount of space being consumed by the CIL.
>    */
>   STATIC void
>   xfs_buf_item_size(
> @@ -147,9 +146,9 @@ xfs_buf_item_size(
>   	ASSERT(atomic_read(&bip->bli_refcount) > 0);
>   	if (bip->bli_flags & XFS_BLI_STALE) {
>   		/*
> -		 * The buffer is stale, so all we need to log
> -		 * is the buf log format structure with the
> -		 * cancel flag in it.
> +		 * The buffer is stale, so all we need to log is the buf log
> +		 * format structure with the cancel flag in it as we are never
> +		 * going to replay the changes tracked in the log item.
>   		 */
>   		trace_xfs_buf_item_size_stale(bip);
>   		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
> @@ -164,9 +163,9 @@ xfs_buf_item_size(
>   
>   	if (bip->bli_flags & XFS_BLI_ORDERED) {
>   		/*
> -		 * The buffer has been logged just to order it.
> -		 * It is not being included in the transaction
> -		 * commit, so no vectors are used at all.
> +		 * The buffer has been logged just to order it. It is not being
> +		 * included in the transaction commit, so no vectors are used at
> +		 * all.
>   		 */
>   		trace_xfs_buf_item_size_ordered(bip);
>   		*nvecs = XFS_LOG_VEC_ORDERED;
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 17e20a6d8b4e..6ff91e5bf3cd 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
>   	return container_of(lip, struct xfs_inode_log_item, ili_item);
>   }
>   
> +/*
> + * The logged size of an inode fork is always the current size of the inode
> + * fork. This means that when an inode fork is relogged, the size of the logged
> + * region is determined by the current state, not the combination of the
> + * previously logged state + the current state. This is different relogging
> + * behaviour to most other log items which will retain the size of the
> + * previously logged changes when smaller regions are relogged.
> + *
> + * Hence operations that remove data from the inode fork (e.g. shortform
> + * dir/attr remove, extent form extent removal, etc), the size of the relogged
> + * inode gets -smaller- rather than stays the same size as the previously logged
> + * size and this can result in the committing transaction reducing the amount of
> + * space being consumed by the CIL.
> + */
>   STATIC void
>   xfs_inode_item_data_fork_size(
>   	struct xfs_inode_log_item *iip,
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index c5cc1b7ad25e..daf1f3eb24a8 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -668,9 +668,14 @@ xlog_cil_push_work(
>   	ASSERT(push_seq <= ctx->sequence);
>   
>   	/*
> -	 * Wake up any background push waiters now this context is being pushed.
> +	 * As we are about to switch to a new CILi, empty context, we no longer
> +	 * need to throttle tasks on CIL space overruns. Wake any waiters that
> +	 * the hard push throttle may have caught so they can start committing
> +	 * to the new context. The ctx->xc_push_lock provides the serialisation
> +	 * necessary for safely using the lockless waitqueue_active() check in
> +	 * this context.
>   	 */
> -	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
> +	if (waitqueue_active(&cil->xc_push_wait))
>   		wake_up_all(&cil->xc_push_wait);
>   
>   	/*
> @@ -914,7 +919,7 @@ xlog_cil_push_background(
>   	ASSERT(!list_empty(&cil->xc_cil));
>   
>   	/*
> -	 * don't do a background push if we haven't used up all the
> +	 * Don't do a background push if we haven't used up all the
>   	 * space available yet.
>   	 */
>   	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
> @@ -938,9 +943,16 @@ xlog_cil_push_background(
>   
>   	/*
>   	 * If we are well over the space limit, throttle the work that is being
> -	 * done until the push work on this context has begun.
> +	 * done until the push work on this context has begun. Enforce the hard
> +	 * throttle on all transaction commits once it has been activated, even
> +	 * if the committing transactions have resulted in the space usage
> +	 * dipping back down under the hard limit.
> +	 *
> +	 * The ctx->xc_push_lock provides the serialisation necessary for safely
> +	 * using the lockless waitqueue_active() check in this context.
>   	 */
> -	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> +	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
> +	    waitqueue_active(&cil->xc_push_wait)) {
>   		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
>   		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
>   		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2021-02-17 21:07 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-28  4:41 [PATCH 0/5] xfs: various log stuff Dave Chinner
2021-01-28  4:41 ` [PATCH 1/5] xfs: log stripe roundoff is a property of the log Dave Chinner
2021-01-28 14:57   ` Brian Foster
2021-01-28 20:59     ` Dave Chinner
2021-01-28 21:25   ` Darrick J. Wong
2021-01-28 22:00     ` Dave Chinner
2021-01-28  4:41 ` [PATCH 2/5] xfs: separate CIL commit record IO Dave Chinner
2021-01-28 15:07   ` Brian Foster
2021-01-28 21:22     ` Dave Chinner
     [not found]       ` <20210129145851.GB2660974@bfoster>
2021-01-29 22:25         ` Dave Chinner
2021-02-01 16:07           ` Brian Foster
2021-01-30  9:13   ` Chandan Babu R
2021-02-01 12:59   ` Christoph Hellwig
2021-01-28  4:41 ` [PATCH 3/5] xfs: journal IO cache flush reductions Dave Chinner
2021-01-28 15:12   ` Brian Foster
2021-01-28 21:46     ` Dave Chinner
2021-01-28 21:26   ` Dave Chinner
2021-01-30 12:56   ` Chandan Babu R
2021-01-28  4:41 ` [PATCH 4/5] xfs: Fix CIL throttle hang when CIL space used going backwards Dave Chinner
2021-01-28 16:53   ` Brian Foster
2021-02-02  5:52   ` Chandan Babu R
2021-02-17 11:33   ` Paul Menzel
2021-02-17 21:06   ` Donald Buczek
2021-01-28  4:41 ` [PATCH 5/5] xfs: reduce buffer log item shadow allocations Dave Chinner
2021-01-28 16:54   ` Brian Foster
2021-01-28 21:58     ` Dave Chinner
2021-02-02 12:01   ` Chandan Babu R
2021-02-01 12:39 ` [PATCH 0/5] xfs: various log stuff Christoph Hellwig
2021-02-03 21:20   ` Dave Chinner
