All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks
@ 2020-03-26 18:40 Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 1/5] gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink Bob Peterson
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

This patch set addresses several problems I encountered while testing writes
to journaled data (jdata) files.

Bob Peterson (5):
  gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink
  gfs2: instrumentation wrt ail1 stuck
  gfs2: change from write to read lock for sd_log_flush_lock in journal
    replay
  gfs2: special log flush sequence to protect jdata writes
  gfs2: implement special transaction type for jdata sync writes

 fs/gfs2/aops.c       |  25 ++++++--
 fs/gfs2/glops.c      |   2 +-
 fs/gfs2/incore.h     |   1 +
 fs/gfs2/log.c        | 137 +++++++++++++++++++++++++++++++++----------
 fs/gfs2/log.h        |   6 +-
 fs/gfs2/ops_fstype.c |   1 +
 fs/gfs2/recovery.c   |   4 +-
 fs/gfs2/rgrp.c       |   2 -
 fs/gfs2/trans.c      |  39 +++++++++---
 fs/gfs2/trans.h      |   4 ++
 10 files changed, 171 insertions(+), 50 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 1/5] gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink
  2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
@ 2020-03-26 18:40 ` Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 2/5] gfs2: instrumentation wrt ail1 stuck Bob Peterson
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

In function try_rgrp_unlink, we added a temporary lock of the
sd_log_flush_lock while searching the bitmaps. This protected us from
problems in which dinodes being freed were still in a state of flux
because the rgrp was in an active transaction. It was a kludge.
Now that we've straightened out the code for inode eviction, deletes,
and all the recovery mess, we no longer need this kludge.
This patch removes it, and should improve performance.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 692dc11d0f13..a321c34e3d6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1806,10 +1806,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
 
 	while (1) {
-		down_write(&sdp->sd_log_flush_lock);
 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
 				      true);
-		up_write(&sdp->sd_log_flush_lock);
 		if (error == -ENOSPC)
 			break;
 		if (WARN_ON_ONCE(error))
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 2/5] gfs2: instrumentation wrt ail1 stuck
  2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 1/5] gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink Bob Peterson
@ 2020-03-26 18:40 ` Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 3/5] gfs2: change from write to read lock for sd_log_flush_lock in journal replay Bob Peterson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Before this patch, if the ail1 flush got stuck for some reason, there
were no clues as to why. This patch introduces a check for getting
stuck for more than a minute, and if it happens, it dumps the items
still remaining on the ail1 list.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/log.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 87f3e892be3e..2abec43ae898 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -139,6 +139,41 @@ __acquires(&sdp->sd_ail_lock)
 	return ret;
 }
 
+static void dump_ail_list(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+
+	fs_err(sdp, "Error: In gfs2_ail1_flush for a minute! t=%d\n",
+	       current->journal_info ? 1 : 0);
+
+	list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry_reverse(bd, &tr->tr_ail1_list,
+					    bd_ail_st_list) {
+			bh = bd->bd_bh;
+			fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd,
+			       (unsigned long long)bd->bd_blkno, bh);
+			if (!bh) {
+				fs_err(sdp, "\n");
+				continue;
+			}
+			fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d "
+			       "map:%d new:%d ar:%d aw:%d delay:%d "
+			       "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n",
+			       (unsigned long long)bh->b_blocknr,
+			       buffer_uptodate(bh), buffer_dirty(bh),
+			       buffer_locked(bh), buffer_req(bh),
+			       buffer_mapped(bh), buffer_new(bh),
+			       buffer_async_read(bh), buffer_async_write(bh),
+			       buffer_delay(bh), buffer_write_io_error(bh),
+			       buffer_unwritten(bh),
+			       buffer_defer_completion(bh),
+			       buffer_pinned(bh), buffer_escaped(bh));
+			fs_err(sdp, "\n");
+		}
+	}
+}
 
 /**
  * gfs2_ail1_flush - start writeback of some ail1 entries 
@@ -155,11 +190,16 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 	struct gfs2_trans *tr;
 	struct blk_plug plug;
 	int ret = 0;
+	unsigned long flush_start = jiffies;
 
 	trace_gfs2_ail_flush(sdp, wbc, 1);
 	blk_start_plug(&plug);
 	spin_lock(&sdp->sd_ail_lock);
 restart:
+	if (time_after(jiffies, flush_start + (HZ * 60))) {
+		dump_ail_list(sdp);
+		goto out;
+	}
 	list_for_each_entry_reverse(tr, head, tr_list) {
 		if (wbc->nr_to_write <= 0)
 			break;
@@ -170,6 +210,7 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 			break;
 		}
 	}
+out:
 	spin_unlock(&sdp->sd_ail_lock);
 	blk_finish_plug(&plug);
 	if (ret) {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 3/5] gfs2: change from write to read lock for sd_log_flush_lock in journal replay
  2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 1/5] gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 2/5] gfs2: instrumentation wrt ail1 stuck Bob Peterson
@ 2020-03-26 18:40 ` Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes Bob Peterson
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 5/5] gfs2: implement special transaction type for jdata sync writes Bob Peterson
  4 siblings, 0 replies; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Function gfs2_recover_func grabs the sd_log_flush_lock rw_semaphore in
write mode. This is unnecessary because we only need to prevent log flush
from using sd_log_bio bio while it does. Therefore, a read lock will be
enough. This is a small step in cleaning up log flush.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/recovery.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 21fc44b31863..f890899aa000 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -401,7 +401,7 @@ void gfs2_recover_func(struct work_struct *work)
 		/* We take the sd_log_flush_lock here primarily to prevent log
 		 * flushes and simultaneous journal replays from stomping on
 		 * each other wrt sd_log_bio. */
-		down_write(&sdp->sd_log_flush_lock);
+		down_read(&sdp->sd_log_flush_lock);
 		for (pass = 0; pass < 2; pass++) {
 			lops_before_scan(jd, &head, pass);
 			error = foreach_descriptor(jd, head.lh_tail,
@@ -412,7 +412,7 @@ void gfs2_recover_func(struct work_struct *work)
 		}
 
 		clean_journal(jd, &head);
-		up_write(&sdp->sd_log_flush_lock);
+		up_read(&sdp->sd_log_flush_lock);
 
 		gfs2_glock_dq_uninit(&thaw_gh);
 		t_rep = ktime_get();
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes
  2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
                   ` (2 preceding siblings ...)
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 3/5] gfs2: change from write to read lock for sd_log_flush_lock in journal replay Bob Peterson
@ 2020-03-26 18:40 ` Bob Peterson
  2020-03-27 21:38   ` Andreas Gruenbacher
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 5/5] gfs2: implement special transaction type for jdata sync writes Bob Peterson
  4 siblings, 1 reply; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

This patch implements Ben Marzinkski's idea of using two locks
as different layers of protection inside the gfs2_log_flush.
To quote Ben:

  You would need two locks, the regular log lock [sd_log_flush_lock],
  and the updating log lock [sd_log_updating_lock]. Calling
  gfs2_log_flush would grab and later release the updating log lock,
  just like it currently grabs and releases sd_log_flush_lock.

  Everything else that currently grabs sd_log_flush_lock, like
  gfs2_log_reserve(), would grab the regular log lock instead.
  (No changes if we define regular log lock == log_flush_lock.

  In a normal log flush, you would grab the regular log lock
  [log_flush_lock], call gfs2_log_flush, which would grab and
  later release the updating log lock, and after it returned,
  drop the regular log lock. In this case, the two locks would
  both lock basically the same code.

  For syncs [that] don't care if new data is written after the
  call to sync, you could do the same as rhel7 gfs2_meta_syncfs()
  [did in RHEL7], grabbing the regular log lock [gfs2_log_flush_lock]
  before each of the calls to gfs_log_flush(), and dropping after
  each return. Again, the two locks would basically lock the
  same code.

  For syncs where you need to be sure that no new data is coming
  in, like freezes and shutdowns, you need to grab the regular
  log lock, call gfs2_log_flush(), clean out the active items list
  like [rhel7's] gfs2_meta_syncfs() does, and then call
  gfs2_log_flush again, this time with the freeze or shutdown type.

  Only after both gfs2_log_flush() calls can you drop the regular
  log lock. This would mean that when you are writing out the
  active items in these syncing flushes, only the regular log lock
  would be held. This is the only time you will be holding one
  lock and not the other.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/incore.h     |  1 +
 fs/gfs2/log.c        | 77 ++++++++++++++++++++++++++++----------------
 fs/gfs2/ops_fstype.c |  1 +
 3 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 84a824293a78..129cf8582a0a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -834,6 +834,7 @@ struct gfs2_sbd {
 	int sd_log_idle;
 
 	struct rw_semaphore sd_log_flush_lock;
+	struct rw_semaphore sd_log_updating_lock;
 	atomic_t sd_log_in_flight;
 	struct bio *sd_log_bio;
 	wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 2abec43ae898..5371af3cd96c 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -876,6 +876,30 @@ static void ail_drain(struct gfs2_sbd *sdp)
 	spin_unlock(&sdp->sd_ail_lock);
 }
 
+static void gfs2_meta_syncfs(struct gfs2_sbd *sdp, u32 flags)
+{
+	if (!sdp->sd_log_idle) {
+		for (;;) {
+			gfs2_ail1_start(sdp);
+			gfs2_ail1_wait(sdp);
+			if (gfs2_ail1_empty(sdp, 0))
+				break;
+		}
+		if (gfs2_withdrawn(sdp))
+			return;
+		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved
+						       buffer */
+		trace_gfs2_log_blocks(sdp, -1);
+		log_write_header(sdp, flags);
+		sdp->sd_log_head = sdp->sd_log_flush_head;
+	}
+	if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+		     GFS2_LOG_HEAD_FLUSH_FREEZE))
+		gfs2_log_shutdown(sdp);
+	if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
+		atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+}
+
 /**
  * gfs2_log_flush - flush incore transaction(s)
  * @sdp: the filesystem
@@ -884,13 +908,13 @@ static void ail_drain(struct gfs2_sbd *sdp)
  *
  */
 
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
+static void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+			     u32 flags)
 {
 	struct gfs2_trans *tr = NULL;
 	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
 
-	down_write(&sdp->sd_log_flush_lock);
-
+	down_write(&sdp->sd_log_updating_lock);
 	/*
 	 * Do this check while holding the log_flush_lock to prevent new
 	 * buffers from being added to the ail via gfs2_pin()
@@ -900,7 +924,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 
 	/* Log might have been flushed while we waited for the flush lock */
 	if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
-		up_write(&sdp->sd_log_flush_lock);
+		up_write(&sdp->sd_log_updating_lock);
 		return;
 	}
 	trace_gfs2_log_flush(sdp, 1, flags);
@@ -963,28 +987,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 
-	if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
-		if (!sdp->sd_log_idle) {
-			for (;;) {
-				gfs2_ail1_start(sdp);
-				gfs2_ail1_wait(sdp);
-				if (gfs2_ail1_empty(sdp, 0))
-					break;
-			}
-			if (gfs2_withdrawn(sdp))
-				goto out;
-			atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
-			trace_gfs2_log_blocks(sdp, -1);
-			log_write_header(sdp, flags);
-			sdp->sd_log_head = sdp->sd_log_flush_head;
-		}
-		if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
-			     GFS2_LOG_HEAD_FLUSH_FREEZE))
-			gfs2_log_shutdown(sdp);
-		if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
-			atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
-	}
-
 out:
 	if (gfs2_withdrawn(sdp)) {
 		ail_drain(sdp); /* frees all transactions */
@@ -992,11 +994,32 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
 	}
 
 	trace_gfs2_log_flush(sdp, 0, flags);
-	up_write(&sdp->sd_log_flush_lock);
+	up_write(&sdp->sd_log_updating_lock);
 
 	kfree(tr);
 }
 
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
+{
+	u32 normal = 0;
+
+	down_write(&sdp->sd_log_flush_lock);
+	if (flags & GFS2_LOG_HEAD_FLUSH_NORMAL)
+		goto normal_flush;
+
+	normal = flags & ~(GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+			   GFS2_LOG_HEAD_FLUSH_FREEZE |
+			   GFS2_LOG_HEAD_FLUSH_SYNC);
+	normal |= GFS2_LOG_HEAD_FLUSH_NORMAL;
+	__gfs2_log_flush(sdp, NULL, normal);
+
+	gfs2_meta_syncfs(sdp, flags);
+
+normal_flush:
+	__gfs2_log_flush(sdp, gl, flags);
+	up_write(&sdp->sd_log_flush_lock);
+}
+
 /**
  * gfs2_merge_trans - Merge a new transaction into a cached transaction
  * @old: Original transaction to be expanded
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1aeb591bfd28..11a3a471124c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -133,6 +133,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
 	init_rwsem(&sdp->sd_log_flush_lock);
+	init_rwsem(&sdp->sd_log_updating_lock);
 	atomic_set(&sdp->sd_log_in_flight, 0);
 	atomic_set(&sdp->sd_reserving_log, 0);
 	init_waitqueue_head(&sdp->sd_reserving_log_wait);
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 5/5] gfs2: implement special transaction type for jdata sync writes
  2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
                   ` (3 preceding siblings ...)
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes Bob Peterson
@ 2020-03-26 18:40 ` Bob Peterson
  4 siblings, 0 replies; 7+ messages in thread
From: Bob Peterson @ 2020-03-26 18:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

This patch implements Ben Marzinski's idea to solve the jdata sync
write deadlock. To quote Ben:

  Once you have flushes working like this, you can make a special version
  of gfs2_trans_begin() that will allow a transaction to start as long as
  the updating log lock isn't held, instead of blocking on the regular
  log lock, like normal transactions. You would only call this special
  version of gfs2_trans_begin() when writing out jdata pages during a
  syncing log flush. So these mmaped jdata writes will be the only things
  that can ever start transactions while you are writing out the active
  items during a syncing log flush.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/aops.c  | 25 +++++++++++++++++++++----
 fs/gfs2/glops.c |  2 +-
 fs/gfs2/log.c   | 19 ++++++++++++++-----
 fs/gfs2/log.h   |  6 ++++--
 fs/gfs2/trans.c | 39 ++++++++++++++++++++++++++++++++-------
 fs/gfs2/trans.h |  4 ++++
 6 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 786c1ce8f030..cf28ff8e7a53 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -157,8 +157,15 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
 	struct inode *inode = page->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int new_trans = 0;
+	int ret;
 
 	if (PageChecked(page)) {
+		ret = gfs2_trans_begin_jdata_sync(sdp, RES_DINODE + 1, 0);
+		if (ret)
+			return ret;
+
+		new_trans = 1;
 		ClearPageChecked(page);
 		if (!page_has_buffers(page)) {
 			create_empty_buffers(page, inode->i_sb->s_blocksize,
@@ -166,7 +173,10 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
 		}
 		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize);
 	}
-	return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	ret = gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	if (new_trans)
+		gfs2_trans_end_jdata_sync(sdp);
+	return ret;
 }
 
 /**
@@ -183,18 +193,25 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
 	struct inode *inode = page->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int ret = 0;
 
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
 		goto out;
-	if (PageChecked(page) || current->journal_info)
+	if (current->journal_info)
 		goto out_ignore;
-	return __gfs2_jdata_writepage(page, wbc);
+
+	ret = __gfs2_jdata_writepage(page, wbc);
+	if (!ret)
+		return 0;
+
+	if (ret == -EAGAIN)
+		ret = 0;
 
 out_ignore:
 	redirty_page_for_writepage(wbc, page);
 out:
 	unlock_page(page);
-	return 0;
+	return ret;
 }
 
 /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 9e9c7a4b8c66..3738b074719e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -125,7 +125,7 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
          * on the stack */
 	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes);
 	tr.tr_ip = _RET_IP_;
-	ret = gfs2_log_reserve(sdp, tr.tr_reserved);
+	ret = gfs2_log_reserve(sdp, tr.tr_reserved, true);
 	if (ret < 0)
 		return ret;
 	WARN_ON_ONCE(current->journal_info);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5371af3cd96c..5cc3d62d38c1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -389,23 +389,29 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
  * gfs2_log_release - Release a given number of log blocks
  * @sdp: The GFS2 superblock
  * @blks: The number of blocks
+ * @is_normal: If true, release sd_log_flush_lock rwsem, else updating lock
  *
  */
 
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks,
+		      bool is_normal)
 {
 
 	atomic_add(blks, &sdp->sd_log_blks_free);
 	trace_gfs2_log_blocks(sdp, blks);
 	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
 				  sdp->sd_jdesc->jd_blocks);
-	up_read(&sdp->sd_log_flush_lock);
+	if (is_normal)
+		up_read(&sdp->sd_log_flush_lock);
+	else
+		up_read(&sdp->sd_log_updating_lock);
 }
 
 /**
  * gfs2_log_reserve - Make a log reservation
  * @sdp: The GFS2 superblock
  * @blks: The number of blocks to reserve
+ * @is_normal: If true, hold the sd_log_flush_lock rwsem, else updating lock
  *
  * Note that we never give out the last few blocks of the journal. Thats
  * due to the fact that there is a small number of header blocks
@@ -422,7 +428,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
  * Returns: errno
  */
 
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks, bool is_normal)
 {
 	int ret = 0;
 	unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
@@ -466,9 +472,12 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 	if (unlikely(did_wait))
 		wake_up(&sdp->sd_log_waitq);
 
-	down_read(&sdp->sd_log_flush_lock);
+	if (is_normal)
+		down_read(&sdp->sd_log_flush_lock);
+	else if (!down_read_trylock(&sdp->sd_log_updating_lock))
+		ret = -EAGAIN;
 	if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
-		gfs2_log_release(sdp, blks);
+		gfs2_log_release(sdp, blks, is_normal);
 		ret = -EROFS;
 	}
 	if (atomic_dec_and_test(&sdp->sd_reserving_log))
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c1cd6ae17659..188ab09ecc7d 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -64,8 +64,10 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
 extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
 extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
 
-extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks,
+			     bool is_normal);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks,
+			    bool is_normal);
 extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
 				  u64 seq, u32 tail, u32 lblock, u32 flags,
 				  int op_flags);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ffe840505082..08b3a93d6359 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -25,8 +25,8 @@
 #include "util.h"
 #include "trace_gfs2.h"
 
-int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
-		     unsigned int revokes)
+static int __gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+			      unsigned int revokes, bool is_normal)
 {
 	struct gfs2_trans *tr;
 	int error;
@@ -55,7 +55,8 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 
 	sb_start_intwrite(sdp->sd_vfs);
 
-	error = gfs2_log_reserve(sdp, tr->tr_reserved);
+	/* We only want to take the sd_log_flush_lock for normal flushes */
+	error = gfs2_log_reserve(sdp, tr->tr_reserved, is_normal);
 	if (error)
 		goto fail;
 
@@ -70,6 +71,18 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	return error;
 }
 
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+		     unsigned int revokes)
+{
+	return __gfs2_trans_begin(sdp, blocks, revokes, true);
+}
+
+int gfs2_trans_begin_jdata_sync(struct gfs2_sbd *sdp, unsigned int blocks,
+				unsigned int revokes)
+{
+	return __gfs2_trans_begin(sdp, blocks, revokes, false);
+}
+
 static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
 {
 	fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip);
@@ -82,7 +95,7 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
 		tr->tr_num_revoke, tr->tr_num_revoke_rm);
 }
 
-void gfs2_trans_end(struct gfs2_sbd *sdp)
+void __gfs2_trans_end(struct gfs2_sbd *sdp, bool is_normal)
 {
 	struct gfs2_trans *tr = current->journal_info;
 	s64 nbuf;
@@ -91,7 +104,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	current->journal_info = NULL;
 
 	if (!test_bit(TR_TOUCHED, &tr->tr_flags)) {
-		gfs2_log_release(sdp, tr->tr_reserved);
+		gfs2_log_release(sdp, tr->tr_reserved, is_normal);
 		if (alloced) {
 			kfree(tr);
 			sb_end_intwrite(sdp->sd_vfs);
@@ -110,8 +123,10 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	gfs2_log_commit(sdp, tr);
 	if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags))
 		kfree(tr);
-	up_read(&sdp->sd_log_flush_lock);
-
+	if (is_normal)
+		up_read(&sdp->sd_log_flush_lock);
+	else
+		up_read(&sdp->sd_log_updating_lock);
 	if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
 			       GFS2_LFC_TRANS_END);
@@ -119,6 +134,16 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 		sb_end_intwrite(sdp->sd_vfs);
 }
 
+void gfs2_trans_end(struct gfs2_sbd *sdp)
+{
+	__gfs2_trans_end(sdp, true);
+}
+
+void gfs2_trans_end_jdata_sync(struct gfs2_sbd *sdp)
+{
+	__gfs2_trans_end(sdp, false);
+}
+
 static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
 					       struct buffer_head *bh)
 {
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 6071334de035..e384f9dddbc7 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -36,8 +36,12 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned
 
 extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 			    unsigned int revokes);
+extern int gfs2_trans_begin_jdata_sync(struct gfs2_sbd *sdp,
+				       unsigned int blocks,
+				       unsigned int revokes);
 
 extern void gfs2_trans_end(struct gfs2_sbd *sdp);
+extern void gfs2_trans_end_jdata_sync(struct gfs2_sbd *sdp);
 extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
 extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
 extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes
  2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes Bob Peterson
@ 2020-03-27 21:38   ` Andreas Gruenbacher
  0 siblings, 0 replies; 7+ messages in thread
From: Andreas Gruenbacher @ 2020-03-27 21:38 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi Bob,

On Thu, Mar 26, 2020 at 7:40 PM Bob Peterson <rpeterso@redhat.com> wrote:
> This patch implements Ben Marzinkski's idea of using two locks
> as different layers of protection inside the gfs2_log_flush.
> To quote Ben:

this is all looking rather promising, but could you please describe
what problem this and the next patch are supposed to fix, and why this
approach achieves that?

>   You would need two locks, the regular log lock [sd_log_flush_lock],
>   and the updating log lock [sd_log_updating_lock]. Calling
>   gfs2_log_flush would grab and later release the updating log lock,
>   just like it currently grabs and releases sd_log_flush_lock.
>
>   Everything else that currently grabs sd_log_flush_lock, like
>   gfs2_log_reserve(), would grab the regular log lock instead.
>   (No changes if we define regular log lock == log_flush_lock.
>
>   In a normal log flush, you would grab the regular log lock
>   [log_flush_lock], call gfs2_log_flush, which would grab and
>   later release the updating log lock, and after it returned,
>   drop the regular log lock. In this case, the two locks would
>   both lock basically the same code.
>
>   For syncs [that] don't care if new data is written after the
>   call to sync, you could do the same as rhel7 gfs2_meta_syncfs()
>   [did in RHEL7], grabbing the regular log lock [gfs2_log_flush_lock]
>   before each of the calls to gfs_log_flush(), and dropping after
>   each return. Again, the two locks would basically lock the
>   same code.
>
>   For syncs where you need to be sure that no new data is coming
>   in, like freezes and shutdowns, you need to grab the regular
>   log lock, call gfs2_log_flush(), clean out the active items list
>   like [rhel7's] gfs2_meta_syncfs() does, and then call
>   gfs2_log_flush again, this time with the freeze or shutdown type.
>
>   Only after both gfs2_log_flush() calls can you drop the regular
>   log lock. This would mean that when you are writing out the
>   active items in these syncing flushes, only the regular log lock
>   would be held. This is the only time you will be holding one
>   lock and not the other.
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
>  fs/gfs2/incore.h     |  1 +
>  fs/gfs2/log.c        | 77 ++++++++++++++++++++++++++++----------------
>  fs/gfs2/ops_fstype.c |  1 +
>  3 files changed, 52 insertions(+), 27 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 84a824293a78..129cf8582a0a 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -834,6 +834,7 @@ struct gfs2_sbd {
>         int sd_log_idle;
>
>         struct rw_semaphore sd_log_flush_lock;
> +       struct rw_semaphore sd_log_updating_lock;
>         atomic_t sd_log_in_flight;
>         struct bio *sd_log_bio;
>         wait_queue_head_t sd_log_flush_wait;
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index 2abec43ae898..5371af3cd96c 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -876,6 +876,30 @@ static void ail_drain(struct gfs2_sbd *sdp)
>         spin_unlock(&sdp->sd_ail_lock);
>  }
>
> +static void gfs2_meta_syncfs(struct gfs2_sbd *sdp, u32 flags)
> +{
> +       if (!sdp->sd_log_idle) {
> +               for (;;) {
> +                       gfs2_ail1_start(sdp);
> +                       gfs2_ail1_wait(sdp);
> +                       if (gfs2_ail1_empty(sdp, 0))
> +                               break;
> +               }
> +               if (gfs2_withdrawn(sdp))
> +                       return;
> +               atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved
> +                                                      buffer */
> +               trace_gfs2_log_blocks(sdp, -1);
> +               log_write_header(sdp, flags);
> +               sdp->sd_log_head = sdp->sd_log_flush_head;
> +       }
> +       if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
> +                    GFS2_LOG_HEAD_FLUSH_FREEZE))
> +               gfs2_log_shutdown(sdp);
> +       if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
> +               atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
> +}
> +
>  /**
>   * gfs2_log_flush - flush incore transaction(s)
>   * @sdp: the filesystem
> @@ -884,13 +908,13 @@ static void ail_drain(struct gfs2_sbd *sdp)
>   *
>   */
>
> -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
> +static void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
> +                            u32 flags)
>  {
>         struct gfs2_trans *tr = NULL;
>         enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
>
> -       down_write(&sdp->sd_log_flush_lock);
> -
> +       down_write(&sdp->sd_log_updating_lock);
>         /*
>          * Do this check while holding the log_flush_lock to prevent new
>          * buffers from being added to the ail via gfs2_pin()
> @@ -900,7 +924,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
>
>         /* Log might have been flushed while we waited for the flush lock */
>         if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
> -               up_write(&sdp->sd_log_flush_lock);
> +               up_write(&sdp->sd_log_updating_lock);
>                 return;
>         }
>         trace_gfs2_log_flush(sdp, 1, flags);
> @@ -963,28 +987,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
>         spin_unlock(&sdp->sd_ail_lock);
>         gfs2_log_unlock(sdp);
>
> -       if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> -               if (!sdp->sd_log_idle) {
> -                       for (;;) {
> -                               gfs2_ail1_start(sdp);
> -                               gfs2_ail1_wait(sdp);
> -                               if (gfs2_ail1_empty(sdp, 0))
> -                                       break;
> -                       }
> -                       if (gfs2_withdrawn(sdp))
> -                               goto out;
> -                       atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
> -                       trace_gfs2_log_blocks(sdp, -1);
> -                       log_write_header(sdp, flags);
> -                       sdp->sd_log_head = sdp->sd_log_flush_head;
> -               }
> -               if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
> -                            GFS2_LOG_HEAD_FLUSH_FREEZE))
> -                       gfs2_log_shutdown(sdp);
> -               if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
> -                       atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
> -       }
> -
>  out:
>         if (gfs2_withdrawn(sdp)) {
>                 ail_drain(sdp); /* frees all transactions */
> @@ -992,11 +994,32 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
>         }
>
>         trace_gfs2_log_flush(sdp, 0, flags);
> -       up_write(&sdp->sd_log_flush_lock);
> +       up_write(&sdp->sd_log_updating_lock);
>
>         kfree(tr);
>  }
>
> +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
> +{
> +       u32 normal = 0;
> +
> +       down_write(&sdp->sd_log_flush_lock);
> +       if (flags & GFS2_LOG_HEAD_FLUSH_NORMAL)
> +               goto normal_flush;
> +
> +       normal = flags & ~(GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
> +                          GFS2_LOG_HEAD_FLUSH_FREEZE |
> +                          GFS2_LOG_HEAD_FLUSH_SYNC);
> +       normal |= GFS2_LOG_HEAD_FLUSH_NORMAL;
> +       __gfs2_log_flush(sdp, NULL, normal);
> +
> +       gfs2_meta_syncfs(sdp, flags);
> +
> +normal_flush:
> +       __gfs2_log_flush(sdp, gl, flags);
> +       up_write(&sdp->sd_log_flush_lock);
> +}
> +
>  /**
>   * gfs2_merge_trans - Merge a new transaction into a cached transaction
>   * @old: Original transaction to be expanded
> diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
> index 1aeb591bfd28..11a3a471124c 100644
> --- a/fs/gfs2/ops_fstype.c
> +++ b/fs/gfs2/ops_fstype.c
> @@ -133,6 +133,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
>         INIT_LIST_HEAD(&sdp->sd_ail2_list);
>
>         init_rwsem(&sdp->sd_log_flush_lock);
> +       init_rwsem(&sdp->sd_log_updating_lock);
>         atomic_set(&sdp->sd_log_in_flight, 0);
>         atomic_set(&sdp->sd_reserving_log, 0);
>         init_waitqueue_head(&sdp->sd_reserving_log_wait);
> --
> 2.25.1
>

Thanks,
Andreas




^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-03-27 21:38 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-26 18:40 [Cluster-devel] [GFS2 PATCH 0/5] gfs2; jdata io deadlocks Bob Peterson
2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 1/5] gfs2: don't lock sd_log_flush_lock in try_rgrp_unlink Bob Peterson
2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 2/5] gfs2: instrumentation wrt ail1 stuck Bob Peterson
2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 3/5] gfs2: change from write to read lock for sd_log_flush_lock in journal replay Bob Peterson
2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 4/5] gfs2: special log flush sequence to protect jdata writes Bob Peterson
2020-03-27 21:38   ` Andreas Gruenbacher
2020-03-26 18:40 ` [Cluster-devel] [GFS2 PATCH 5/5] gfs2: implement special transaction type for jdata sync writes Bob Peterson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.