All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems
@ 2021-09-29 13:21 Bob Peterson
  2021-09-29 13:27 ` Bob Peterson
  2021-09-29 15:35 ` Andreas Gruenbacher
  0 siblings, 2 replies; 4+ messages in thread
From: Bob Peterson @ 2021-09-29 13:21 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Before this patch, when a glock was locked, the very first holder on the
queue would unlock the lockref and call the go_lock glops function (if
one existed), unless GL_SKIP was specified. When we introduced the new
node-scope concept, we allowed multiple holders to lock glocks in EX mode
and share the lock.

But node-scope introduced a new problem: if the first holder has GL_SKIP
and the next one does NOT, since it is not the first holder on the queue,
the go_lock op was not called. Eventually the GL_SKIP holder may call the
go_lock sub-function (e.g. gfs2_rgrp_bh_get) but there was still a
window of time in which another non-GL_SKIP holder assumes the go_lock
function had been called by the first holder. In the case of rgrp glocks,
this led to a NULL pointer dereference on the buffer_heads.

This patch tries to fix the problem by introducing two new glock flags:

GLF_GO_LOCK_NEEDED, which keeps track of when the go_lock function needs
to be called to "fill in" or "read" the object before it is referenced.

GLF_GO_LOCK_IN_PROG which is used to determine when a process is
in the process of reading in the object. Whenever a function needs to
reference the object, it checks the GLF_GO_LOCK_NEEDED flag, and if
set, it sets GLF_GO_LOCK_IN_PROG and calls the glops "go_lock" function.

As before, the gl_lockref spin_lock is unlocked during the IO operation,
which may take a relatively long amount of time to complete. While
unlocked, if another process determines go_lock is still needed, it sees
the GLF_GO_LOCK_IN_PROG flag is set, and waits for the go_lock glop
operation to be completed. Once GLF_GO_LOCK_IN_PROG is cleared, it needs
to check GLF_GO_LOCK_NEEDED again because the other process's go_lock
operation may not have been successful.

To facilitate this change, the go_lock section of function do_promote
was extracted to its own new function, gfs2_go_lock. The reason we do
this is because GL_SKIP callers often read in the object later.
Before this patch, those GL_SKIP callers (like gfs2_inode_lookup and
update_rgrp_lvb) called directly into the object-read functions
(gfs2_inode_refresh and gfs2_rgrp_bh_get respectively), but that never
cleared the new GLF_GO_LOCK_NEEDED flag. This patch changes those
functions so they call into the new gfs2_go_lock directly, which takes
care of all that.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c  | 136 +++++++++++++++++++++++++++++++++--------------
 fs/gfs2/glock.h  |   1 +
 fs/gfs2/glops.c  |  21 ++++----
 fs/gfs2/incore.h |   3 +-
 fs/gfs2/inode.c  |   4 +-
 fs/gfs2/rgrp.c   |  12 ++---
 fs/gfs2/super.c  |   6 ++-
 7 files changed, 121 insertions(+), 62 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4fcf340603e7..750ea07b4173 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -380,6 +380,66 @@ static void do_error(struct gfs2_glock *gl, const int ret)
 	}
 }
 
+/**
+ * gfs2_go_lock - Call the go_lock glops function
+ * @gl: The glock
+ *
+ * Note: the GLF bits used throughout are protected by the lockref spinlock.
+ *
+ * Returns: 0 if go_lock was successful, 2 if type specific operation is
+ * underway, -EAGAIN if secondary go_lock was waiting for another caller of
+ * go_lock, or error.
+ */
+int gfs2_go_lock(struct gfs2_holder *gh)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	int ret;
+
+	/*
+	 * Since we unlock the lockref lock, we set a flag to indicate
+	 * go_lock is in progress.
+	 */
+	if (test_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags)) {
+		wait_on_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG,
+			    TASK_UNINTERRUPTIBLE);
+		/*
+		 * Here we just waited for a first go_lock to finish.
+		 * But that go_lock may not have been successful, as when
+		 * a process locks an inode glock _before_ it has an actual
+		 * inode to populate. So we check again. This secondary waiter
+		 * _might_ have an inode to populate.
+		 */
+		return -EAGAIN;
+	}
+
+	set_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
+
+	/* FIXME: eliminate this eventually */
+	ret = glops->go_lock(gh);
+
+	switch(ret) {
+	case 0:
+		clear_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
+		break;
+	case 1:
+		ret = 2;
+		break;
+	default:
+		gh->gh_error = ret;
+		list_del_init(&gh->gh_list);
+		trace_gfs2_glock_queue(gh, 0);
+		gfs2_holder_wake(gh);
+		break;
+	}
+	clear_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG);
+	return ret;
+}
+
 /**
  * do_promote - promote as many requests as possible on the current queue
  * @gl: The glock
@@ -389,54 +449,50 @@ static void do_error(struct gfs2_glock *gl, const int ret)
  */
 
 static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
 {
-	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh, *tmp;
 	int first;
+	bool lock_released;
 	int ret;
 
 restart:
 	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+		lock_released = false;
 		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
 			continue;
-		if (may_grant(gl, gh)) {
-			first = gfs2_first_holder(gh);
-			if (first && glops->go_lock) {
-				if (!(gh->gh_flags & GL_SKIP)) {
-					spin_unlock(&gl->gl_lockref.lock);
-					/* FIXME: eliminate this eventually */
-					ret = glops->go_lock(gh);
-					spin_lock(&gl->gl_lockref.lock);
-					if (ret) {
-						if (ret == 1)
-							return 2;
-						gh->gh_error = ret;
-						list_del_init(&gh->gh_list);
-						trace_gfs2_glock_queue(gh, 0);
-						gfs2_holder_wake(gh);
-						goto restart;
-					}
-				}
-			}
-			set_bit(HIF_HOLDER, &gh->gh_iflags);
-			trace_gfs2_promote(gh, first);
-			gfs2_holder_wake(gh);
-			/*
-			 * If this was the first holder, we may have released
-			 * the gl_lockref.lock, so the holders list may have
-			 * changed. For that reason, we start again at the
-			 * start of the holders queue.
-			 */
-			if (first)
-				goto restart;
-			continue;
+		if (!may_grant(gl, gh)) {
+			if (gh->gh_list.prev == &gl->gl_holders)
+				return 1;
+			do_error(gl, 0);
+			break;
 		}
-		if (gh->gh_list.prev == &gl->gl_holders)
-			return 1;
-		do_error(gl, 0);
-		break;
+
+		first = gfs2_first_holder(gh);
+check_go_lock:
+		if (test_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags) &&
+		    !(gh->gh_flags & GL_SKIP)) {
+			spin_unlock(&gl->gl_lockref.lock);
+			ret = gfs2_go_lock(gh);
+			spin_lock(&gl->gl_lockref.lock);
+			if (ret) {
+				if (ret == -EAGAIN)
+					goto check_go_lock;
+				else
+					goto restart;
+			} else
+				lock_released = true;
+		}
+
+		set_bit(HIF_HOLDER, &gh->gh_iflags);
+		trace_gfs2_promote(gh, first);
+		gfs2_holder_wake(gh);
+		/*
+		 * If we released the gl_lockref.lock the holders list may have
+		 * changed. For that reason, we start again at the start of
+		 * the holders queue.
+		 */
+		if (lock_released)
+			goto restart;
 	}
 	return 0;
 }
@@ -1064,7 +1120,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 
 	atomic_inc(&sdp->sd_glock_disposal);
 	gl->gl_node.next = NULL;
-	gl->gl_flags = 0;
+	gl->gl_flags = glops->go_lock ? BIT(GLF_GO_LOCK_NEEDED) : 0;
 	gl->gl_name = name;
 	lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
 	gl->gl_lockref.count = 1;
@@ -2153,6 +2209,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
 		*p++ = 'P';
 	if (test_bit(GLF_FREEING, gflags))
 		*p++ = 'x';
+	if (test_bit(GLF_GO_LOCK_NEEDED, gflags))
+		*p++ = 'g';
 	*p = 0;
 	return buf;
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 8a09379dbf66..2e348e250fd2 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -195,6 +195,7 @@ extern void gfs2_holder_reinit(unsigned int state, u16 flags,
 extern void gfs2_holder_uninit(struct gfs2_holder *gh);
 extern int gfs2_glock_nq(struct gfs2_holder *gh);
 extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_go_lock(struct gfs2_holder *gh);
 extern int gfs2_glock_wait(struct gfs2_holder *gh);
 extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq(struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4b19f513570f..4823ba63c3bf 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 		struct address_space *mapping = gfs2_glock2aspace(gl);
 		truncate_inode_pages(mapping, 0);
 		if (ip) {
-			set_bit(GIF_INVALID, &ip->i_flags);
+			set_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
 			forget_all_cached_acls(&ip->i_inode);
 			security_inode_invalidate_secctx(&ip->i_inode);
 			gfs2_dir_hash_inval(ip);
@@ -476,8 +476,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 
 	error = gfs2_dinode_in(ip, dibh->b_data);
 	brelse(dibh);
-	clear_bit(GIF_INVALID, &ip->i_flags);
-
 	return error;
 }
 
@@ -493,16 +491,14 @@ static int inode_go_lock(struct gfs2_holder *gh)
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct gfs2_inode *ip = gl->gl_object;
-	int error = 0;
+	int error = 1;
 
-	if (!ip)
-		return 0;
+	if (!ip) /* no inode to populate - read it in later */
+		goto out;
 
-	if (test_bit(GIF_INVALID, &ip->i_flags)) {
-		error = gfs2_inode_refresh(ip);
-		if (error)
-			return error;
-	}
+	error = gfs2_inode_refresh(ip);
+	if (error)
+		goto out;
 
 	if (gh->gh_state != LM_ST_DEFERRED)
 		inode_dio_wait(&ip->i_inode);
@@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
 			list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
 		spin_unlock(&sdp->sd_trunc_lock);
 		wake_up(&sdp->sd_quota_wait);
-		return 1;
+		error = 1;
 	}
 
+out:
 	return error;
 }
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0fe49770166e..c98b37545f2c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
 
 enum {
 	GLF_LOCK			= 1,
+	GLF_GO_LOCK_NEEDED		= 2, /* needs go_lock */
 	GLF_DEMOTE			= 3,
 	GLF_PENDING_DEMOTE		= 4,
 	GLF_DEMOTE_IN_PROGRESS		= 5,
@@ -324,6 +325,7 @@ enum {
 	GLF_REPLY_PENDING		= 9,
 	GLF_INITIAL			= 10,
 	GLF_FROZEN			= 11,
+	GLF_GO_LOCK_IN_PROG		= 12, /* go_lock happening now */
 	GLF_LRU				= 13,
 	GLF_OBJECT			= 14, /* Used only for tracing */
 	GLF_BLOCKING			= 15,
@@ -370,7 +372,6 @@ struct gfs2_glock {
 };
 
 enum {
-	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
 	GIF_ALLOC_FAILED	= 2,
 	GIF_SW_PAGED		= 3,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e15434b23ac..4e583e9a380c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 		}
 
 		glock_set_object(ip->i_gl, ip);
-		set_bit(GIF_INVALID, &ip->i_flags);
+		set_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags);
 		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
 		if (unlikely(error))
 			goto fail;
@@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 
 		if (type == DT_UNKNOWN) {
 			/* Inode glock must be locked already */
-			error = gfs2_inode_refresh(GFS2_I(inode));
+			error = gfs2_go_lock(&i_gh);
 			if (error)
 				goto fail;
 		} else {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 96b2fbed6bf1..796f9e015218 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1261,7 +1261,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	return error;
 }
 
-static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
 {
 	u32 rl_flags;
 
@@ -1269,7 +1269,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
 		return 0;
 
 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
-		return gfs2_rgrp_bh_get(rgd);
+		gfs2_go_lock(gh);
 
 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
 	rl_flags &= ~GFS2_RDF_MASK;
@@ -1312,6 +1312,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
 			bi->bi_bh = NULL;
 		}
 	}
+	set_bit(GLF_GO_LOCK_NEEDED, &rgd->rd_gl->gl_flags);
 }
 
 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -2110,7 +2111,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 			    gfs2_rgrp_congested(rs->rs_rgd, loops))
 				goto skip_rgrp;
 			if (sdp->sd_args.ar_rgrplvb) {
-				error = update_rgrp_lvb(rs->rs_rgd);
+				error = update_rgrp_lvb(rs->rs_rgd,
+							&ip->i_rgd_gh);
 				if (unlikely(error)) {
 					rgrp_unlock_local(rs->rs_rgd);
 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
@@ -2126,7 +2128,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 			goto skip_rgrp;
 
 		if (sdp->sd_args.ar_rgrplvb)
-			gfs2_rgrp_bh_get(rs->rs_rgd);
+			gfs2_go_lock(&ip->i_rgd_gh);
 
 		/* Get a reservation if we don't already have one */
 		if (!gfs2_rs_active(rs))
@@ -2762,8 +2764,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
 
 void rgrp_lock_local(struct gfs2_rgrpd *rgd)
 {
-	GLOCK_BUG_ON(rgd->rd_gl, !gfs2_glock_is_held_excl(rgd->rd_gl) &&
-		     !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
 	mutex_lock(&rgd->rd_mutex);
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6e00d15ef0a8..20698232774a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1244,8 +1244,10 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
 	if (ret)
 		return SHOULD_NOT_DELETE_DINODE;
 
-	if (test_bit(GIF_INVALID, &ip->i_flags)) {
-		ret = gfs2_inode_refresh(ip);
+	if (test_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags)) {
+		do {
+			ret = gfs2_go_lock(gh);
+		} while (ret == -EAGAIN);
 		if (ret)
 			return SHOULD_NOT_DELETE_DINODE;
 	}
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems
  2021-09-29 13:21 [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems Bob Peterson
@ 2021-09-29 13:27 ` Bob Peterson
  2021-09-29 15:35 ` Andreas Gruenbacher
  1 sibling, 0 replies; 4+ messages in thread
From: Bob Peterson @ 2021-09-29 13:27 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On 9/29/21 8:21 AM, Bob Peterson wrote:
> Before this patch, when a glock was locked, the very first holder on the
> queue would unlock the lockref and call the go_lock glops function (if
> one existed), unless GL_SKIP was specified. When we introduced the new
> node-scope concept, we allowed multiple holders to lock glocks in EX mode
> and share the lock.
> 
> But node-scope introduced a new problem: if the first holder has GL_SKIP
> and the next one does NOT, since it is not the first holder on the queue,
> the go_lock op was not called. Eventually the GL_SKIP holder may call the
> go_lock sub-function (e.g. gfs2_rgrp_bh_get) but there was still a
> window of time in which another non-GL_SKIP holder assumes the go_lock
> function had been called by the first holder. In the case of rgrp glocks,
> this led to a NULL pointer dereference on the buffer_heads.
> 
> This patch tries to fix the problem by introducing two new glock flags:
> 
> GLF_GO_LOCK_NEEDED, which keeps track of when the go_lock function needs
> to be called to "fill in" or "read" the object before it is referenced.
> 
> GLF_GO_LOCK_IN_PROG which is used to determine when a process is
> in the process of reading in the object. Whenever a function needs to
> reference the object, it checks the GLF_GO_LOCK_NEEDED flag, and if
> set, it sets GLF_GO_LOCK_IN_PROG and calls the glops "go_lock" function.
> 
> As before, the gl_lockref spin_lock is unlocked during the IO operation,
> which may take a relatively long amount of time to complete. While
> unlocked, if another process determines go_lock is still needed, it sees
> the GLF_GO_LOCK_IN_PROG flag is set, and waits for the go_lock glop
> operation to be completed. Once GLF_GO_LOCK_IN_PROG is cleared, it needs
> to check GLF_GO_LOCK_NEEDED again because the other process's go_lock
> operation may not have been successful.
> 
> To facilitate this change, the go_lock section of function do_promote
> was extracted to its own new function, gfs2_go_lock. The reason we do
> this is because GL_SKIP callers often read in the object later.
> Before this patch, those GL_SKIP callers (like gfs2_inode_lookup and
> update_rgrp_lvb) called directly into the object-read functions
> (gfs2_inode_refresh and gfs2_rgrp_bh_get respectively), but that never
> cleared the new GLF_GO_LOCK_NEEDED flag. This patch changes those
> functions so they call into the new gfs2_go_lock directly, which takes
> care of all that.
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
>   fs/gfs2/glock.c  | 136 +++++++++++++++++++++++++++++++++--------------
>   fs/gfs2/glock.h  |   1 +
>   fs/gfs2/glops.c  |  21 ++++----
>   fs/gfs2/incore.h |   3 +-
>   fs/gfs2/inode.c  |   4 +-
>   fs/gfs2/rgrp.c   |  12 ++---
>   fs/gfs2/super.c  |   6 ++-
>   7 files changed, 121 insertions(+), 62 deletions(-)
(snip)
> @@ -2153,6 +2209,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
>   		*p++ = 'P';
>   	if (test_bit(GLF_FREEING, gflags))
>   		*p++ = 'x';
> +	if (test_bit(GLF_GO_LOCK_NEEDED, gflags))
> +		*p++ = 'g';
>   	*p = 0;
>   	return buf;
>   }

Hi,

As soon as I sent this patch out I realized I forgot to add the second
new GLF bit, GLF_GO_LOCK_IN_PROG, to gflags2str. So the final version
will include it.

Also, this version passed 500 iterations of the failing test case and
a full run of xfstests.

Regards,

Bob Peterson



^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems
  2021-09-29 13:21 [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems Bob Peterson
  2021-09-29 13:27 ` Bob Peterson
@ 2021-09-29 15:35 ` Andreas Gruenbacher
  2021-09-29 16:42   ` Andreas Gruenbacher
  1 sibling, 1 reply; 4+ messages in thread
From: Andreas Gruenbacher @ 2021-09-29 15:35 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi Bob,

On Wed, Sep 29, 2021 at 3:21 PM Bob Peterson <rpeterso@redhat.com> wrote:
> Before this patch, when a glock was locked, the very first holder on the
> queue would unlock the lockref and call the go_lock glops function (if
> one existed), unless GL_SKIP was specified. When we introduced the new
> node-scope concept, we allowed multiple holders to lock glocks in EX mode
> and share the lock.
>
> But node-scope introduced a new problem: if the first holder has GL_SKIP
> and the next one does NOT, since it is not the first holder on the queue,
> the go_lock op was not called. Eventually the GL_SKIP holder may call the
> go_lock sub-function (e.g. gfs2_rgrp_bh_get) but there was still a
> window of time in which another non-GL_SKIP holder assumes the go_lock
> function had been called by the first holder. In the case of rgrp glocks,
> this led to a NULL pointer dereference on the buffer_heads.
>
> This patch tries to fix the problem by introducing two new glock flags:
>
> GLF_GO_LOCK_NEEDED, which keeps track of when the go_lock function needs
> to be called to "fill in" or "read" the object before it is referenced.
>
> GLF_GO_LOCK_IN_PROG which is used to determine when a process is
> in the process of reading in the object. Whenever a function needs to
> reference the object, it checks the GLF_GO_LOCK_NEEDED flag, and if
> set, it sets GLF_GO_LOCK_IN_PROG and calls the glops "go_lock" function.
>
> As before, the gl_lockref spin_lock is unlocked during the IO operation,
> which may take a relatively long amount of time to complete. While
> unlocked, if another process determines go_lock is still needed, it sees
> the GLF_GO_LOCK_IN_PROG flag is set, and waits for the go_lock glop
> operation to be completed. Once GLF_GO_LOCK_IN_PROG is cleared, it needs
> to check GLF_GO_LOCK_NEEDED again because the other process's go_lock
> operation may not have been successful.
>
> To facilitate this change, the go_lock section of function do_promote
> was extracted to its own new function, gfs2_go_lock. The reason we do
> this is because GL_SKIP callers often read in the object later.
> Before this patch, those GL_SKIP callers (like gfs2_inode_lookup and
> update_rgrp_lvb) called directly into the object-read functions
> (gfs2_inode_refresh and gfs2_rgrp_bh_get respectively), but that never
> cleared the new GLF_GO_LOCK_NEEDED flag. This patch changes those
> functions so they call into the new gfs2_go_lock directly, which takes
> care of all that.

this is looking much better now. The naming of the go_lock callback
and consequently the new flags is poor though; go_lock doesn't
actually do any locking. Something like revalidate would make a lot
more sense.

> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
>  fs/gfs2/glock.c  | 136 +++++++++++++++++++++++++++++++++--------------
>  fs/gfs2/glock.h  |   1 +
>  fs/gfs2/glops.c  |  21 ++++----
>  fs/gfs2/incore.h |   3 +-
>  fs/gfs2/inode.c  |   4 +-
>  fs/gfs2/rgrp.c   |  12 ++---
>  fs/gfs2/super.c  |   6 ++-
>  7 files changed, 121 insertions(+), 62 deletions(-)
>
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 4fcf340603e7..750ea07b4173 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -380,6 +380,66 @@ static void do_error(struct gfs2_glock *gl, const int ret)
>         }
>  }
>
> +/**
> + * gfs2_go_lock - Call the go_lock glops function
> + * @gl: The glock
> + *
> + * Note: the GLF bits used throughout are protected by the lockref spinlock.

They are? How?

> + *
> + * Returns: 0 if go_lock was successful, 2 if type specific operation is
> + * underway, -EAGAIN if secondary go_lock was waiting for another caller of
> + * go_lock, or error.
> + */
> +int gfs2_go_lock(struct gfs2_holder *gh)
> +__releases(&gl->gl_lockref.lock)
> +__acquires(&gl->gl_lockref.lock)

These annotations seem wrong.

> +{
> +       struct gfs2_glock *gl = gh->gh_gl;
> +       const struct gfs2_glock_operations *glops = gl->gl_ops;
> +       int ret;
> +
> +       /*
> +        * Since we unlock the lockref lock, we set a flag to indicate
> +        * go_lock is in progress.
> +        */
> +       if (test_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags)) {
> +               wait_on_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG,
> +                           TASK_UNINTERRUPTIBLE);
> +               /*
> +                * Here we just waited for a first go_lock to finish.
> +                * But that go_lock may not have been successful, as when
> +                * a process locks an inode glock _before_ it has an actual
> +                * inode to populate. So we check again. This secondary  waiter
> +                * _might_ have an inode to populate.
> +                */
> +               return -EAGAIN;

I'm not sure why the retrying needs to be handled by the caller. Also,
there are callers now that don't handle -EAGAIN; what about those?

> +       }
> +
> +       set_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
> +
> +       /* FIXME: eliminate this eventually */
> +       ret = glops->go_lock(gh);
> +
> +       switch(ret) {
> +       case 0:
> +               clear_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
> +               break;
> +       case 1:
> +               ret = 2;
> +               break;
> +       default:
> +               gh->gh_error = ret;
> +               list_del_init(&gh->gh_list);
> +               trace_gfs2_glock_queue(gh, 0);
> +               gfs2_holder_wake(gh);
> +               break;
> +       }
> +       clear_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
> +       smp_mb__after_atomic();
> +       wake_up_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG);
> +       return ret;
> +}
> +
>  /**
>   * do_promote - promote as many requests as possible on the current queue
>   * @gl: The glock
> @@ -389,54 +449,50 @@ static void do_error(struct gfs2_glock *gl, const int ret)
>   */
>
>  static int do_promote(struct gfs2_glock *gl)
> -__releases(&gl->gl_lockref.lock)
> -__acquires(&gl->gl_lockref.lock)
>  {
> -       const struct gfs2_glock_operations *glops = gl->gl_ops;
>         struct gfs2_holder *gh, *tmp;
>         int first;
>
> +       bool lock_released;
>         int ret;
>
>  restart:
>         list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
> +               lock_released = false;
>                 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
>                         continue;
> -               if (may_grant(gl, gh)) {
> -                       first = gfs2_first_holder(gh);
> -                       if (first && glops->go_lock) {
> -                               if (!(gh->gh_flags & GL_SKIP)) {
> -                                       spin_unlock(&gl->gl_lockref.lock);
> -                                       /* FIXME: eliminate this eventually */
> -                                       ret = glops->go_lock(gh);
> -                                       spin_lock(&gl->gl_lockref.lock);
> -                                       if (ret) {
> -                                               if (ret == 1)
> -                                                       return 2;
> -                                               gh->gh_error = ret;
> -                                               list_del_init(&gh->gh_list);
> -                                               trace_gfs2_glock_queue(gh, 0);
> -                                               gfs2_holder_wake(gh);
> -                                               goto restart;
> -                                       }
> -                               }
> -                       }
> -                       set_bit(HIF_HOLDER, &gh->gh_iflags);
> -                       trace_gfs2_promote(gh, first);
> -                       gfs2_holder_wake(gh);
> -                       /*
> -                        * If this was the first holder, we may have released
> -                        * the gl_lockref.lock, so the holders list may have
> -                        * changed. For that reason, we start again at the
> -                        * start of the holders queue.
> -                        */
> -                       if (first)
> -                               goto restart;
> -                       continue;
> +               if (!may_grant(gl, gh)) {
> +                       if (gh->gh_list.prev == &gl->gl_holders)
> +                               return 1;
> +                       do_error(gl, 0);
> +                       break;
>                 }
> -               if (gh->gh_list.prev == &gl->gl_holders)
> -                       return 1;
> -               do_error(gl, 0);
> -               break;
> +
> +               first = gfs2_first_holder(gh);
> +check_go_lock:
> +               if (test_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags) &&
> +                   !(gh->gh_flags & GL_SKIP)) {
> +                       spin_unlock(&gl->gl_lockref.lock);
> +                       ret = gfs2_go_lock(gh);
> +                       spin_lock(&gl->gl_lockref.lock);

I'm wondering if we can change go_lock to take a glock argument now.
After that, we should be able to move the go_lock call to when we're
done with the glock, when we've dropped the lockref spin lock already.
Also, maybe we can move waiting for the flag to gfs2_glock_nq
eventually.

> +                       if (ret) {
> +                               if (ret == -EAGAIN)
> +                                       goto check_go_lock;
> +                               else
> +                                       goto restart;
> +                       } else
> +                               lock_released = true;
> +               }
> +
> +               set_bit(HIF_HOLDER, &gh->gh_iflags);
> +               trace_gfs2_promote(gh, first);

I think 'first' can be killed now.

> +               gfs2_holder_wake(gh);
> +               /*
> +                * If we released the gl_lockref.lock the holders list may have
> +                * changed. For that reason, we start again at the start of
> +                * the holders queue.
> +                */
> +               if (lock_released)
> +                       goto restart;
>         }
>         return 0;
>  }
> @@ -1064,7 +1120,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>
>         atomic_inc(&sdp->sd_glock_disposal);
>         gl->gl_node.next = NULL;
> -       gl->gl_flags = 0;
> +       gl->gl_flags = glops->go_lock ? BIT(GLF_GO_LOCK_NEEDED) : 0;
>         gl->gl_name = name;
>         lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
>         gl->gl_lockref.count = 1;
> @@ -2153,6 +2209,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
>                 *p++ = 'P';
>         if (test_bit(GLF_FREEING, gflags))
>                 *p++ = 'x';
> +       if (test_bit(GLF_GO_LOCK_NEEDED, gflags))
> +               *p++ = 'g';
>         *p = 0;
>         return buf;
>  }
> diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
> index 8a09379dbf66..2e348e250fd2 100644
> --- a/fs/gfs2/glock.h
> +++ b/fs/gfs2/glock.h
> @@ -195,6 +195,7 @@ extern void gfs2_holder_reinit(unsigned int state, u16 flags,
>  extern void gfs2_holder_uninit(struct gfs2_holder *gh);
>  extern int gfs2_glock_nq(struct gfs2_holder *gh);
>  extern int gfs2_glock_poll(struct gfs2_holder *gh);
> +extern int gfs2_go_lock(struct gfs2_holder *gh);
>  extern int gfs2_glock_wait(struct gfs2_holder *gh);
>  extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
>  extern void gfs2_glock_dq(struct gfs2_holder *gh);
> diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> index 4b19f513570f..4823ba63c3bf 100644
> --- a/fs/gfs2/glops.c
> +++ b/fs/gfs2/glops.c
> @@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
>                 struct address_space *mapping = gfs2_glock2aspace(gl);
>                 truncate_inode_pages(mapping, 0);
>                 if (ip) {
> -                       set_bit(GIF_INVALID, &ip->i_flags);
> +                       set_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
>                         forget_all_cached_acls(&ip->i_inode);
>                         security_inode_invalidate_secctx(&ip->i_inode);
>                         gfs2_dir_hash_inval(ip);
> @@ -476,8 +476,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
>
>         error = gfs2_dinode_in(ip, dibh->b_data);
>         brelse(dibh);
> -       clear_bit(GIF_INVALID, &ip->i_flags);

Yikes. Can replacing GIF_INVALID with GLF_GO_LOCK_NEEDED go in a separate patch?

> -
>         return error;
>  }
>
> @@ -493,16 +491,14 @@ static int inode_go_lock(struct gfs2_holder *gh)
>         struct gfs2_glock *gl = gh->gh_gl;
>         struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
>         struct gfs2_inode *ip = gl->gl_object;
> -       int error = 0;
> +       int error = 1;
>
> -       if (!ip)
> -               return 0;
> +       if (!ip) /* no inode to populate - read it in later */
> +               goto out;
>
> -       if (test_bit(GIF_INVALID, &ip->i_flags)) {
> -               error = gfs2_inode_refresh(ip);
> -               if (error)
> -                       return error;
> -       }
> +       error = gfs2_inode_refresh(ip);
> +       if (error)
> +               goto out;
>
>         if (gh->gh_state != LM_ST_DEFERRED)
>                 inode_dio_wait(&ip->i_inode);
> @@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
>                         list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
>                 spin_unlock(&sdp->sd_trunc_lock);
>                 wake_up(&sdp->sd_quota_wait);
> -               return 1;
> +               error = 1;
>         }
>
> +out:
>         return error;
>  }
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 0fe49770166e..c98b37545f2c 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
>
>  enum {
>         GLF_LOCK                        = 1,
> +       GLF_GO_LOCK_NEEDED              = 2, /* needs go_lock */
>         GLF_DEMOTE                      = 3,
>         GLF_PENDING_DEMOTE              = 4,
>         GLF_DEMOTE_IN_PROGRESS          = 5,
> @@ -324,6 +325,7 @@ enum {
>         GLF_REPLY_PENDING               = 9,
>         GLF_INITIAL                     = 10,
>         GLF_FROZEN                      = 11,
> +       GLF_GO_LOCK_IN_PROG             = 12, /* go_lock happening now */
>         GLF_LRU                         = 13,
>         GLF_OBJECT                      = 14, /* Used only for tracing */
>         GLF_BLOCKING                    = 15,
> @@ -370,7 +372,6 @@ struct gfs2_glock {
>  };
>
>  enum {
> -       GIF_INVALID             = 0,
>         GIF_QD_LOCKED           = 1,
>         GIF_ALLOC_FAILED        = 2,
>         GIF_SW_PAGED            = 3,
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 6e15434b23ac..4e583e9a380c 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
>                 }
>
>                 glock_set_object(ip->i_gl, ip);
> -               set_bit(GIF_INVALID, &ip->i_flags);
> +               set_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags);
>                 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
>                 if (unlikely(error))
>                         goto fail;
> @@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
>
>                 if (type == DT_UNKNOWN) {
>                         /* Inode glock must be locked already */
> -                       error = gfs2_inode_refresh(GFS2_I(inode));
> +                       error = gfs2_go_lock(&i_gh);
>                         if (error)
>                                 goto fail;
>                 } else {
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 96b2fbed6bf1..796f9e015218 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1261,7 +1261,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
>         return error;
>  }
>
> -static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
> +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
>  {
>         u32 rl_flags;
>
> @@ -1269,7 +1269,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
>                 return 0;
>
>         if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
> -               return gfs2_rgrp_bh_get(rgd);
> +               gfs2_go_lock(gh);
>
>         rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
>         rl_flags &= ~GFS2_RDF_MASK;
> @@ -1312,6 +1312,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
>                         bi->bi_bh = NULL;
>                 }
>         }
> +       set_bit(GLF_GO_LOCK_NEEDED, &rgd->rd_gl->gl_flags);
>  }
>
>  int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
> @@ -2110,7 +2111,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
>                             gfs2_rgrp_congested(rs->rs_rgd, loops))
>                                 goto skip_rgrp;
>                         if (sdp->sd_args.ar_rgrplvb) {
> -                               error = update_rgrp_lvb(rs->rs_rgd);
> +                               error = update_rgrp_lvb(rs->rs_rgd,
> +                                                       &ip->i_rgd_gh);
>                                 if (unlikely(error)) {
>                                         rgrp_unlock_local(rs->rs_rgd);
>                                         gfs2_glock_dq_uninit(&ip->i_rgd_gh);
> @@ -2126,7 +2128,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
>                         goto skip_rgrp;
>
>                 if (sdp->sd_args.ar_rgrplvb)
> -                       gfs2_rgrp_bh_get(rs->rs_rgd);
> +                       gfs2_go_lock(&ip->i_rgd_gh);
>
>                 /* Get a reservation if we don't already have one */
>                 if (!gfs2_rs_active(rs))
> @@ -2762,8 +2764,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
>
>  void rgrp_lock_local(struct gfs2_rgrpd *rgd)
>  {
> -       GLOCK_BUG_ON(rgd->rd_gl, !gfs2_glock_is_held_excl(rgd->rd_gl) &&
> -                    !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
>         mutex_lock(&rgd->rd_mutex);
>  }
>
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 6e00d15ef0a8..20698232774a 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1244,8 +1244,10 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
>         if (ret)
>                 return SHOULD_NOT_DELETE_DINODE;
>
> -       if (test_bit(GIF_INVALID, &ip->i_flags)) {
> -               ret = gfs2_inode_refresh(ip);
> +       if (test_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags)) {
> +               do {
> +                       ret = gfs2_go_lock(gh);
> +               } while (ret == -EAGAIN);
>                 if (ret)
>                         return SHOULD_NOT_DELETE_DINODE;
>         }
> --
> 2.31.1
>

Thanks,
Andreas



^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems
  2021-09-29 15:35 ` Andreas Gruenbacher
@ 2021-09-29 16:42   ` Andreas Gruenbacher
  0 siblings, 0 replies; 4+ messages in thread
From: Andreas Gruenbacher @ 2021-09-29 16:42 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Wed, Sep 29, 2021 at 5:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
>
> Hi Bob,
>
> On Wed, Sep 29, 2021 at 3:21 PM Bob Peterson <rpeterso@redhat.com> wrote:
> > Before this patch, when a glock was locked, the very first holder on the
> > queue would unlock the lockref and call the go_lock glops function (if
> > one existed), unless GL_SKIP was specified. When we introduced the new
> > node-scope concept, we allowed multiple holders to lock glocks in EX mode
> > and share the lock.
> >
> > But node-scope introduced a new problem: if the first holder has GL_SKIP
> > and the next one does NOT, since it is not the first holder on the queue,
> > the go_lock op was not called. Eventually the GL_SKIP holder may call the
> > go_lock sub-function (e.g. gfs2_rgrp_bh_get) but there was still a
> > window of time in which another non-GL_SKIP holder assumes the go_lock
> > function had been called by the first holder. In the case of rgrp glocks,
> > this led to a NULL pointer dereference on the buffer_heads.
> >
> > This patch tries to fix the problem by introducing two new glock flags:
> >
> > GLF_GO_LOCK_NEEDED, which keeps track of when the go_lock function needs
> > to be called to "fill in" or "read" the object before it is referenced.
> >
> > GLF_GO_LOCK_IN_PROG which is used to determine when a process is
> > in the process of reading in the object. Whenever a function needs to
> > reference the object, it checks the GLF_GO_LOCK_NEEDED flag, and if
> > set, it sets GLF_GO_LOCK_IN_PROG and calls the glops "go_lock" function.
> >
> > As before, the gl_lockref spin_lock is unlocked during the IO operation,
> > which may take a relatively long amount of time to complete. While
> > unlocked, if another process determines go_lock is still needed, it sees
> > the GLF_GO_LOCK_IN_PROG flag is set, and waits for the go_lock glop
> > operation to be completed. Once GLF_GO_LOCK_IN_PROG is cleared, it needs
> > to check GLF_GO_LOCK_NEEDED again because the other process's go_lock
> > operation may not have been successful.
> >
> > To facilitate this change, the go_lock section of function do_promote
> > was extracted to its own new function, gfs2_go_lock. The reason we do
> > this is because GL_SKIP callers often read in the object later.
> > Before this patch, those GL_SKIP callers (like gfs2_inode_lookup and
> > update_rgrp_lvb) called directly into the object-read functions
> > (gfs2_inode_refresh and gfs2_rgrp_bh_get respectively), but that never
> > cleared the new GLF_GO_LOCK_NEEDED flag. This patch changes those
> > functions so they call into the new gfs2_go_lock directly, which takes
> > care of all that.
>
> this is looking much better now. The naming of the go_lock callback
> and consequently the new flags is poor though; go_lock doesn't
> actually do any locking. Something like revalidate would make a lot
> more sense.
>
> > Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> > ---
> >  fs/gfs2/glock.c  | 136 +++++++++++++++++++++++++++++++++--------------
> >  fs/gfs2/glock.h  |   1 +
> >  fs/gfs2/glops.c  |  21 ++++----
> >  fs/gfs2/incore.h |   3 +-
> >  fs/gfs2/inode.c  |   4 +-
> >  fs/gfs2/rgrp.c   |  12 ++---
> >  fs/gfs2/super.c  |   6 ++-
> >  7 files changed, 121 insertions(+), 62 deletions(-)
> >
> > diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> > index 4fcf340603e7..750ea07b4173 100644
> > --- a/fs/gfs2/glock.c
> > +++ b/fs/gfs2/glock.c
> > @@ -380,6 +380,66 @@ static void do_error(struct gfs2_glock *gl, const int ret)
> >         }
> >  }
> >
> > +/**
> > + * gfs2_go_lock - Call the go_lock glops function
> > + * @gl: The glock
> > + *
> > + * Note: the GLF bits used throughout are protected by the lockref spinlock.
>
> They are? How?
>
> > + *
> > + * Returns: 0 if go_lock was successful, 2 if type specific operation is
> > + * underway, -EAGAIN if secondary go_lock was waiting for another caller of
> > + * go_lock, or error.
> > + */
> > +int gfs2_go_lock(struct gfs2_holder *gh)
> > +__releases(&gl->gl_lockref.lock)
> > +__acquires(&gl->gl_lockref.lock)
>
> These annotations seem wrong.
>
> > +{
> > +       struct gfs2_glock *gl = gh->gh_gl;
> > +       const struct gfs2_glock_operations *glops = gl->gl_ops;
> > +       int ret;
> > +
> > +       /*
> > +        * Since we unlock the lockref lock, we set a flag to indicate
> > +        * go_lock is in progress.
> > +        */
> > +       if (test_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags)) {
> > +               wait_on_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG,
> > +                           TASK_UNINTERRUPTIBLE);
> > +               /*
> > +                * Here we just waited for a 1st go_lock to finish.
> > +                * But that go_lock may not have been successful, as when
> > +                * a process locks an inode glock _before_ it has an actual
> > +                * inode to populate. So we check again. This secondary waiter
> > +                * _might_ have an inode to populate.
> > +                */
> > +               return -EAGAIN;
>
> I'm not sure why the retrying needs to be handled by the caller. Also,
> there are callers now that don't handle -EAGAIN; what about those?
>
> > +       }
> > +
> > +       set_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
> > +
> > +       /* FIXME: eliminate this eventually */
> > +       ret = glops->go_lock(gh);
> > +
> > +       switch(ret) {
> > +       case 0:
> > +               clear_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
> > +               break;
> > +       case 1:
> > +               ret = 2;
> > +               break;
> > +       default:
> > +               gh->gh_error = ret;
> > +               list_del_init(&gh->gh_list);
> > +               trace_gfs2_glock_queue(gh, 0);
> > +               gfs2_holder_wake(gh);
> > +               break;
> > +       }
> > +       clear_bit(GLF_GO_LOCK_IN_PROG, &gl->gl_flags);
> > +       smp_mb__after_atomic();
> > +       wake_up_bit(&gl->gl_flags, GLF_GO_LOCK_IN_PROG);
> > +       return ret;
> > +}
> > +
> >  /**
> >   * do_promote - promote as many requests as possible on the current queue
> >   * @gl: The glock
> > @@ -389,54 +449,50 @@ static void do_error(struct gfs2_glock *gl, const int ret)
> >   */
> >
> >  static int do_promote(struct gfs2_glock *gl)
> > -__releases(&gl->gl_lockref.lock)
> > -__acquires(&gl->gl_lockref.lock)
> >  {
> > -       const struct gfs2_glock_operations *glops = gl->gl_ops;
> >         struct gfs2_holder *gh, *tmp;
> >         int first;
> >
> > +       bool lock_released;
> >         int ret;
> >
> >  restart:
> >         list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
> > +               lock_released = false;
> >                 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
> >                         continue;
> > -               if (may_grant(gl, gh)) {
> > -                       first = gfs2_first_holder(gh);
> > -                       if (first && glops->go_lock) {
> > -                               if (!(gh->gh_flags & GL_SKIP)) {
> > -                                       spin_unlock(&gl->gl_lockref.lock);
> > -                                       /* FIXME: eliminate this eventually */
> > -                                       ret = glops->go_lock(gh);
> > -                                       spin_lock(&gl->gl_lockref.lock);
> > -                                       if (ret) {
> > -                                               if (ret == 1)
> > -                                                       return 2;
> > -                                               gh->gh_error = ret;
> > -                                               list_del_init(&gh->gh_list);
> > -                                               trace_gfs2_glock_queue(gh, 0);
> > -                                               gfs2_holder_wake(gh);
> > -                                               goto restart;
> > -                                       }
> > -                               }
> > -                       }
> > -                       set_bit(HIF_HOLDER, &gh->gh_iflags);
> > -                       trace_gfs2_promote(gh, first);
> > -                       gfs2_holder_wake(gh);
> > -                       /*
> > -                        * If this was the first holder, we may have released
> > -                        * the gl_lockref.lock, so the holders list may have
> > -                        * changed. For that reason, we start again at the
> > -                        * start of the holders queue.
> > -                        */
> > -                       if (first)
> > -                               goto restart;
> > -                       continue;
> > +               if (!may_grant(gl, gh)) {
> > +                       if (gh->gh_list.prev == &gl->gl_holders)
> > +                               return 1;
> > +                       do_error(gl, 0);
> > +                       break;
> >                 }
> > -               if (gh->gh_list.prev == &gl->gl_holders)
> > -                       return 1;
> > -               do_error(gl, 0);
> > -               break;
> > +
> > +               first = gfs2_first_holder(gh);
> > +check_go_lock:
> > +               if (test_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags) &&
> > +                   !(gh->gh_flags & GL_SKIP)) {
> > +                       spin_unlock(&gl->gl_lockref.lock);
> > +                       ret = gfs2_go_lock(gh);
> > +                       spin_lock(&gl->gl_lockref.lock);
>
> I'm wondering if we can change go_lock to take a glock argument now.
> After that, we should be able to move the go_lock call to when we're
> done with the glock, when we've dropped the lockref spin lock already.
> Also, maybe we can move waiting for the flag to gfs2_glock_nq
> eventually.

Actually, I think this entire if can go into gfs2_glock_nq.

Andreas

> > +                       if (ret) {
> > +                               if (ret == -EAGAIN)
> > +                                       goto check_go_lock;
> > +                               else
> > +                                       goto restart;
> > +                       } else
> > +                               lock_released = true;
> > +               }
> > +
> > +               set_bit(HIF_HOLDER, &gh->gh_iflags);
> > +               trace_gfs2_promote(gh, first);
>
> I think 'first' can be killed now.
>
> > +               gfs2_holder_wake(gh);
> > +               /*
> > +                * If we released the gl_lockref.lock the holders list may have
> > +                * changed. For that reason, we start again at the start of
> > +                * the holders queue.
> > +                */
> > +               if (lock_released)
> > +                       goto restart;
> >         }
> >         return 0;
> >  }
> > @@ -1064,7 +1120,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
> >
> >         atomic_inc(&sdp->sd_glock_disposal);
> >         gl->gl_node.next = NULL;
> > -       gl->gl_flags = 0;
> > +       gl->gl_flags = glops->go_lock ? BIT(GLF_GO_LOCK_NEEDED) : 0;
> >         gl->gl_name = name;
> >         lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
> >         gl->gl_lockref.count = 1;
> > @@ -2153,6 +2209,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
> >                 *p++ = 'P';
> >         if (test_bit(GLF_FREEING, gflags))
> >                 *p++ = 'x';
> > +       if (test_bit(GLF_GO_LOCK_NEEDED, gflags))
> > +               *p++ = 'g';
> >         *p = 0;
> >         return buf;
> >  }
> > diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
> > index 8a09379dbf66..2e348e250fd2 100644
> > --- a/fs/gfs2/glock.h
> > +++ b/fs/gfs2/glock.h
> > @@ -195,6 +195,7 @@ extern void gfs2_holder_reinit(unsigned int state, u16 flags,
> >  extern void gfs2_holder_uninit(struct gfs2_holder *gh);
> >  extern int gfs2_glock_nq(struct gfs2_holder *gh);
> >  extern int gfs2_glock_poll(struct gfs2_holder *gh);
> > +extern int gfs2_go_lock(struct gfs2_holder *gh);
> >  extern int gfs2_glock_wait(struct gfs2_holder *gh);
> >  extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
> >  extern void gfs2_glock_dq(struct gfs2_holder *gh);
> > diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> > index 4b19f513570f..4823ba63c3bf 100644
> > --- a/fs/gfs2/glops.c
> > +++ b/fs/gfs2/glops.c
> > @@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
> >                 struct address_space *mapping = gfs2_glock2aspace(gl);
> >                 truncate_inode_pages(mapping, 0);
> >                 if (ip) {
> > -                       set_bit(GIF_INVALID, &ip->i_flags);
> > +                       set_bit(GLF_GO_LOCK_NEEDED, &gl->gl_flags);
> >                         forget_all_cached_acls(&ip->i_inode);
> >                         security_inode_invalidate_secctx(&ip->i_inode);
> >                         gfs2_dir_hash_inval(ip);
> > @@ -476,8 +476,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
> >
> >         error = gfs2_dinode_in(ip, dibh->b_data);
> >         brelse(dibh);
> > -       clear_bit(GIF_INVALID, &ip->i_flags);
>
> Yikes. Can replacing GIF_INVALID with GLF_GO_LOCK_NEEDED go in a separate patch?
>
> > -
> >         return error;
> >  }
> >
> > @@ -493,16 +491,14 @@ static int inode_go_lock(struct gfs2_holder *gh)
> >         struct gfs2_glock *gl = gh->gh_gl;
> >         struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> >         struct gfs2_inode *ip = gl->gl_object;
> > -       int error = 0;
> > +       int error = 1;
> >
> > -       if (!ip)
> > -               return 0;
> > +       if (!ip) /* no inode to populate - read it in later */
> > +               goto out;
> >
> > -       if (test_bit(GIF_INVALID, &ip->i_flags)) {
> > -               error = gfs2_inode_refresh(ip);
> > -               if (error)
> > -                       return error;
> > -       }
> > +       error = gfs2_inode_refresh(ip);
> > +       if (error)
> > +               goto out;
> >
> >         if (gh->gh_state != LM_ST_DEFERRED)
> >                 inode_dio_wait(&ip->i_inode);
> > @@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
> >                         list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
> >                 spin_unlock(&sdp->sd_trunc_lock);
> >                 wake_up(&sdp->sd_quota_wait);
> > -               return 1;
> > +               error = 1;
> >         }
> >
> > +out:
> >         return error;
> >  }
> >
> > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> > index 0fe49770166e..c98b37545f2c 100644
> > --- a/fs/gfs2/incore.h
> > +++ b/fs/gfs2/incore.h
> > @@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
> >
> >  enum {
> >         GLF_LOCK                        = 1,
> > +       GLF_GO_LOCK_NEEDED              = 2, /* needs go_lock */
> >         GLF_DEMOTE                      = 3,
> >         GLF_PENDING_DEMOTE              = 4,
> >         GLF_DEMOTE_IN_PROGRESS          = 5,
> > @@ -324,6 +325,7 @@ enum {
> >         GLF_REPLY_PENDING               = 9,
> >         GLF_INITIAL                     = 10,
> >         GLF_FROZEN                      = 11,
> > +       GLF_GO_LOCK_IN_PROG             = 12, /* go_lock happening now */
> >         GLF_LRU                         = 13,
> >         GLF_OBJECT                      = 14, /* Used only for tracing */
> >         GLF_BLOCKING                    = 15,
> > @@ -370,7 +372,6 @@ struct gfs2_glock {
> >  };
> >
> >  enum {
> > -       GIF_INVALID             = 0,
> >         GIF_QD_LOCKED           = 1,
> >         GIF_ALLOC_FAILED        = 2,
> >         GIF_SW_PAGED            = 3,
> > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > index 6e15434b23ac..4e583e9a380c 100644
> > --- a/fs/gfs2/inode.c
> > +++ b/fs/gfs2/inode.c
> > @@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
> >                 }
> >
> >                 glock_set_object(ip->i_gl, ip);
> > -               set_bit(GIF_INVALID, &ip->i_flags);
> > +               set_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags);
> >                 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
> >                 if (unlikely(error))
> >                         goto fail;
> > @@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
> >
> >                 if (type == DT_UNKNOWN) {
> >                         /* Inode glock must be locked already */
> > -                       error = gfs2_inode_refresh(GFS2_I(inode));
> > +                       error = gfs2_go_lock(&i_gh);
> >                         if (error)
> >                                 goto fail;
> >                 } else {
> > diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> > index 96b2fbed6bf1..796f9e015218 100644
> > --- a/fs/gfs2/rgrp.c
> > +++ b/fs/gfs2/rgrp.c
> > @@ -1261,7 +1261,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
> >         return error;
> >  }
> >
> > -static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
> > +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
> >  {
> >         u32 rl_flags;
> >
> > @@ -1269,7 +1269,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
> >                 return 0;
> >
> >         if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
> > -               return gfs2_rgrp_bh_get(rgd);
> > +               gfs2_go_lock(gh);
> >
> >         rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
> >         rl_flags &= ~GFS2_RDF_MASK;
> > @@ -1312,6 +1312,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
> >                         bi->bi_bh = NULL;
> >                 }
> >         }
> > +       set_bit(GLF_GO_LOCK_NEEDED, &rgd->rd_gl->gl_flags);
> >  }
> >
> >  int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
> > @@ -2110,7 +2111,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
> >                             gfs2_rgrp_congested(rs->rs_rgd, loops))
> >                                 goto skip_rgrp;
> >                         if (sdp->sd_args.ar_rgrplvb) {
> > -                               error = update_rgrp_lvb(rs->rs_rgd);
> > +                               error = update_rgrp_lvb(rs->rs_rgd,
> > +                                                       &ip->i_rgd_gh);
> >                                 if (unlikely(error)) {
> >                                         rgrp_unlock_local(rs->rs_rgd);
> >                                         gfs2_glock_dq_uninit(&ip->i_rgd_gh);
> > @@ -2126,7 +2128,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
> >                         goto skip_rgrp;
> >
> >                 if (sdp->sd_args.ar_rgrplvb)
> > -                       gfs2_rgrp_bh_get(rs->rs_rgd);
> > +                       gfs2_go_lock(&ip->i_rgd_gh);
> >
> >                 /* Get a reservation if we don't already have one */
> >                 if (!gfs2_rs_active(rs))
> > @@ -2762,8 +2764,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
> >
> >  void rgrp_lock_local(struct gfs2_rgrpd *rgd)
> >  {
> > -       GLOCK_BUG_ON(rgd->rd_gl, !gfs2_glock_is_held_excl(rgd->rd_gl) &&
> > -                    !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
> >         mutex_lock(&rgd->rd_mutex);
> >  }
> >
> > diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> > index 6e00d15ef0a8..20698232774a 100644
> > --- a/fs/gfs2/super.c
> > +++ b/fs/gfs2/super.c
> > @@ -1244,8 +1244,10 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
> >         if (ret)
> >                 return SHOULD_NOT_DELETE_DINODE;
> >
> > -       if (test_bit(GIF_INVALID, &ip->i_flags)) {
> > -               ret = gfs2_inode_refresh(ip);
> > +       if (test_bit(GLF_GO_LOCK_NEEDED, &ip->i_gl->gl_flags)) {
> > +               do {
> > +                       ret = gfs2_go_lock(gh);
> > +               } while (ret == -EAGAIN);
> >                 if (ret)
> >                         return SHOULD_NOT_DELETE_DINODE;
> >         }
> > --
> > 2.31.1
> >
>
> Thanks,
> Andreas



^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-09-29 16:42 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-29 13:21 [Cluster-devel] [PATCH] gfs2: fix GL_SKIP node_scope problems Bob Peterson
2021-09-29 13:27 ` Bob Peterson
2021-09-29 15:35 ` Andreas Gruenbacher
2021-09-29 16:42   ` Andreas Gruenbacher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.