* [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock
@ 2017-07-03 14:56 Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks Andreas Gruenbacher
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-03 14:56 UTC (permalink / raw)
  To: cluster-devel.redhat.com

These are the remaining patches for fixing a cluster-wide GFS2 and DLM
deadlock.

As explained in the previous posting of this patch queue, when inodes
are evicted, GFS2 currently calls into DLM.  Inode eviction can be
triggered by memory pressure, in the context of a random user-space
process.  If DLM happens to block in the process in question (for
example, if that process is a fence agent), GFS2 and DLM will deadlock.

This patch queue stops GFS2 from calling into DLM on the inode evict
path when under memory pressure.  It does so by first decoupling
destroying inodes from putting their associated glocks, which is what
ends up calling into DLM.  Second, when under memory pressure, it moves
putting glocks into work queue context where it cannot block DLM.
Third, when gfs2_drop_inode determines that an inode's link count has
hit zero under memory pressure, it puts that inode on the delete
workqueue (and keeps the inode in the icache) instead of causing
gfs2_evict_inode to delete the inode immediately.  The delete workqueue
will not be processed under memory pressure, so deleting inodes from
there is safe.

Thanks,
Andreas

Andreas Gruenbacher (4):
  gfs2: gfs2_glock_get: Wait on freeing glocks
  gfs2: Get rid of gfs2_set_nlink
  gfs2: gfs2_evict_inode: Put glocks asynchronously
  gfs2: Defer deleting inodes under memory pressure

 fs/gfs2/glock.c  | 145 ++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/gfs2/glock.h  |   2 +
 fs/gfs2/glops.c  |  28 +----------
 fs/gfs2/incore.h |   1 +
 fs/gfs2/super.c  |  39 ++++++++++++++-
 5 files changed, 162 insertions(+), 53 deletions(-)

-- 
2.7.5



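The mechanism described above hinges on a single kernel convention: tasks
running in the memory-reclaim path have PF_MEMALLOC set in current->flags,
so code on the eviction path can detect memory pressure and hand work that
might block on DLM to a workqueue instead of doing it inline.  Here is a
minimal, illustrative sketch of that pattern (not code from this series;
my_object, defer_wq, deferred_put_fn, and do_blocking_put are made-up
names, while PF_MEMALLOC and the workqueue calls are the real APIs):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/workqueue.h>

struct my_object {
	struct work_struct work;  /* INIT_WORK(&obj->work, deferred_put_fn) at setup */
};

static struct workqueue_struct *defer_wq;  /* created with alloc_workqueue() elsewhere */

/* Placeholder for the part that may block, e.g. by calling into DLM. */
static void do_blocking_put(struct my_object *obj)
{
}

static void deferred_put_fn(struct work_struct *work)
{
	struct my_object *obj = container_of(work, struct my_object, work);

	/* Workqueue context is not a reclaim context, so blocking is safe. */
	do_blocking_put(obj);
}

static void put_object(struct my_object *obj)
{
	if (current->flags & PF_MEMALLOC) {
		/* Called from memory reclaim: defer rather than block. */
		queue_work(defer_wq, &obj->work);
		return;
	}
	do_blocking_put(obj);
}

Patches 3 and 4 below apply exactly this test, with gfs2_glock_queue_put
and the delete workqueue playing the role of the deferred path.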

* [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks
  2017-07-03 14:56 [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Andreas Gruenbacher
@ 2017-07-03 14:56 ` Andreas Gruenbacher
  2017-07-04 12:13   ` Steven Whitehouse
  2017-07-05 14:22   ` Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 2/4] gfs2: Get rid of gfs2_set_nlink Andreas Gruenbacher
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-03 14:56 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Keep glocks in their hash table until they are freed instead of removing
them when their last reference is dropped.  This allows us to wait for
glocks to go away before recreating them in gfs2_glock_get so that the
previous use of the glock won't overlap with the next.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++----------
 fs/gfs2/incore.h |   1 +
 2 files changed, 113 insertions(+), 23 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6cd71c5..8a7944b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/delay.h>
 #include <linux/sort.h>
+#include <linux/hash.h>
 #include <linux/jhash.h>
 #include <linux/kallsyms.h>
 #include <linux/gfs2_ondisk.h>
@@ -80,9 +81,69 @@ static struct rhashtable_params ht_parms = {
 
 static struct rhashtable gl_hash_table;
 
-void gfs2_glock_free(struct gfs2_glock *gl)
+#define GLOCK_WAIT_TABLE_BITS 12
+#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS)
+static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+struct wait_glock_queue {
+	struct lm_lockname *name;
+	wait_queue_t wait;
+};
+
+static int glock_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+			       void *key)
 {
-	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+	struct wait_glock_queue *wait_glock =
+		container_of(wait, struct wait_glock_queue, wait);
+	struct lm_lockname *wait_name = wait_glock->name;
+	struct lm_lockname *wake_name = key;
+
+	if (wake_name->ln_sbd != wait_name->ln_sbd ||
+	    wake_name->ln_number != wait_name->ln_number ||
+	    wake_name->ln_type != wait_name->ln_type)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
+{
+	u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+
+	return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
+}
+
+static void prepare_to_wait_on_glock(wait_queue_head_t **wq,
+				     struct wait_glock_queue *wait,
+				     struct lm_lockname *name)
+{
+	wait->name = name;
+	init_wait(&wait->wait);
+	wait->wait.func = glock_wake_function;
+	*wq = glock_waitqueue(name);
+	prepare_to_wait(*wq, &wait->wait, TASK_UNINTERRUPTIBLE);
+}
+
+static void finish_wait_on_glock(wait_queue_head_t *wq,
+				 struct wait_glock_queue *wait)
+{
+	finish_wait(wq, &wait->wait);
+}
+
+/**
+ * wake_up_glock  -  Wake up waiters on a glock
+ * @gl: the glock
+ */
+static void wake_up_glock(struct gfs2_glock *gl)
+{
+	wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name);
+
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_NORMAL, 1, &gl->gl_name);
+}
+
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
+{
+	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
 
 	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
 		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -90,6 +151,15 @@ void gfs2_glock_free(struct gfs2_glock *gl)
 		kfree(gl->gl_lksb.sb_lvbptr);
 		kmem_cache_free(gfs2_glock_cachep, gl);
 	}
+}
+
+void gfs2_glock_free(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
+	wake_up_glock(gl);
+	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
 	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
 		wake_up(&sdp->sd_glock_wait);
 }
@@ -184,7 +254,6 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
 
 	gfs2_glock_remove_from_lru(gl);
 	spin_unlock(&gl->gl_lockref.lock);
-	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
 	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
 	trace_gfs2_glock_put(gl);
@@ -669,6 +738,36 @@ static void glock_work_func(struct work_struct *work)
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
+static struct gfs2_glock *find_insert_glock(struct lm_lockname *name,
+					    struct gfs2_glock *new)
+{
+	struct wait_glock_queue wait;
+	wait_queue_head_t *wq;
+	struct gfs2_glock *gl;
+
+again:
+	prepare_to_wait_on_glock(&wq, &wait, name);
+	rcu_read_lock();
+	if (new) {
+		gl = rhashtable_lookup_get_insert_fast(&gl_hash_table,
+			&new->gl_node, ht_parms);
+		if (IS_ERR(gl))
+			goto out;
+	} else {
+		gl = rhashtable_lookup_fast(&gl_hash_table,
+			name, ht_parms);
+	}
+	if (gl && !lockref_get_not_dead(&gl->gl_lockref)) {
+		rcu_read_unlock();
+		schedule();
+		goto again;
+	}
+out:
+	rcu_read_unlock();
+	finish_wait_on_glock(wq, &wait);
+	return gl;
+}
+
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -695,15 +794,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	struct kmem_cache *cachep;
 	int ret = 0;
 
-	rcu_read_lock();
-	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
-	if (gl && !lockref_get_not_dead(&gl->gl_lockref))
-		gl = NULL;
-	rcu_read_unlock();
-
-	*glp = gl;
-	if (gl)
+	gl = find_insert_glock(&name, NULL);
+	if (gl) {
+		*glp = gl;
 		return 0;
+	}
 	if (!create)
 		return -ENOENT;
 
@@ -757,10 +852,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->writeback_index = 0;
 	}
 
-again:
-	rcu_read_lock();
-	tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
-						ht_parms);
+	tmp = find_insert_glock(&name, gl);
 	if (!tmp) {
 		*glp = gl;
 		goto out;
@@ -769,13 +861,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		ret = PTR_ERR(tmp);
 		goto out_free;
 	}
-	if (lockref_get_not_dead(&tmp->gl_lockref)) {
-		*glp = tmp;
-		goto out_free;
-	}
-	rcu_read_unlock();
-	cond_resched();
-	goto again;
+	*glp = tmp;
 
 out_free:
 	kfree(gl->gl_lksb.sb_lvbptr);
@@ -1796,7 +1882,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 
 int __init gfs2_glock_init(void)
 {
-	int ret;
+	int i, ret;
 
 	ret = rhashtable_init(&gl_hash_table, &ht_parms);
 	if (ret < 0)
@@ -1825,6 +1911,9 @@ int __init gfs2_glock_init(void)
 		return ret;
 	}
 
+	for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(glock_wait_table + i);
+
 	return 0;
 }
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index de42384..2ba9403 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -373,6 +373,7 @@ struct gfs2_glock {
 			loff_t end;
 		} gl_vm;
 	};
+	struct rcu_head gl_rcu;
 	struct rhash_head gl_node;
 };
 
-- 
2.7.5



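A note on the waiting scheme this patch introduces: instead of embedding a
wait queue in every glock (which would have to stay valid until the last
waiter is gone), a fixed table of 4096 wait queue heads is indexed by a
hash of the lock name, and the waker passes the name as the wake key so
that glock_wake_function only wakes waiters for the matching glock.
Condensed from the hunks above, the lookup side of find_insert_glock
(insert path and error handling omitted) is:

	struct wait_glock_queue wait;
	wait_queue_head_t *wq;
	struct gfs2_glock *gl;

again:
	prepare_to_wait_on_glock(&wq, &wait, name);  /* hashed wq, TASK_UNINTERRUPTIBLE */
	rcu_read_lock();
	gl = rhashtable_lookup_fast(&gl_hash_table, name, ht_parms);
	if (gl && !lockref_get_not_dead(&gl->gl_lockref)) {
		/* Found, but already dying: sleep until gfs2_glock_free()
		 * drops it from the hash table and wakes us, then retry. */
		rcu_read_unlock();
		schedule();
		goto again;
	}
	rcu_read_unlock();
	finish_wait_on_glock(wq, &wait);

Hashing into a shared table trades the occasional spurious wakeup (two
names hashing to the same head) for not having to keep per-glock wait
state alive across the free.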

* [Cluster-devel] [PATCH v2 2/4] gfs2: Get rid of gfs2_set_nlink
  2017-07-03 14:56 [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks Andreas Gruenbacher
@ 2017-07-03 14:56 ` Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 3/4] gfs2: gfs2_evict_inode: Put glocks asynchronously Andreas Gruenbacher
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-03 14:56 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Remove gfs2_set_nlink, which prevents the link count of an inode from
becoming non-zero once it has reached zero.  The following changes will
reduce the amount of waiting on glocks when an inode is evicted from
memory.  With that, an inode can be reallocated before all the
remote-unlink callbacks from a previous delete have been processed,
which causes the link count to change from zero to non-zero.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glops.c | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 5e69636..263b20f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
 	return 1;
 }
 
-/**
- * gfs2_set_nlink - Set the inode's link count based on on-disk info
- * @inode: The inode in question
- * @nlink: The link count
- *
- * If the link count has hit zero, it must never be raised, whatever the
- * on-disk inode might say. When new struct inodes are created the link
- * count is set to 1, so that we can safely use this test even when reading
- * in on disk information for the first time.
- */
-
-static void gfs2_set_nlink(struct inode *inode, u32 nlink)
-{
-	/*
-	 * We will need to review setting the nlink count here in the
-	 * light of the forthcoming ro bind mount work. This is a reminder
-	 * to do that.
-	 */
-	if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
-		if (nlink == 0)
-			clear_nlink(inode);
-		else
-			set_nlink(inode, nlink);
-	}
-}
-
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	const struct gfs2_dinode *str = buf;
@@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 
 	i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
 	i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
-	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
+	set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
 	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
-- 
2.7.5



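One point this change relies on: the plain VFS helper already copes with a
link count going to or from zero, so no filesystem-specific wrapper is
needed once zero-to-nonzero transitions are legitimate.  From memory,
set_nlink() in fs/inode.c of kernels around this time looks roughly like
the following (quoted for illustration; check the actual tree rather than
trusting this):

void set_nlink(struct inode *inode, unsigned int nlink)
{
	if (!nlink) {
		clear_nlink(inode);
	} else {
		/* Yes, some filesystems do change nlink from zero to one */
		if (inode->i_nlink == 0)
			atomic_long_dec(&inode->i_sb->s_remove_count);

		inode->__i_nlink = nlink;
	}
}

In other words, the zero case is still funneled through clear_nlink();
what gfs2_set_nlink added on top was only the refusal to ever raise a zero
count again, and that is precisely the behavior the rest of this series
has to get rid of.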

* [Cluster-devel] [PATCH v2 3/4] gfs2: gfs2_evict_inode: Put glocks asynchronously
  2017-07-03 14:56 [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks Andreas Gruenbacher
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 2/4] gfs2: Get rid of gfs2_set_nlink Andreas Gruenbacher
@ 2017-07-03 14:56 ` Andreas Gruenbacher
  2017-07-04 12:16   ` Steven Whitehouse
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 4/4] gfs2: Defer deleting inodes under memory pressure Andreas Gruenbacher
  2017-07-04 12:11 ` [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Steven Whitehouse
  4 siblings, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-03 14:56 UTC (permalink / raw)
  To: cluster-devel.redhat.com

gfs2_evict_inode is called to free inodes under memory pressure.  The
function calls into DLM when an inode's last cluster-wide reference goes
away (remote unlink) and to release the glock and associated DLM lock
before finally destroying the inode.  However, if DLM is blocked
waiting for memory to become available, calling into DLM again will
deadlock.

Avoid that by decoupling releasing glocks from destroying inodes in that
case: with gfs2_glock_queue_put, glocks will be dequeued asynchronously
in work queue context, when the associated inodes have likely already
been destroyed.

With this change, inodes can end up being unlinked, remote-unlink can be
triggered, and then the inode can be reallocated before all
remote-unlink callbacks are processed.  To detect that, revalidate the
link count in gfs2_evict_inode to make sure we're not deleting an
allocated, referenced inode.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c | 10 +++++++++-
 fs/gfs2/glock.h |  2 ++
 fs/gfs2/super.c | 20 ++++++++++++++++++--
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8a7944b..0236ef5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -170,7 +170,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
  *
  */
 
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
 {
 	GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
 	lockref_get(&gl->gl_lockref);
@@ -260,6 +260,14 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
 	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 }
 
+/*
+ * Cause the glock to be put in work queue context.
+ */
+void gfs2_glock_queue_put(struct gfs2_glock *gl)
+{
+	gfs2_glock_queue_work(gl, 0);
+}
+
 /**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 9ad4a6a..33e0511 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -181,7 +181,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
 extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 			  const struct gfs2_glock_operations *glops,
 			  int create, struct gfs2_glock **glp);
+extern void gfs2_glock_hold(struct gfs2_glock *gl);
 extern void gfs2_glock_put(struct gfs2_glock *gl);
+extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
 extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
 			     u16 flags, struct gfs2_holder *gh);
 extern void gfs2_holder_reinit(unsigned int state, u16 flags,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index fdedec3..d53364e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1562,6 +1562,12 @@ static void gfs2_evict_inode(struct inode *inode)
 			goto out_truncate;
 	}
 
+	/*
+	 * The inode may have been recreated in the meantime.
+	 */
+	if (inode->i_nlink)
+		goto out_truncate;
+
 alloc_failed:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
 	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
@@ -1643,12 +1649,22 @@ static void gfs2_evict_inode(struct inode *inode)
 	glock_set_object(ip->i_gl, NULL);
 	wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
 	gfs2_glock_add_to_lru(ip->i_gl);
-	gfs2_glock_put(ip->i_gl);
+	if (current->flags & PF_MEMALLOC)
+		gfs2_glock_queue_put(ip->i_gl);
+	else
+		gfs2_glock_put(ip->i_gl);
 	ip->i_gl = NULL;
 	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
-		glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+
+		glock_set_object(gl, NULL);
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+		gfs2_glock_hold(gl);
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		if (current->flags & PF_MEMALLOC)
+			gfs2_glock_queue_put(gl);
+		else
+			gfs2_glock_put(gl);
 	}
 }
 
-- 
2.7.5



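The subtle part of the iopen hunk above is the reference dance:
gfs2_holder_init takes a glock reference that gfs2_glock_dq_uninit drops
again, so an extra reference has to be taken first to keep the glock alive
long enough to be put through the possibly asynchronous path.  Annotated
(same code as in the hunk, with comments added):

	struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;

	glock_set_object(gl, NULL);             /* detach the inode from the glock */
	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
	gfs2_glock_hold(gl);                    /* extra ref so gl survives dq_uninit */
	gfs2_glock_dq_uninit(&ip->i_iopen_gh);  /* drops the holder's reference */
	if (current->flags & PF_MEMALLOC)
		gfs2_glock_queue_put(gl);       /* reclaim context: put from the workqueue */
	else
		gfs2_glock_put(gl);             /* normal context: put inline */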

* [Cluster-devel] [PATCH v2 4/4] gfs2: Defer deleting inodes under memory pressure
  2017-07-03 14:56 [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Andreas Gruenbacher
                   ` (2 preceding siblings ...)
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 3/4] gfs2: gfs2_evict_inode: Put glocks asynchronously Andreas Gruenbacher
@ 2017-07-03 14:56 ` Andreas Gruenbacher
  2017-07-04 12:17   ` Steven Whitehouse
  2017-07-04 12:11 ` [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Steven Whitehouse
  4 siblings, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-03 14:56 UTC (permalink / raw)
  To: cluster-devel.redhat.com

When under memory pressure and an inode's link count has dropped to
zero, defer deleting the inode to the delete workqueue.  This avoids
calling into DLM under memory pressure, which can deadlock.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/super.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d53364e..f54af47 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1317,6 +1317,21 @@ static int gfs2_drop_inode(struct inode *inode)
 		if (test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
 	}
+
+	/*
+	 * When under memory pressure and an inode's link count has dropped to
+	 * zero, defer deleting the inode to the delete workqueue.  This avoids
+	 * calling into DLM under memory pressure, which can deadlock.
+	 */
+	if (!inode->i_nlink &&
+	    unlikely(current->flags & PF_MEMALLOC) &&
+	    gfs2_holder_initialized(&ip->i_iopen_gh)) {
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+
+		queue_work(gfs2_delete_workqueue, &gl->gl_delete);
+		return false;
+	}
+
 	return generic_drop_inode(inode);
 }
 
@@ -1544,6 +1559,10 @@ static void gfs2_evict_inode(struct inode *inode)
 		goto alloc_failed;
 	}
 
+	/* Deletes should never happen under memory pressure anymore.  */
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+		goto out;
+
 	/* Must not read inode block until block type has been verified */
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
-- 
2.7.5



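For context on the "return false" above: ->drop_inode is called from
iput_final() with inode->i_lock held, and its return value decides whether
the inode is evicted immediately (nonzero) or kept in the inode cache
(zero).  The generic helper it falls back to is, from memory of
include/linux/fs.h around this time (illustrative, not authoritative):

static inline int generic_drop_inode(struct inode *inode)
{
	return !inode->i_nlink || inode_unhashed(inode);
}

An unlinked inode would normally be evicted and deleted on the spot; by
returning false under PF_MEMALLOC, gfs2_drop_inode keeps it cached and
leaves the actual delete to gfs2_delete_workqueue, which runs outside
reclaim.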

* [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock
  2017-07-03 14:56 [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Andreas Gruenbacher
                   ` (3 preceding siblings ...)
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 4/4] gfs2: Defer deleting inodes under memory pressure Andreas Gruenbacher
@ 2017-07-04 12:11 ` Steven Whitehouse
  2017-07-05  9:57   ` Andreas Gruenbacher
  4 siblings, 1 reply; 12+ messages in thread
From: Steven Whitehouse @ 2017-07-04 12:11 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 03/07/17 15:56, Andreas Gruenbacher wrote:
> These are the remaining patches for fixing a cluster-wide GFS2 and DLM
> deadlock.
>
> As explained in the previous posting of this patch queue, when inodes
> are evicted, GFS2 currently calls into DLM.  Inode eviction can be
> triggered by memory pressure, in the context of a random user-space
> process.  If DLM happens to block in the process in question (for
> example, if that process is a fence agent), GFS2 and DLM will deadlock.
>
> This patch queue stops GFS2 from calling into DLM on the inode evict
> path when under memory pressure.  It does so by first decoupling
> destroying inodes from putting their associated glocks, which is what
> ends up calling into DLM.  Second, when under memory pressure, it moves
> putting glocks into work queue context where it cannot block DLM.
> Third, when gfs2_drop_inode determines that an inode's link count has
> hit zero under memory pressure, it puts that inode on the delete
> workqueue (and keeps the inode in the icache) instead of causing
> gfs2_evict_inode to delete the inode immediately.  The delete workqueue
> will not be processed under memory pressure, so deleting inodes from
> there is safe.
Does this mean that all the corner cases are now covered and that this 
is now passing all the tests? If so, that is a really good step forward. 
I know it has been a real slog to get to this point, but the patch 
series is looking much better for all the hard work that has gone into 
it, I think,

Steve.

> Thanks,
> Andreas
>
> Andreas Gruenbacher (4):
>    gfs2: gfs2_glock_get: Wait on freeing glocks
>    gfs2: Get rid of gfs2_set_nlink
>    gfs2: gfs2_evict_inode: Put glocks asynchronously
>    gfs2: Defer deleting inodes under memory pressure
>
>   fs/gfs2/glock.c  | 145 ++++++++++++++++++++++++++++++++++++++++++++++---------
>   fs/gfs2/glock.h  |   2 +
>   fs/gfs2/glops.c  |  28 +----------
>   fs/gfs2/incore.h |   1 +
>   fs/gfs2/super.c  |  39 ++++++++++++++-
>   5 files changed, 162 insertions(+), 53 deletions(-)
>




* [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks Andreas Gruenbacher
@ 2017-07-04 12:13   ` Steven Whitehouse
  2017-07-05 14:22   ` Andreas Gruenbacher
  1 sibling, 0 replies; 12+ messages in thread
From: Steven Whitehouse @ 2017-07-04 12:13 UTC (permalink / raw)
  To: cluster-devel.redhat.com



On 03/07/17 15:56, Andreas Gruenbacher wrote:
> Keep glocks in their hash table until they are freed instead of removing
> them when their last reference is dropped.  This allows us to wait for
> glocks to go away before recreating them in gfs2_glock_get so that the
> previous use of the glock won't overlap with the next.
I did a double take when I looked at this and saw it reintroducing 
gl_rcu, and I wondered where that had gone in the meantime. Is that a 
bug fix that is included alongside this new feature? If it is not 
required otherwise, I can't quite figure out why it is here - maybe 
something hidden in the rhashtable code perhaps?

Steve.

> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>   fs/gfs2/glock.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++----------
>   fs/gfs2/incore.h |   1 +
>   2 files changed, 113 insertions(+), 23 deletions(-)
>
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 6cd71c5..8a7944b 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -15,6 +15,7 @@
>   #include <linux/buffer_head.h>
>   #include <linux/delay.h>
>   #include <linux/sort.h>
> +#include <linux/hash.h>
>   #include <linux/jhash.h>
>   #include <linux/kallsyms.h>
>   #include <linux/gfs2_ondisk.h>
> @@ -80,9 +81,69 @@ static struct rhashtable_params ht_parms = {
>   
>   static struct rhashtable gl_hash_table;
>   
> -void gfs2_glock_free(struct gfs2_glock *gl)
> +#define GLOCK_WAIT_TABLE_BITS 12
> +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS)
> +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned;
> +
> +struct wait_glock_queue {
> +	struct lm_lockname *name;
> +	wait_queue_t wait;
> +};
> +
> +static int glock_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
> +			       void *key)
>   {
> -	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +	struct wait_glock_queue *wait_glock =
> +		container_of(wait, struct wait_glock_queue, wait);
> +	struct lm_lockname *wait_name = wait_glock->name;
> +	struct lm_lockname *wake_name = key;
> +
> +	if (wake_name->ln_sbd != wait_name->ln_sbd ||
> +	    wake_name->ln_number != wait_name->ln_number ||
> +	    wake_name->ln_type != wait_name->ln_type)
> +		return 0;
> +	return autoremove_wake_function(wait, mode, sync, key);
> +}
> +
> +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
> +{
> +	u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
> +
> +	return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
> +}
> +
> +static void prepare_to_wait_on_glock(wait_queue_head_t **wq,
> +				     struct wait_glock_queue *wait,
> +				     struct lm_lockname *name)
> +{
> +	wait->name = name;
> +	init_wait(&wait->wait);
> +	wait->wait.func = glock_wake_function;
> +	*wq = glock_waitqueue(name);
> +	prepare_to_wait(*wq, &wait->wait, TASK_UNINTERRUPTIBLE);
> +}
> +
> +static void finish_wait_on_glock(wait_queue_head_t *wq,
> +				 struct wait_glock_queue *wait)
> +{
> +	finish_wait(wq, &wait->wait);
> +}
> +
> +/**
> + * wake_up_glock  -  Wake up waiters on a glock
> + * @gl: the glock
> + */
> +static void wake_up_glock(struct gfs2_glock *gl)
> +{
> +	wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name);
> +
> +	if (waitqueue_active(wq))
> +		__wake_up(wq, TASK_NORMAL, 1, &gl->gl_name);
> +}
> +
> +static void gfs2_glock_dealloc(struct rcu_head *rcu)
> +{
> +	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
>   
>   	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
>   		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
> @@ -90,6 +151,15 @@ void gfs2_glock_free(struct gfs2_glock *gl)
>   		kfree(gl->gl_lksb.sb_lvbptr);
>   		kmem_cache_free(gfs2_glock_cachep, gl);
>   	}
> +}
> +
> +void gfs2_glock_free(struct gfs2_glock *gl)
> +{
> +	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +
> +	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
> +	wake_up_glock(gl);
> +	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
>   	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
>   		wake_up(&sdp->sd_glock_wait);
>   }
> @@ -184,7 +254,6 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
>   
>   	gfs2_glock_remove_from_lru(gl);
>   	spin_unlock(&gl->gl_lockref.lock);
> -	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
>   	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
>   	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
>   	trace_gfs2_glock_put(gl);
> @@ -669,6 +738,36 @@ static void glock_work_func(struct work_struct *work)
>   	spin_unlock(&gl->gl_lockref.lock);
>   }
>   
> +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name,
> +					    struct gfs2_glock *new)
> +{
> +	struct wait_glock_queue wait;
> +	wait_queue_head_t *wq;
> +	struct gfs2_glock *gl;
> +
> +again:
> +	prepare_to_wait_on_glock(&wq, &wait, name);
> +	rcu_read_lock();
> +	if (new) {
> +		gl = rhashtable_lookup_get_insert_fast(&gl_hash_table,
> +			&new->gl_node, ht_parms);
> +		if (IS_ERR(gl))
> +			goto out;
> +	} else {
> +		gl = rhashtable_lookup_fast(&gl_hash_table,
> +			name, ht_parms);
> +	}
> +	if (gl && !lockref_get_not_dead(&gl->gl_lockref)) {
> +		rcu_read_unlock();
> +		schedule();
> +		goto again;
> +	}
> +out:
> +	rcu_read_unlock();
> +	finish_wait_on_glock(wq, &wait);
> +	return gl;
> +}
> +
>   /**
>    * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
>    * @sdp: The GFS2 superblock
> @@ -695,15 +794,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>   	struct kmem_cache *cachep;
>   	int ret = 0;
>   
> -	rcu_read_lock();
> -	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
> -	if (gl && !lockref_get_not_dead(&gl->gl_lockref))
> -		gl = NULL;
> -	rcu_read_unlock();
> -
> -	*glp = gl;
> -	if (gl)
> +	gl = find_insert_glock(&name, NULL);
> +	if (gl) {
> +		*glp = gl;
>   		return 0;
> +	}
>   	if (!create)
>   		return -ENOENT;
>   
> @@ -757,10 +852,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>   		mapping->writeback_index = 0;
>   	}
>   
> -again:
> -	rcu_read_lock();
> -	tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
> -						ht_parms);
> +	tmp = find_insert_glock(&name, gl);
>   	if (!tmp) {
>   		*glp = gl;
>   		goto out;
> @@ -769,13 +861,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>   		ret = PTR_ERR(tmp);
>   		goto out_free;
>   	}
> -	if (lockref_get_not_dead(&tmp->gl_lockref)) {
> -		*glp = tmp;
> -		goto out_free;
> -	}
> -	rcu_read_unlock();
> -	cond_resched();
> -	goto again;
> +	*glp = tmp;
>   
>   out_free:
>   	kfree(gl->gl_lksb.sb_lvbptr);
> @@ -1796,7 +1882,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
>   
>   int __init gfs2_glock_init(void)
>   {
> -	int ret;
> +	int i, ret;
>   
>   	ret = rhashtable_init(&gl_hash_table, &ht_parms);
>   	if (ret < 0)
> @@ -1825,6 +1911,9 @@ int __init gfs2_glock_init(void)
>   		return ret;
>   	}
>   
> +	for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
> +		init_waitqueue_head(glock_wait_table + i);
> +
>   	return 0;
>   }
>   
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index de42384..2ba9403 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -373,6 +373,7 @@ struct gfs2_glock {
>   			loff_t end;
>   		} gl_vm;
>   	};
> +	struct rcu_head gl_rcu;
>   	struct rhash_head gl_node;
>   };
>   




* [Cluster-devel] [PATCH v2 3/4] gfs2: gfs2_evict_inode: Put glocks asynchronously
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 3/4] gfs2: gfs2_evict_inode: Put glocks asynchronously Andreas Gruenbacher
@ 2017-07-04 12:16   ` Steven Whitehouse
  0 siblings, 0 replies; 12+ messages in thread
From: Steven Whitehouse @ 2017-07-04 12:16 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 03/07/17 15:56, Andreas Gruenbacher wrote:
> gfs2_evict_inode is called to free inodes under memory pressure.  The
> function calls into DLM when an inode's last cluster-wide reference goes
> away (remote unlink) and to release the glock and associated DLM lock
> before finally destroying the inode.  However, if DLM is blocked
> waiting for memory to become available, calling into DLM again will
> deadlock.
>
> Avoid that by decoupling releasing glocks from destroying inodes in that
> case: with gfs2_glock_queue_put, glocks will be dequeued asynchronously
> in work queue context, when the associated inodes have likely already
> been destroyed.
>
> With this change, inodes can end up being unlinked, remote-unlink can be
> triggered, and then the inode can be reallocated before all
> remote-unlink callbacks are processed.  To detect that, revalidate the
> link count in gfs2_evict_inode to make sure we're not deleting an
> allocated, referenced inode.
Do we need to preserve ordering between releasing the inode and iopen 
glocks, I wonder? Not sure that is guaranteed by the workqueue. Hopefully 
we don't need to worry about that, since it looks good otherwise,

Steve.

>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>   fs/gfs2/glock.c | 10 +++++++++-
>   fs/gfs2/glock.h |  2 ++
>   fs/gfs2/super.c | 20 ++++++++++++++++++--
>   3 files changed, 29 insertions(+), 3 deletions(-)
>
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 8a7944b..0236ef5 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -170,7 +170,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
>    *
>    */
>   
> -static void gfs2_glock_hold(struct gfs2_glock *gl)
> +void gfs2_glock_hold(struct gfs2_glock *gl)
>   {
>   	GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
>   	lockref_get(&gl->gl_lockref);
> @@ -260,6 +260,14 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
>   	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
>   }
>   
> +/*
> + * Cause the glock to be put in work queue context.
> + */
> +void gfs2_glock_queue_put(struct gfs2_glock *gl)
> +{
> +	gfs2_glock_queue_work(gl, 0);
> +}
> +
>   /**
>    * gfs2_glock_put() - Decrement reference count on glock
>    * @gl: The glock to put
> diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
> index 9ad4a6a..33e0511 100644
> --- a/fs/gfs2/glock.h
> +++ b/fs/gfs2/glock.h
> @@ -181,7 +181,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
>   extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>   			  const struct gfs2_glock_operations *glops,
>   			  int create, struct gfs2_glock **glp);
> +extern void gfs2_glock_hold(struct gfs2_glock *gl);
>   extern void gfs2_glock_put(struct gfs2_glock *gl);
> +extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
>   extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
>   			     u16 flags, struct gfs2_holder *gh);
>   extern void gfs2_holder_reinit(unsigned int state, u16 flags,
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index fdedec3..d53364e 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1562,6 +1562,12 @@ static void gfs2_evict_inode(struct inode *inode)
>   			goto out_truncate;
>   	}
>   
> +	/*
> +	 * The inode may have been recreated in the meantime.
> +	 */
> +	if (inode->i_nlink)
> +		goto out_truncate;
> +
>   alloc_failed:
>   	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
>   	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
> @@ -1643,12 +1649,22 @@ static void gfs2_evict_inode(struct inode *inode)
>   	glock_set_object(ip->i_gl, NULL);
>   	wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
>   	gfs2_glock_add_to_lru(ip->i_gl);
> -	gfs2_glock_put(ip->i_gl);
> +	if (current->flags & PF_MEMALLOC)
> +		gfs2_glock_queue_put(ip->i_gl);
> +	else
> +		gfs2_glock_put(ip->i_gl);
>   	ip->i_gl = NULL;
>   	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
> -		glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
> +		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
> +
> +		glock_set_object(gl, NULL);
>   		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
> +		gfs2_glock_hold(gl);
>   		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
> +		if (current->flags & PF_MEMALLOC)
> +			gfs2_glock_queue_put(gl);
> +		else
> +			gfs2_glock_put(gl);
>   	}
>   }
>   




* [Cluster-devel] [PATCH v2 4/4] gfs2: Defer deleting inodes under memory pressure
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 4/4] gfs2: Defer deleting inodes under memory pressure Andreas Gruenbacher
@ 2017-07-04 12:17   ` Steven Whitehouse
  0 siblings, 0 replies; 12+ messages in thread
From: Steven Whitehouse @ 2017-07-04 12:17 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 03/07/17 15:56, Andreas Gruenbacher wrote:
> When under memory pressure and an inode's link count has dropped to
> zero, defer deleting the inode to the delete workqueue.  This avoids
> calling into DLM under memory pressure, which can deadlock.
Yes, that looks like a good solution to me,

Steve.

>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>   fs/gfs2/super.c | 19 +++++++++++++++++++
>   1 file changed, 19 insertions(+)
>
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index d53364e..f54af47 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1317,6 +1317,21 @@ static int gfs2_drop_inode(struct inode *inode)
>   		if (test_bit(GLF_DEMOTE, &gl->gl_flags))
>   			clear_nlink(inode);
>   	}
> +
> +	/*
> +	 * When under memory pressure and an inode's link count has dropped to
> +	 * zero, defer deleting the inode to the delete workqueue.  This avoids
> +	 * calling into DLM under memory pressure, which can deadlock.
> +	 */
> +	if (!inode->i_nlink &&
> +	    unlikely(current->flags & PF_MEMALLOC) &&
> +	    gfs2_holder_initialized(&ip->i_iopen_gh)) {
> +		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
> +
> +		queue_work(gfs2_delete_workqueue, &gl->gl_delete);
> +		return false;
> +	}
> +
>   	return generic_drop_inode(inode);
>   }
>   
> @@ -1544,6 +1559,10 @@ static void gfs2_evict_inode(struct inode *inode)
>   		goto alloc_failed;
>   	}
>   
> +	/* Deletes should never happen under memory pressure anymore.  */
> +	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
> +		goto out;
> +
>   	/* Must not read inode block until block type has been verified */
>   	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
>   	if (unlikely(error)) {




* [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock
  2017-07-04 12:11 ` [Cluster-devel] [PATCH v2 0/4] GFS2 shrinker deadlock Steven Whitehouse
@ 2017-07-05  9:57   ` Andreas Gruenbacher
  0 siblings, 0 replies; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-05  9:57 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Tue, Jul 4, 2017 at 2:11 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
> Hi,
>
>
> On 03/07/17 15:56, Andreas Gruenbacher wrote:
>>
>> These are the remaining patches for fixing a cluster-wide GFS2 and DLM
>> deadlock.
>>
>> As explained in the previous posting of this patch queue, when inodes
>> are evicted, GFS2 currently calls into DLM.  Inode eviction can be
>> triggered by memory pressure, in the context of a random user-space
>> process.  If DLM happens to block in the process in question (for
>> example, if that process is a fence agent), GFS2 and DLM will deadlock.
>>
>> This patch queue stops GFS2 from calling into DLM on the inode evict
>> path when under memory pressure.  It does so by first decoupling
>> destroying inodes from putting their associated glocks, which is what
>> ends up calling into DLM.  Second, when under memory pressure, it moves
>> putting glocks into work queue context where it cannot block DLM.
>> Third, when gfs2_drop_inode determines that an inode's link count has
>> hit zero under memory pressure, it puts that inode on the delete
>> workqueue (and keeps the inode in the icache) instead of causing
>> gfs2_evict_inode to delete the inode immediately.  The delete workqueue
>> will not be processed under memory pressure, so deleting inodes from
>> there is safe.
>
> Does this mean that all the corner cases are now covered and that this is
> now passing all the tests?

It did look like that for a while, but unfortunately we did get
directory corruption again ("Number of entries corrupt in dir"). All
signs are pointing at patch "Put glocks asynchronously" at this point;
I'm trying to figure out how to analyze this further.

Andreas




* [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks
  2017-07-03 14:56 ` [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks Andreas Gruenbacher
  2017-07-04 12:13   ` Steven Whitehouse
@ 2017-07-05 14:22   ` Andreas Gruenbacher
  2017-07-05 14:28     ` Steven Whitehouse
  1 sibling, 1 reply; 12+ messages in thread
From: Andreas Gruenbacher @ 2017-07-05 14:22 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Tue, Jul 4, 2017 at 2:13 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
> On 03/07/17 15:56, Andreas Gruenbacher wrote:
>>
>> Keep glocks in their hash table until they are freed instead of removing
>> them when their last reference is dropped.  This allows us to wait for
>> glocks to go away before recreating them in gfs2_glock_get so that the
>> previous use of the glock won't overlap with the next.
>
> I did a double take when I looked at this and saw it reintroducing gl_rcu
> and I wondered where that had gone in the mean time. Is that a bug fix that
> is included alongside this new feature? At least if it is not required
> anyway, then I can't quite figure out why - maybe something hidden in the
> rhashtable code perhaps?

Hmm, now that you mention it, I can't actually see how the rhashtable
code can be safe unless entries are freed in an RCU-safe way: otherwise,
processes reading the rhashtable can end up accessing random memory while
walking hash chains.  In this case that's rather unlikely, but still possible.

How do you like the following fix?

Thanks,
Andreas

--

gfs2: Fix freeing glock rhashtable entries

The glock rhashtable is accessed using RCU locking, so entries in the
hash table must only be freed in an RCU-safe way.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/glock.c  | 11 +++++++++--
 fs/gfs2/incore.h |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6cd71c5..c38ab6c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = {
 
 static struct rhashtable gl_hash_table;
 
-void gfs2_glock_free(struct gfs2_glock *gl)
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
 {
-	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
 
 	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
 		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl)
 		kfree(gl->gl_lksb.sb_lvbptr);
 		kmem_cache_free(gfs2_glock_cachep, gl);
 	}
+}
+
+void gfs2_glock_free(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
 	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
 		wake_up(&sdp->sd_glock_wait);
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index de42384..7500566 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -374,6 +374,7 @@ struct gfs2_glock {
 		} gl_vm;
 	};
 	struct rhash_head gl_node;
+	struct rcu_head gl_rcu;
 };
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
-- 
2.7.5



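The reader side that makes the call_rcu() necessary is the lockless lookup
that patch 1 replaces (quoted from the removed lines earlier in this
thread):

	rcu_read_lock();
	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
	if (gl && !lockref_get_not_dead(&gl->gl_lockref))
		gl = NULL;
	rcu_read_unlock();

rhashtable walks its hash chains under nothing more than rcu_read_lock(),
so an entry freed at the moment of removal could still be dereferenced by
a concurrent walker; call_rcu() defers the actual free until every such
read-side critical section has ended.  A plain kfree_rcu() would not do
here because the glock comes from one of two kmem caches and sb_lvbptr
needs freeing as well, hence the gfs2_glock_dealloc callback.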

* [Cluster-devel] [PATCH v2 1/4] gfs2: gfs2_glock_get: Wait on freeing glocks
  2017-07-05 14:22   ` Andreas Gruenbacher
@ 2017-07-05 14:28     ` Steven Whitehouse
  0 siblings, 0 replies; 12+ messages in thread
From: Steven Whitehouse @ 2017-07-05 14:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 05/07/17 15:22, Andreas Gruenbacher wrote:
> On Tue, Jul 4, 2017 at 2:13 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
>> On 03/07/17 15:56, Andreas Gruenbacher wrote:
>>> Keep glocks in their hash table until they are freed instead of removing
>>> them when their last reference is dropped.  This allows us to wait for
>>> glocks to go away before recreating them in gfs2_glock_get so that the
>>> previous use of the glock won't overlap with the next.
>> I did a double take when I looked at this and saw it reintroducing gl_rcu
>> and I wondered where that had gone in the mean time. Is that a bug fix that
>> is included alongside this new feature? At least if it is not required
>> anyway, then I can't quite figure out why - maybe something hidden in the
>> rhashtable code perhaps?
> Hmm, now that you're mentioning it, I can't actually see how the rhashtable
> code can be safe unless entries are freed in an RCU-safe way: otherwise,
> processes reading the rhashtable can end up accessing random memory while
> walking hash chains.  In this case that's rather unlikely, but still possible.
>
> How do you like the following fix?
>
> Thanks,
> Andreas
Yes, that was what I was wondering - that needs to go to -stable too, I 
think,

Steve.

> --
>
> gfs2: Fix freeing glock rhashtable entries
>
> The glock rhashtable is accessed using RCU locking, so entries in the
> hash table must only be freed in an RCU-safe way.
>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>   fs/gfs2/glock.c  | 11 +++++++++--
>   fs/gfs2/incore.h |  1 +
>   2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 6cd71c5..c38ab6c 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = {
>   
>   static struct rhashtable gl_hash_table;
>   
> -void gfs2_glock_free(struct gfs2_glock *gl)
> +static void gfs2_glock_dealloc(struct rcu_head *rcu)
>   {
> -	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
>   
>   	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
>   		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
> @@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl)
>   		kfree(gl->gl_lksb.sb_lvbptr);
>   		kmem_cache_free(gfs2_glock_cachep, gl);
>   	}
> +}
> +
> +void gfs2_glock_free(struct gfs2_glock *gl)
> +{
> +	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +
> +	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
>   	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
>   		wake_up(&sdp->sd_glock_wait);
>   }
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index de42384..7500566 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -374,6 +374,7 @@ struct gfs2_glock {
>   		} gl_vm;
>   	};
>   	struct rhash_head gl_node;
> +	struct rcu_head gl_rcu;
>   };
>   
>   #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */



