* [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention
       [not found] <1321492262.27823235.1498846645921.JavaMail.zimbra@redhat.com>
@ 2017-06-30 18:18 ` Bob Peterson
  2017-07-03  9:21   ` Steven Whitehouse
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Peterson @ 2017-06-30 18:18 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

This patch changes GFS2's LRU list from a single linked list with
a much-contended spin_lock to a hash table of linked lists, thus
having more locks with less contention. The key for the table is
set round-robin to evenly distribute the glocks into the buckets.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6cd71c5..e1e16c4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,10 +63,32 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
+static atomic_t lru_bucket = ATOMIC_INIT(0);
+
 struct workqueue_struct *gfs2_delete_workqueue;
-static LIST_HEAD(lru_list);
+
+#define LRU_HASH_BITS 12
+#define LRU_HASH_BUCKETS BIT(LRU_HASH_BITS)
+#define LRU_HASH_MASK (LRU_HASH_BUCKETS - 1)
+
+struct gfs2_lru_bucket {
+	struct list_head lru_list;
+	rwlock_t lru_lock;
+};
+
+static struct gfs2_lru_bucket lru_locks[LRU_HASH_BUCKETS];
+
+static inline void lock_lru_bucket(unsigned bucket)
+{
+	write_lock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
+}
+
+static inline void unlock_lru_bucket(unsigned bucket)
+{
+	write_unlock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
+}
+
 static atomic_t lru_count = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       BIT(GFS2_GL_HASH_SHIFT)
@@ -126,30 +148,36 @@ static int demote_ok(const struct gfs2_glock *gl)
 	return 1;
 }
 
-
-void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+static void remove_from_lru_locked(struct gfs2_glock *gl)
 {
-	spin_lock(&lru_lock);
-
-	if (!list_empty(&gl->gl_lru))
+	if (!list_empty(&gl->gl_lru)) {
 		list_del_init(&gl->gl_lru);
-	else
-		atomic_inc(&lru_count);
+		atomic_dec(&lru_count);
+	}
+}
+
+static void glock_remove_from_lru(struct gfs2_glock *gl)
+{
+	if (!test_and_clear_bit(GLF_LRU, &gl->gl_flags))
+		return;
 
-	list_add_tail(&gl->gl_lru, &lru_list);
-	set_bit(GLF_LRU, &gl->gl_flags);
-	spin_unlock(&lru_lock);
+	lock_lru_bucket(gl->gl_lru_bucket);
+	remove_from_lru_locked(gl);
+	unlock_lru_bucket(gl->gl_lru_bucket);
 }
 
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
 {
-	spin_lock(&lru_lock);
-	if (!list_empty(&gl->gl_lru)) {
-		list_del_init(&gl->gl_lru);
-		atomic_dec(&lru_count);
-		clear_bit(GLF_LRU, &gl->gl_flags);
-	}
-	spin_unlock(&lru_lock);
+	glock_remove_from_lru(gl);
+	if (test_and_set_bit(GLF_LRU, &gl->gl_flags))
+		return;
+
+	lock_lru_bucket(gl->gl_lru_bucket);
+	BUG_ON(!list_empty(&gl->gl_lru));
+	atomic_inc(&lru_count);
+	list_add_tail(&gl->gl_lru,
+		      &lru_locks[gl->gl_lru_bucket & LRU_HASH_MASK].lru_list);
+	unlock_lru_bucket(gl->gl_lru_bucket);
 }
 
 /*
@@ -182,7 +210,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
 
 	lockref_mark_dead(&gl->gl_lockref);
 
-	gfs2_glock_remove_from_lru(gl);
+	glock_remove_from_lru(gl);
 	spin_unlock(&gl->gl_lockref.lock);
 	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
 	GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
@@ -746,6 +774,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 	INIT_WORK(&gl->gl_delete, delete_work_func);
+	gl->gl_lru_bucket = atomic_inc_return(&lru_bucket);
+	if (gl->gl_lru_bucket >= LRU_HASH_BUCKETS) {
+		gl->gl_lru_bucket = 0;
+		atomic_set(&lru_bucket, gl->gl_lru_bucket);
+	}
 
 	mapping = gfs2_glock2aspace(gl);
 	if (mapping) {
@@ -1010,8 +1043,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
 		return -EIO;
 
-	if (test_bit(GLF_LRU, &gl->gl_flags))
-		gfs2_glock_remove_from_lru(gl);
+	glock_remove_from_lru(gl);
 
 	spin_lock(&gl->gl_lockref.lock);
 	add_to_queue(gh);
@@ -1343,24 +1375,20 @@ static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
 }
 
 /**
- * gfs2_dispose_glock_lru - Demote a list of glocks
+ * gfs2_dispose_glock_lru - Demote a list of glocks from the same LRU bucket
  * @list: The list to dispose of
  *
  * Disposing of glocks may involve disk accesses, so that here we sort
  * the glocks by number (i.e. disk location of the inodes) so that if
  * there are any such accesses, they'll be sent in order (mostly).
- *
- * Must be called under the lru_lock, but may drop and retake this
- * lock. While the lru_lock is dropped, entries may vanish from the
- * list, but no new entries will appear on the list (since it is
- * private)
  */
 
-static void gfs2_dispose_glock_lru(struct list_head *list)
-__releases(&lru_lock)
-__acquires(&lru_lock)
+static long gfs2_dispose_glock_lru(struct list_head *list)
 {
-	struct gfs2_glock *gl;
+	struct gfs2_glock *gl = NULL;
+	LIST_HEAD(rejects);
+	int reject_count = 0;
+	long disposed = 0;
 
 	list_sort(NULL, list, glock_cmp);
 
@@ -1369,23 +1397,30 @@ __acquires(&lru_lock)
 		list_del_init(&gl->gl_lru);
 		if (!spin_trylock(&gl->gl_lockref.lock)) {
 add_back_to_lru:
-			list_add(&gl->gl_lru, &lru_list);
-			atomic_inc(&lru_count);
+			list_add_tail(&gl->gl_lru, &rejects);
+			reject_count++;
 			continue;
 		}
 		if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			spin_unlock(&gl->gl_lockref.lock);
 			goto add_back_to_lru;
 		}
-		clear_bit(GLF_LRU, &gl->gl_flags);
+		disposed++;
 		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
 		__gfs2_glock_queue_work(gl, 0);
 		spin_unlock(&gl->gl_lockref.lock);
-		cond_resched_lock(&lru_lock);
 	}
+	if (!list_empty(&rejects)) {
+		gl = list_first_entry(&rejects, struct gfs2_glock, gl_lru);
+		lock_lru_bucket(gl->gl_lru_bucket);
+		list_splice(&rejects, &lru_locks[gl->gl_lru_bucket].lru_list);
+		atomic_add(reject_count, &lru_count);
+		unlock_lru_bucket(gl->gl_lru_bucket);
+	}
+	return disposed;
 }
 
 /**
@@ -1399,30 +1434,33 @@ __acquires(&lru_lock)
 
 static long gfs2_scan_glock_lru(int nr)
 {
-	struct gfs2_glock *gl;
-	LIST_HEAD(skipped);
+	struct gfs2_glock *gl, *tmp;
 	LIST_HEAD(dispose);
+	int start_bucket, bucket;
 	long freed = 0;
+	static int last_lru_scan = 0;
 
-	spin_lock(&lru_lock);
-	while ((nr-- >= 0) && !list_empty(&lru_list)) {
-		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
-
-		/* Test for being demotable */
-		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
-			list_move(&gl->gl_lru, &dispose);
-			atomic_dec(&lru_count);
-			freed++;
-			continue;
+	start_bucket = bucket = (last_lru_scan & LRU_HASH_MASK);
+	do {
+		lock_lru_bucket(bucket);
+		list_for_each_entry_safe(gl, tmp, &lru_locks[bucket].lru_list,
+					 gl_lru) {
+			/* Test for being demotable */
+			if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
+				if (test_and_clear_bit(GLF_LRU, &gl->gl_flags))
+					remove_from_lru_locked(gl);
+				list_add_tail(&gl->gl_lru, &dispose);
+				nr--;
+				if (nr == 0)
+					break;
+			}
 		}
-
-		list_move(&gl->gl_lru, &skipped);
-	}
-	list_splice(&skipped, &lru_list);
-	if (!list_empty(&dispose))
-		gfs2_dispose_glock_lru(&dispose);
-	spin_unlock(&lru_lock);
-
+		unlock_lru_bucket(bucket);
+		if (!list_empty(&dispose))
+			freed += gfs2_dispose_glock_lru(&dispose);
+		bucket = (bucket + 1) & LRU_HASH_MASK;
+	} while (nr && (bucket != start_bucket));
+	last_lru_scan = bucket;
 	return freed;
 }
 
@@ -1504,7 +1542,7 @@ static void thaw_glock(struct gfs2_glock *gl)
 
 static void clear_glock(struct gfs2_glock *gl)
 {
-	gfs2_glock_remove_from_lru(gl);
+	glock_remove_from_lru(gl);
 
 	spin_lock(&gl->gl_lockref.lock);
 	if (gl->gl_state != LM_ST_UNLOCKED)
@@ -1704,7 +1742,7 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld l:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
@@ -1713,7 +1751,8 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 		  state2str(gl->gl_demote_state), dtime,
 		  atomic_read(&gl->gl_ail_count),
 		  atomic_read(&gl->gl_revokes),
-		  (int)gl->gl_lockref.count, gl->gl_hold_time);
+		  (int)gl->gl_lockref.count, gl->gl_hold_time,
+		  gl->gl_lru_bucket);
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list)
 		dump_holder(seq, gh);
@@ -1797,6 +1836,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 int __init gfs2_glock_init(void)
 {
 	int ret;
+	unsigned i;
+
+	for (i = 0; i < LRU_HASH_BUCKETS; i++) {
+		INIT_LIST_HEAD(&lru_locks[i].lru_list);
+		rwlock_init(&lru_locks[i].lru_lock);
+	}
 
 	ret = rhashtable_init(&gl_hash_table, &ht_parms);
 	if (ret < 0)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 2ad003b..e8c0f2b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -374,6 +374,7 @@ struct gfs2_glock {
 		} gl_vm;
 	};
 	struct rhash_head gl_node;
+	int gl_lru_bucket;
 };
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */




* [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention
  2017-06-30 18:18 ` [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention Bob Peterson
@ 2017-07-03  9:21   ` Steven Whitehouse
  2017-07-03 20:31     ` Bob Peterson
  0 siblings, 1 reply; 4+ messages in thread
From: Steven Whitehouse @ 2017-07-03  9:21 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 30/06/17 19:18, Bob Peterson wrote:
> Hi,
>
> This patch changes GFS2's LRU list from a single linked list with
> a much-contended spin_lock to a hash table of linked lists, thus
> having more locks with less contention. The key for the table is
> set round-robin to evenly distribute the glocks into the buckets.
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
This is a bit confused I think. There needs to be some more work done 
here to figure out exactly what the problem is, but I'm not very keen on 
this solution for a number of reasons...

  - It still claims to be LRU, but in fact it isn't any more
  - The list might have been broken up, but the counter is still global, 
so that there is still the possibility for contention on that
  - There is no performance data to say whether or not there is any 
improvement or not
  - There should not in general be huge numbers of LRU operations on 
glocks anyway, so why is the LRU list contended in the first place?
  - It introduces a new global counter at glock creation time, so 
something else that might land up being a contention point

The important question is what causes the contention on the LRU list in 
the first place - ideally we'd look at that issue in order to avoid so 
many list operations, rather than making it very complicated and non-LRU 
- at the very least there should be a justification for the new algorithm,

Steve.



* [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention
  2017-07-03  9:21   ` Steven Whitehouse
@ 2017-07-03 20:31     ` Bob Peterson
  2017-07-04  8:55       ` Steven Whitehouse
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Peterson @ 2017-07-03 20:31 UTC (permalink / raw)
  To: cluster-devel.redhat.com

----- Original Message -----
| Hi,
| 
| 
| On 30/06/17 19:18, Bob Peterson wrote:
| > Hi,
| >
| > This patch changes GFS2's LRU list from a single linked list with
| > a much-contended spin_lock to a hash table of linked lists, thus
| > having more locks with less contention. The key for the table is
| > set round-robin to evenly distribute the glocks into the buckets.
| >
| > Signed-off-by: Bob Peterson <rpeterso@redhat.com>
| This is a bit confused I think. There needs to be some more work done
| here to figure out exactly what the problem is, but I'm not very keen on
| this solution for a number of reasons...
| 
|   - It still claims to be LRU, but in fact it isn't any more
|   - The list might have been broken up, but the counter is still global,
| so that there is still the possibility for contention on that
|   - There is no performance data to say whether or not there is any
| improvement or not
|   - There should not in general be huge numbers of LRU operations on
| glocks anyway, so why is the LRU list contended in the first place?
|   - It introduces a new global counter at glock creation time, so
| something else that might land up being a contention point
| 
| The important question is what causes the contention on the LRU list in
| the first place - ideally we'd look at that issue in order to avoid so
| many list operations, rather than making it very complicated and non-LRU
| - at the very least there should be a justification for the new algorithm,
| 
| Steve.

Hi,

Let me see if I can address some of the issues you pointed out.

| There needs to be some more work done
| here to figure out exactly what the problem is

The problem, as I see it, is contention on the lru_lock, which is a single
point of contention between all GFS2 glock users as documented in this
bug: https://bugzilla.redhat.com/show_bug.cgi?id=1456886.

Before I posted the patch, I asked our performance group to run specsfs
on an instrumented kernel. The instrumented kernel basically tells me
how much total time was being spent in each 30-second interval holding the
lru_lock, and how much time was spent waiting for it.
GFS2 only locks the lru_lock in a handful of places, which I numbered for the
instrumentation; each column in the output below corresponds to one of them.
The main ones are:

(1) gfs2_glock_add_to_lru
(2) gfs2_glock_remove_from_lru
(3) gfs2_glock_put
(4) gfs2_scan_glock_lru

The goal was to determine if the lru scanning code was holding the lru
lock for long periods of time. I examined the results in detail for the
duration of the specsfs run. Here is a very small excerpt of the output
that shows one of the anomalies:

Jun 14 15:21:34 dellfsvr1 kernel: held: 0 23 22 0 0   wait: 0 191 187 0 0
Jun 14 15:21:35 dellfsvr1 kernel: held: 0 110 115 0 0   wait: 0 563 523 1 0
Jun 14 15:21:50 dellfsvr1 kernel: held: 0 1541 1686 7 0   wait: 0 10596 10478 30 0
Jun 14 15:21:50 dellfsvr1 kernel: held: 0 67 74 1 0   wait: 0 412 453 0 0
Jun 14 15:21:51 dellfsvr1 kernel: held: 0 49 68 0 0   wait: 0 232 248 0 0

As in this excerpt, the "held" times imply that no function is holding
the lru_lock for very long. In this excerpt, the longest time the lru lock
was held was 1686 milliseconds from remove_from_lru, followed closely by
add_to_lru. (In other words, the total time spent in add_to_lru was 1541 ms
during that 30 second reporting period.) Functions glock_put and
scan_glock_lru were so small they're barely a blip.

The "wait" times show that during a 30 second period, add_to_lru waited
a total of 10596 milliseconds, and remove_from_lru waited a total of 10478
milliseconds.
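
The instrumentation was conceptually along these lines (a rough sketch only;
the helper names, the per-call-site arrays and the five-slot sizing are made
up for illustration, and the real instrumented kernel is not included in this
post):

#include <linux/ktime.h>
#include <linux/atomic.h>
#include <linux/spinlock.h>

#define NR_LRU_SITES 5			/* one slot per instrumented call site */

static atomic64_t lru_wait_us[NR_LRU_SITES];	/* time spent spinning for the lock */
static atomic64_t lru_held_us[NR_LRU_SITES];	/* time spent holding the lock */
static ktime_t lru_acquired;		/* only valid while the lock is held */

static inline void timed_spin_lock(spinlock_t *lock, int site)
{
	ktime_t start = ktime_get();

	spin_lock(lock);
	atomic64_add(ktime_us_delta(ktime_get(), start), &lru_wait_us[site]);
	lru_acquired = ktime_get();
}

static inline void timed_spin_unlock(spinlock_t *lock, int site)
{
	atomic64_add(ktime_us_delta(ktime_get(), lru_acquired), &lru_held_us[site]);
	spin_unlock(lock);
}

/* a periodic worker then prints and zeroes both arrays every 30 seconds */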

As an experiment, I tried disabling interrupts while the lru lock was
held. The result: Absolutely no change in performance, so the problem
is apparently not made worse by context switches.

My conclusion was that nobody is holding the lru lock for long; the
contention is just based on the sheer number of calls to lock it.

That's why I broke the lock into a hash table to split it up.

I'm going to ask the perf team to try a patched version with specsfs
because I can't seem to find the results of the last test.

(1) It still claims to be LRU, but in fact it isn't any more.

This is true. I don't know how much of a problem that is, but we
might be able to rearrange things to make it more like one.

(2) The list might have been broken up, but the counter is still global.

The counter is just an atomic, so it seems unlikely to me to contend
enough to make a difference. Worst case, it would be no worse than what
we've got now. Hopefully the performance numbers will reflect it.

(3) There is no performance data to say whether or not there is any improvement or not.

Again, I'll try to get those numbers.

(4) There should not in general be huge numbers of LRU operations on
glocks anyway, so why is the LRU list contended in the first place?

The glocks are being added to and taken off the lru list a lot.
If there aren't huge numbers of lru operations, there is no need for
bz#1456886. The "why is it contended in the first place" is explained
above: It's the sheer number of processes hitting that lock at any
given time.

(5) It introduces a new global counter at glock creation time, so
something else that might land up being a contention point.

Point taken. The performance numbers should hopefully reflect whether
that's true.

Stay tuned and I'll see if I can get some performance statistics.

Regards,

Bob Peterson
Red Hat File Systems




* [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention
  2017-07-03 20:31     ` Bob Peterson
@ 2017-07-04  8:55       ` Steven Whitehouse
  0 siblings, 0 replies; 4+ messages in thread
From: Steven Whitehouse @ 2017-07-04  8:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,


On 03/07/17 21:31, Bob Peterson wrote:
> ----- Original Message -----
> | Hi,
> |
> |
> | On 30/06/17 19:18, Bob Peterson wrote:
> | > Hi,
> | >
> | > This patch changes GFS2's LRU list from a single linked list with
> | > a much-contended spin_lock to a hash table of linked lists, thus
> | > having more locks with less contention. The key for the table is
> | > set round-robin to evenly distribute the glocks into the buckets.
> | >
> | > Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> | This is a bit confused I think. There needs to be some more work done
> | here to figure out exactly what the problem is, but I'm not very keen on
> | this solution for a number of reasons...
> |
> |   - It still claims to be LRU, but in fact it isn't any more
> |   - The list might have been broken up, but the counter is still global,
> | so that there is still the possibility for contention on that
> |   - There is no performance data to say whether or not there is any
> | improvement or not
> |   - There should not in general be huge numbers of LRU operations on
> | glocks anyway, so why is the LRU list contended in the first place?
> |   - It introduces a new global counter at glock creation time, so
> | something else that might land up being a contention point
> |
> | The important question is what causes the contention on the LRU list in
> | the first place - ideally we'd look at that issue in order to avoid so
> | many list operations, rather than making it very complicated and non-LRU
> | - at the very least there should be a justification for the new algorithm,
> |
> | Steve.
>
> Hi,
>
> Let me see if I can address some of the issues you pointed out.
>
> | There needs to be some more work done
> | here to figure out exactly what the problem is
>
> The problem, as I see it, is contention on the lru_lock, which is a single
> point of contention between all GFS2 glock users as documented in this
> bug: https://bugzilla.redhat.com/show_bug.cgi?id=1456886.
>
> Before I posted the patch, I asked our performance group to run specsfs
> on an instrumented kernel. The instrumented kernel basically tells me
> how much total time was being spent in each 30-second interval holding the
> lru_lock, and how much time was spent waiting for it.
> GFS2 only locks the lru_lock in a handful of places, which I numbered for the
> instrumentation; each column in the output below corresponds to one of them.
> The main ones are:
>
> (1) gfs2_glock_add_to_lru
> (2) gfs2_glock_remove_from_lru
> (3) gfs2_glock_put
> (4) gfs2_scan_glock_lru
>
> The goal was to determine if the lru scanning code was holding the lru
> lock for long periods of time. I examined the results in detail for the
> duration of the specsfs run. Here is a very small excerpt of the output
> that shows one of the anomalies:
>
> Jun 14 15:21:34 dellfsvr1 kernel: held: 0 23 22 0 0   wait: 0 191 187 0 0
> Jun 14 15:21:35 dellfsvr1 kernel: held: 0 110 115 0 0   wait: 0 563 523 1 0
> Jun 14 15:21:50 dellfsvr1 kernel: held: 0 1541 1686 7 0   wait: 0 10596 10478 30 0
> Jun 14 15:21:50 dellfsvr1 kernel: held: 0 67 74 1 0   wait: 0 412 453 0 0
> Jun 14 15:21:51 dellfsvr1 kernel: held: 0 49 68 0 0   wait: 0 232 248 0 0
>
> As in this excerpt, the "held" times imply that no function is holding
> the lru_lock for very long. In this excerpt, the longest time the lru lock
> was held was 1686 milliseconds from remove_from_lru, followed closely by
That is a very long time for the lock to be held, assuming that is a 
single call to the lock. That implies that something is wrong, since 
that function does very little. If that includes the wait time for the 
lock, then it is just an artifact of the contention and doesn't by 
itself tell us anything that we didn't already know.

> add_to_lru. (In other words, the total time spent in add_to_lru was 1541 ms
> during that 30 second reporting period.) Functions glock_put and
> scan_glock_lru were so small they're barely a blip.
>
> The "wait" times show that during a 30 second period, add_to_lru waited
> a total of 10596 milliseconds, and remove_from_lru waited a total of 10478
> milliseconds.
>
> As an experiment, I tried disabling interrupts while the lru lock was
> held. The result: Absolutely no change in performance, so the problem
> is apparently not made worse by context switches.
I'm not sure I understand; there would never be any context switches 
while spinlocks are held anyway.

>
> My conclusion was that nobody is holding the lru lock for long; the
> contention is just based on the sheer number of calls to lock it.
>
> That's why I broke the lock into a hash table to split it up.
>
> I'm going to ask the perf team to try a patched version with specsfs
> because I can't seem to find the results of the last test.
>
> (1) It still claims to be LRU, but in fact it isn't any more.
>
> This is true. I don't know how much of a problem that is, but we
> might be able to rearrange things to make it more like one.
The new algorithm is more or less selecting random glocks to demote. We 
know from previous experience that this doesn't work well, so I'd rather 
not substitute one bug for a different one, since we are no better off.

>
> (2) The list might have been broken up, but the counter is still global.
>
> The counter is just an atomic, so it seems unlikely to me to contend
> enough to make a difference. Worst case, it would be no worse than what
> we've got now. Hopefully the performance numbers will reflect it.
Whether it is atomic or not has nothing to do with whether it will 
contend. The issue is not the operation itself but whether it is likely 
to be sharing cache lines with other threads on different cpus. So at 
the very least it should have been made a per_cpu counter, assuming that 
this really is the right solution.
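
As a rough illustration only (none of this is in the patch, and the helper
names are made up), lru_count as a percpu_counter would look something like:

#include <linux/percpu_counter.h>
#include <linux/dcache.h>		/* vfs_pressure_ratio() */
#include <linux/gfp.h>

static struct percpu_counter lru_count;

static int lru_count_init(void)		/* from gfs2_glock_init() */
{
	return percpu_counter_init(&lru_count, 0, GFP_KERNEL);
}

static void lru_count_inc(void)		/* glock added to the LRU */
{
	/* touches only a per-CPU batch, so no shared cache line bouncing */
	percpu_counter_inc(&lru_count);
}

static void lru_count_dec(void)		/* glock removed from the LRU */
{
	percpu_counter_dec(&lru_count);
}

static unsigned long lru_shrink_count(void)	/* for the glock shrinker */
{
	/* an approximate global value is good enough here */
	return vfs_pressure_ratio(percpu_counter_read_positive(&lru_count));
}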

>
> (3) There is no performance data to say whether or not there is any improvement or not.
>
> Again, I'll try to get those numbers.
>
> (4) There should not in general be huge numbers of LRU operations on
> glocks anyway, so why is the LRU list contended in the first place?
>
> The glocks are being added to and taken off the lru list a lot.
Yes, and we need to get to the bottom of why that is happening. This is 
the missing piece of the puzzle. What glock types are implicated in 
this? What operations are causing this to happen? Can we avoid doing 
that in the first place?

> If there aren't huge numbers of lru operations, there is no need for
> bz#1456886. The "why is it contended in the first place" is explained
> above: It's the sheer number of processes hitting that lock at any
> given time.
>
> (5) It introduces a new global counter at glock creation time, so
> something else that might land up being a contention point.
>
> Point taken. The performance numbers should hopefully reflect whether
> that's true.
>
> Stay tuned and I'll see if I can get some performance statistics.
>
> Regards,
>
> Bob Peterson
> Red Hat File Systems

There is already a generic NUMA aware LRU (or nearly LRU) implementation 
which we are using. It would be better to simply make the changes to the 
glock shrinker so that it can use the NUMA aware implementation, as per 
the quota shrinker. I seem to remember there was an outstanding issue 
which meant that this was not so easy to do, and I don't remember what 
it was off the top of my head. At least if we are going to break up that 
list then let's make sure that we do it in a way that makes sense,
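
For reference, the list_lru interface that the quota shrinker uses has roughly
this shape (an API sketch only, not a worked-out glock conversion; the names
and the dispose handling below are made up):

#include <linux/list_lru.h>
#include <linux/shrinker.h>
#include <linux/list.h>

static struct list_lru gfs2_glock_lru;	/* NUMA-aware: one list per node */

static enum lru_status glock_lru_isolate(struct list_head *item,
					 struct list_lru_one *lru,
					 spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;

	/* a real conversion would test GLF_LOCK here, as the current scan does */
	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}

static unsigned long glock_lru_scan(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	unsigned long freed;

	freed = list_lru_shrink_walk(&gfs2_glock_lru, sc,
				     glock_lru_isolate, &dispose);
	/* ... demote the glocks gathered on the dispose list ... */
	return freed;
}

(list_lru_init() on the lru, and registering glock_lru_scan as the shrinker's
scan_objects callback, are omitted.)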

Steve.



