linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Waiman Long <Waiman.Long@hpe.com>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Jan Kara <jack@suse.com>, Jeff Layton <jlayton@poochiereds.net>,
	"J. Bruce Fields" <bfields@fieldses.org>,
	Tejun Heo <tj@kernel.org>,
	Christoph Lameter <cl@linux-foundation.org>
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andi Kleen <andi@firstfloor.org>,
	Dave Chinner <dchinner@redhat.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	Scott J Norton <scott.norton@hpe.com>,
	Douglas Hatch <doug.hatch@hpe.com>,
	Waiman Long <Waiman.Long@hpe.com>
Subject: [PATCH v2 1/7] lib/dlock-list: Distributed and lock-protected lists
Date: Mon, 11 Jul 2016 13:32:06 -0400	[thread overview]
Message-ID: <1468258332-61537-2-git-send-email-Waiman.Long@hpe.com> (raw)
In-Reply-To: <1468258332-61537-1-git-send-email-Waiman.Long@hpe.com>

Linked list is used everywhere in the Linux kernel. However, if many
threads are trying to add or delete entries into the same linked list,
it can create a performance bottleneck.

This patch introduces a new list APIs that provide a set of distributed
lists (one per CPU), each of which is protected by its own spinlock.
To the callers, however, the set of lists acts like a single
consolidated list.  This allows list entries insertion and deletion
operations to happen in parallel instead of being serialized with a
global list and lock.

List entry insertion is strictly per cpu. List deletion, however, can
happen in a cpu other than the one that did the insertion. So we still
need lock to protect the list. Because of that, there may still be
a small amount of contention when deletion is being done.

A new header file include/linux/dlock-list.h will be added with the
associated dlock_list_head and dlock_list_node structures. The following
functions are provided to manage the per-cpu list:

 1. int init_dlock_list_head(struct dlock_list_head **pdlock_head)
 2. void dlock_list_add(struct dlock_list_node *node,
		        struct dlock_list_head *head)
 3. void dlock_list_del(struct dlock_list *node)

Iteration of all the list entries within a group of per-cpu
lists is done by calling either the dlock_list_iterate() or
dlock_list_iterate_safe() functions in a while loop. They correspond
to the list_for_each_entry() and list_for_each_entry_safe() macros
respectively. The iteration states are keep in a dlock_list_state
structure that is passed to the iteration functions.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/dlock-list.h |  233 ++++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile               |    2 +-
 lib/dlock-list.c           |  100 +++++++++++++++++++
 3 files changed, 334 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/dlock-list.h
 create mode 100644 lib/dlock-list.c

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
new file mode 100644
index 0000000..43355f8
--- /dev/null
+++ b/include/linux/dlock-list.h
@@ -0,0 +1,233 @@
+/*
+ * Distributed/locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+#ifndef __LINUX_DLOCK_LIST_H
+#define __LINUX_DLOCK_LIST_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+
+/*
+ * include/linux/dlock-list.h
+ *
+ * A distributed (per-cpu) set of lists each of which is protected by its
+ * own spinlock, but acts like a single consolidated list to the callers.
+ *
+ * The dlock_list_head structure contains the spinlock, the other
+ * dlock_list_node structures only contains a pointer to the spinlock in
+ * dlock_list_head.
+ */
+struct dlock_list_head {
+	struct list_head list;
+	spinlock_t lock;
+};
+
+#define DLOCK_LIST_HEAD_INIT(name)				\
+	{							\
+		.list.prev = &name.list,			\
+		.list.next = &name.list,			\
+		.list.lock = __SPIN_LOCK_UNLOCKED(name),	\
+	}
+
+/*
+ * Per-cpu list iteration state
+ */
+struct dlock_list_state {
+	int			 cpu;
+	spinlock_t		*lock;
+	struct list_head	*head;	/* List head of current per-cpu list */
+	struct dlock_list_node	*curr;
+	struct dlock_list_node	*next;
+};
+
+#define DLOCK_LIST_STATE_INIT()			\
+	{					\
+		.cpu  = -1,			\
+		.lock = NULL,			\
+		.head = NULL,			\
+		.curr = NULL,			\
+		.next = NULL,			\
+	}
+
+#define DEFINE_DLOCK_LIST_STATE(s)		\
+	struct dlock_list_state s = DLOCK_LIST_STATE_INIT()
+
+static inline void init_dlock_list_state(struct dlock_list_state *state)
+{
+	state->cpu  = -1;
+	state->lock = NULL;
+	state->head = NULL;
+	state->curr = NULL;
+	state->next = NULL;
+}
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DLOCK_LIST_WARN_ON(x)	WARN_ON(x)
+#else
+#define DLOCK_LIST_WARN_ON(x)
+#endif
+
+/*
+ * Next per-cpu list entry
+ */
+#define dlock_list_next_entry(pos, member) list_next_entry(pos, member.list)
+
+/*
+ * Per-cpu node data structure
+ */
+struct dlock_list_node {
+	struct list_head list;
+	spinlock_t *lockptr;
+};
+
+#define DLOCK_LIST_NODE_INIT(name)		\
+	{					\
+		.list.prev = &name.list,	\
+		.list.next = &name.list,	\
+		.list.lockptr = NULL		\
+	}
+
+static inline void init_dlock_list_node(struct dlock_list_node *node)
+{
+	INIT_LIST_HEAD(&node->list);
+	node->lockptr = NULL;
+}
+
+static inline void free_dlock_list_head(struct dlock_list_head **pdlock_head)
+{
+	free_percpu(*pdlock_head);
+	*pdlock_head = NULL;
+}
+
+/*
+ * Check if all the per-cpu lists are empty
+ */
+static inline bool dlock_list_empty(struct dlock_list_head *dlock_head)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (!list_empty(&per_cpu_ptr(dlock_head, cpu)->list))
+			return false;
+	return true;
+}
+
+/*
+ * Helper function to find the first entry of the next per-cpu list
+ * It works somewhat like for_each_possible_cpu(cpu).
+ *
+ * Return: true if the entry is found, false if all the lists exhausted
+ */
+static __always_inline bool
+__dlock_list_next_cpu(struct dlock_list_head *head,
+		      struct dlock_list_state *state)
+{
+	if (state->lock)
+		spin_unlock(state->lock);
+next_cpu:
+	/*
+	 * for_each_possible_cpu(cpu)
+	 */
+	state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
+	if (state->cpu >= nr_cpu_ids)
+		return false;	/* All the per-cpu lists iterated */
+
+	state->head = &per_cpu_ptr(head, state->cpu)->list;
+	if (list_empty(state->head))
+		goto next_cpu;
+
+	state->lock = &per_cpu_ptr(head, state->cpu)->lock;
+	spin_lock(state->lock);
+	/*
+	 * There is a slight chance that the list may become empty just
+	 * before the lock is acquired. So an additional check is
+	 * needed to make sure that state->curr points to a valid entry.
+	 */
+	if (list_empty(state->head)) {
+		spin_unlock(state->lock);
+		goto next_cpu;
+	}
+	state->curr = list_entry(state->head->next,
+				 struct dlock_list_node, list);
+	return true;
+}
+
+/*
+ * Iterate to the next entry of the group of per-cpu lists
+ *
+ * Return: true if the next entry is found, false if all the entries iterated
+ */
+static inline bool dlock_list_iterate(struct dlock_list_head *head,
+				      struct dlock_list_state *state)
+{
+	/*
+	 * Find next entry
+	 */
+	if (state->curr)
+		state->curr = list_next_entry(state->curr, list);
+
+	if (!state->curr || (&state->curr->list == state->head)) {
+		/*
+		 * The current per-cpu list has been exhausted, try the next
+		 * per-cpu list.
+		 */
+		if (!__dlock_list_next_cpu(head, state))
+			return false;
+	}
+
+	DLOCK_LIST_WARN_ON(state->curr->lockptr != state->lock);
+	return true;	/* Continue the iteration */
+}
+
+/*
+ * Iterate to the next entry of the group of per-cpu lists and safe
+ * against removal of list_entry
+ *
+ * Return: true if the next entry is found, false if all the entries iterated
+ */
+static inline bool dlock_list_iterate_safe(struct dlock_list_head *head,
+					   struct dlock_list_state *state)
+{
+	/*
+	 * Find next entry
+	 */
+	if (state->curr) {
+		state->curr = state->next;
+		state->next = list_next_entry(state->next, list);
+	}
+
+	if (!state->curr || (&state->curr->list == state->head)) {
+		/*
+		 * The current per-cpu list has been exhausted, try the next
+		 * per-cpu list.
+		 */
+		if (!__dlock_list_next_cpu(head, state))
+			return false;
+		state->next = list_next_entry(state->curr, list);
+	}
+
+	DLOCK_LIST_WARN_ON(state->curr->lockptr != state->lock);
+	return true;	/* Continue the iteration */
+}
+
+extern void dlock_list_add(struct dlock_list_node *node,
+			  struct dlock_list_head *head);
+extern void dlock_list_del(struct dlock_list_node *node);
+extern int  init_dlock_list_head(struct dlock_list_head **pdlock_head);
+
+#endif /* __LINUX_DLOCK_LIST_H */
diff --git a/lib/Makefile b/lib/Makefile
index 499fb35..92e8c38 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-	 once.o
+	 once.o dlock-list.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
new file mode 100644
index 0000000..84d4623
--- /dev/null
+++ b/lib/dlock-list.c
@@ -0,0 +1,100 @@
+/*
+ * Distributed/locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+#include <linux/dlock-list.h>
+#include <linux/lockdep.h>
+
+/*
+ * The dlock list lock needs its own class to avoid warning and stack
+ * trace when lockdep is enabled.
+ */
+static struct lock_class_key dlock_list_key;
+
+/*
+ * Initialize the per-cpu list head
+ */
+int init_dlock_list_head(struct dlock_list_head **pdlock_head)
+{
+	struct dlock_list_head *dlock_head;
+	int cpu;
+
+	dlock_head = alloc_percpu(struct dlock_list_head);
+	if (!dlock_head)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct dlock_list_head *head = per_cpu_ptr(dlock_head, cpu);
+
+		INIT_LIST_HEAD(&head->list);
+		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+		lockdep_set_class(&head->lock, &dlock_list_key);
+	}
+
+	*pdlock_head = dlock_head;
+	return 0;
+}
+
+/*
+ * List selection is based on the CPU being used when the dlock_list_add()
+ * function is called. However, deletion may be done by a different CPU.
+ * So we still need to use a lock to protect the content of the list.
+ */
+void dlock_list_add(struct dlock_list_node *node, struct dlock_list_head *head)
+{
+	struct dlock_list_head *myhead;
+
+	/*
+	 * Disable preemption to make sure that CPU won't gets changed.
+	 */
+	myhead = get_cpu_ptr(head);
+	spin_lock(&myhead->lock);
+	node->lockptr = &myhead->lock;
+	list_add(&node->list, &myhead->list);
+	spin_unlock(&myhead->lock);
+	put_cpu_ptr(head);
+}
+
+/*
+ * Delete a node from a dlock list
+ *
+ * We need to check the lock pointer again after taking the lock to guard
+ * against concurrent delete of the same node. If the lock pointer changes
+ * (becomes NULL or to a different one), we assume that the deletion was done
+ * elsewhere.
+ */
+void dlock_list_del(struct dlock_list_node *node)
+{
+	spinlock_t *lock = READ_ONCE(node->lockptr);
+
+	if (unlikely(!lock)) {
+		WARN(1, "dlock_list_del: node 0x%lx has no associated lock\n",
+			(unsigned long)node);
+		return;
+	}
+
+	spin_lock(lock);
+	if (likely(lock == node->lockptr)) {
+		list_del_init(&node->list);
+		node->lockptr = NULL;
+	} else {
+		/*
+		 * This path should never be executed.
+		 */
+		WARN_ON(1);
+	}
+	spin_unlock(lock);
+}
-- 
1.7.1

  reply	other threads:[~2016-07-11 17:32 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-07-11 17:32 [PATCH v2 0/7] vfs: Use dlock list for SB's s_inodes list Waiman Long
2016-07-11 17:32 ` Waiman Long [this message]
2016-07-13 16:08   ` [PATCH v2 1/7] lib/dlock-list: Distributed and lock-protected lists Tejun Heo
2016-07-14  2:54     ` Waiman Long
2016-07-14 11:50       ` Tejun Heo
2016-07-14 14:35         ` Jan Kara
2016-07-14 14:51           ` Tejun Heo
2016-07-14 16:16             ` Waiman Long
2016-07-14 17:13         ` Waiman Long
2016-07-14 17:41           ` Tejun Heo
2016-07-11 17:32 ` [PATCH v2 2/7] lib/dlock-list: Add __percpu modifier for parameters Waiman Long
2016-07-13 16:08   ` Tejun Heo
2016-07-14  2:54     ` Waiman Long
2016-07-11 17:32 ` [PATCH v2 3/7] fsnotify: Simplify inode iteration on umount Waiman Long
2016-07-11 17:32 ` [PATCH v2 4/7] vfs: Remove unnecessary list_for_each_entry_safe() variants Waiman Long
2016-07-11 17:32 ` [PATCH v2 5/7] vfs: Use dlock list for superblock's inode list Waiman Long
2016-07-11 17:32 ` [RFC PATCH v2 6/7] lib/persubnode: Introducing a simple per-subnode APIs Waiman Long
2016-07-12  3:14   ` Boqun Feng
2016-07-12 18:37     ` Waiman Long
2016-07-12 14:27   ` Tejun Heo
2016-07-12 18:51     ` Waiman Long
2016-07-12 18:57       ` Tejun Heo
2016-07-12 19:42         ` Waiman Long
2016-07-11 17:32 ` [RFC PATCH v2 7/7] lib/dlock-list: Use the per-subnode APIs for managing lists Waiman Long

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1468258332-61537-2-git-send-email-Waiman.Long@hpe.com \
    --to=waiman.long@hpe.com \
    --cc=andi@firstfloor.org \
    --cc=bfields@fieldses.org \
    --cc=boqun.feng@gmail.com \
    --cc=cl@linux-foundation.org \
    --cc=dchinner@redhat.com \
    --cc=doug.hatch@hpe.com \
    --cc=jack@suse.com \
    --cc=jlayton@poochiereds.net \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=scott.norton@hpe.com \
    --cc=tj@kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).