* [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head
2014-05-02 19:02 ` [PATCHv2 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
@ 2014-05-02 19:02 ` Dan Streetman
2014-05-02 19:02 ` [PATCH 2/4] plist: add helper functions Dan Streetman
` (3 subsequent siblings)
4 siblings, 0 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-02 19:02 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, linux-mm, linux-kernel,
Shaohua Li
Replace the singly-linked list tracking active, i.e. swapon'ed,
swap_info_struct entries with a doubly-linked list using struct list_heads.
Simplify the logic iterating and manipulating the list of entries,
especially get_swap_page(), by using standard list_head functions,
and removing the highest priority iteration logic.
The change fixes the bug:
https://lkml.org/lkml/2014/2/13/181
in which different priority swap entries after the highest priority entry
are incorrectly used equally in pairs. The swap behavior is now as
advertised, i.e. different priority swap entries are used in order, and
equal priority swap targets are used concurrently.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
---
Changelog since v1 https://lkml.org/lkml/2014/4/12/72
-update/add comments only, as suggested by Mel.
include/linux/swap.h | 7 +-
include/linux/swapfile.h | 2 +-
mm/frontswap.c | 13 ++--
mm/swapfile.c | 171 ++++++++++++++++++++---------------------------
4 files changed, 78 insertions(+), 115 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5a14b92..8bb85d6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,8 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
+ struct list_head list; /* entry in swap list */
signed char type; /* strange name for an index */
- signed char next; /* next type on the swap list */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +255,6 @@ struct swap_info_struct {
struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
};
-struct swap_list_t {
- int head; /* head of priority-ordered swapfile list */
- int next; /* swapfile to be used next */
-};
-
/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624..2eab382 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
* want to expose them to the dozens of source files that include swap.h
*/
extern spinlock_t swap_lock;
-extern struct swap_list_t swap_list;
+extern struct list_head swap_list_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdc..fae1160 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
- int type;
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ list_for_each_entry(si, &swap_list_head, list)
totalpages += atomic_read(&si->frontswap_pages);
- }
return totalpages;
}
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
- int type;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ list_for_each_entry(si, &swap_list_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
- *swapid = type;
+ *swapid = si->type;
ret = 0;
break;
}
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list.head each time
+ * so restart scan from swap_list_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4a7f7e6..6c95a8c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+LIST_HEAD(swap_list_head);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -640,66 +643,54 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- int hp_index;
+ struct list_head *tmp;
spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- hp_index = atomic_xchg(&highest_priority_index, -1);
- /*
- * highest_priority_index records current highest priority swap
- * type which just frees swap entries. If its priority is
- * higher than that of swap_list.next swap type, we use it. It
- * isn't protected by swap_lock, so it can be an invalid value
- * if the corresponding swap type is swapoff. We double check
- * the flags here. It's even possible the swap type is swapoff
- * and swapon again and its priority is changed. In such rare
- * case, low prority swap type might be used, but eventually
- * high priority swap will be used after several rounds of
- * swap.
- */
- if (hp_index != -1 && hp_index != type &&
- swap_info[type]->prio < swap_info[hp_index]->prio &&
- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
- type = hp_index;
- swap_list.next = type;
- }
-
- si = swap_info[type];
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next]->prio)) {
- next = swap_list.head;
- wrapped++;
- }
-
+ list_for_each(tmp, &swap_list_head) {
+ si = list_entry(tmp, typeof(*si), list);
spin_lock(&si->lock);
- if (!si->highest_bit) {
- spin_unlock(&si->lock);
- continue;
- }
- if (!(si->flags & SWP_WRITEOK)) {
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_unlock(&si->lock);
continue;
}
- swap_list.next = next;
+ /*
+ * rotate the current swap_info that we're going to use
+ * to after any other swap_info that have the same prio,
+ * so that all equal-priority swap_info get used equally
+ */
+ next = si;
+ list_for_each_entry_continue(next, &swap_list_head, list) {
+ if (si->prio != next->prio)
+ break;
+ list_rotate_left(&si->list);
+ next = si;
+ }
spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
- return swp_entry(type, offset);
+ return swp_entry(si->type, offset);
spin_lock(&swap_lock);
- next = swap_list.next;
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one. So we need to
+ * try again. Since we dropped the swap_lock, there may now
+ * be non-full higher priority swap_infos, and this si may have
+ * even been removed from the list (although very unlikely).
+ * Let's start over.
+ */
+ tmp = &swap_list_head;
}
atomic_long_inc(&nr_swap_pages);
@@ -766,27 +757,6 @@ out:
return NULL;
}
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
- int old_hp_index, new_hp_index;
-
- do {
- old_hp_index = atomic_read(&highest_priority_index);
- if (old_hp_index != -1 &&
- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
- break;
- new_hp_index = type;
- } while (atomic_cmpxchg(&highest_priority_index,
- old_hp_index, new_hp_index) != old_hp_index);
-}
-
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
- set_highest_priority_index(p->type);
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- int i, prev;
+ struct swap_info_struct *si;
if (prio >= 0)
p->prio = prio;
@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
+ assert_spin_locked(&swap_lock);
+ BUG_ON(!list_empty(&p->list));
+ /*
+ * insert into swap list; the list is in priority order,
+ * so that get_swap_page() can get a page from the highest
+ * priority swap_info_struct with available page(s), and
+ * swapoff can adjust the auto-assigned (i.e. negative) prio
+ * values for any lower-priority swap_info_structs when
+ * removing a negative-prio swap_info_struct
+ */
+ list_for_each_entry(si, &swap_list_head, list) {
+ if (p->prio >= si->prio) {
+ list_add_tail(&p->list, &si->list);
+ return;
+ }
}
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p->type;
- else
- swap_info[prev]->next = p->type;
+ /*
+ * this covers two cases:
+ * 1) p->prio is less than all existing prio
+ * 2) the swap list is empty
+ */
+ list_add_tail(&p->list, &swap_list_head);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int i, type, prev;
- int err;
+ int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
- p = swap_info[type];
+ list_for_each_entry(p, &swap_list_head, list) {
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping)
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0)
- swap_list.head = p->next;
- else
- swap_info[prev]->next = p->next;
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
spin_lock(&p->lock);
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i]->next)
- swap_info[i]->prio = p->prio--;
+ struct swap_info_struct *si = p;
+
+ list_for_each_entry_continue(si, &swap_list_head, list) {
+ si->prio++;
+ }
least_priority++;
}
+ list_del_init(&p->list);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(type);
+ frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account information */
- swap_cgroup_swapoff(type);
+ swap_cgroup_swapoff(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
+ INIT_LIST_HEAD(&p->list);
p->flags = SWP_USED;
- p->next = -1;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* [PATCH 2/4] plist: add helper functions
2014-05-02 19:02 ` [PATCHv2 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
2014-05-02 19:02 ` [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head Dan Streetman
@ 2014-05-02 19:02 ` Dan Streetman
2014-05-12 10:35 ` Mel Gorman
2014-05-02 19:02 ` [PATCH 3/4] plist: add plist_rotate Dan Streetman
` (2 subsequent siblings)
4 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-02 19:02 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Rik van Riel,
Weijie Yang, Johannes Weiner, linux-mm, linux-kernel,
Paul Gortmaker, Steven Rostedt, Thomas Gleixner
Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
define and initialize a struct plist_head.
Add plist_for_each_continue() and plist_for_each_entry_continue(),
equivalent to list_for_each_continue() and list_for_each_entry_continue(),
to iterate over a plist continuing after the current position.
Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
and ->next, implemented by list_prev_entry() and list_next_entry(), to
access the prev/next struct plist_node entry. These are needed because
unlike struct list_head, direct access of the prev/next struct plist_node
isn't possible; the list must be navigated via the contained struct list_head.
e.g. instead of accessing the prev by list_prev_entry(node, node_list)
it can be accessed by plist_prev(node).
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
This is new to this patch set, and these helper functions are used
by the following 2 patches. They aren't critical, as their functionality
can be achieved using regular list functions, but they help simplify
code that uses them.
include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb39..c815491 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
}
/**
+ * PLIST_HEAD - declare and init plist_head
+ * @head: name for struct plist_head variable
+ */
+#define PLIST_HEAD(head) \
+ struct plist_head head = PLIST_HEAD_INIT(head)
+
+/**
* PLIST_NODE_INIT - static struct plist_node initializer
* @node: struct plist_node variable name
* @__prio: initial node priority
@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, node_list)
/**
+ * plist_for_each_continue - continue iteration over the plist
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ *
+ * Continue to iterate over plist, continuing after the current position.
+ */
+#define plist_for_each_continue(pos, head) \
+ list_for_each_entry_continue(pos, &(head)->node_list, node_list)
+
+/**
* plist_for_each_safe - iterate safely over a plist of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, mem.node_list)
/**
+ * plist_for_each_entry_continue - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ * @m: the name of the list_struct within the struct
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define plist_for_each_entry_continue(pos, head, m) \
+ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
+
+/**
* plist_for_each_entry_safe - iterate safely over list of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -229,6 +258,20 @@ static inline int plist_node_empty(const struct plist_node *node)
#endif
/**
+ * plist_next - get the next entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_next(pos) \
+ list_next_entry(pos, node_list)
+
+/**
+ * plist_prev - get the prev entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_prev(pos) \
+ list_prev_entry(pos, node_list)
+
+/**
* plist_first - return the first node (and thus, highest priority)
* @head: the &struct plist_head pointer
*
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 2/4] plist: add helper functions
2014-05-02 19:02 ` [PATCH 2/4] plist: add helper functions Dan Streetman
@ 2014-05-12 10:35 ` Mel Gorman
0 siblings, 0 replies; 45+ messages in thread
From: Mel Gorman @ 2014-05-12 10:35 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Michal Hocko, Christian Ehrhardt,
Rik van Riel, Weijie Yang, Johannes Weiner, linux-mm,
linux-kernel, Paul Gortmaker, Steven Rostedt, Thomas Gleixner
On Fri, May 02, 2014 at 03:02:28PM -0400, Dan Streetman wrote:
> Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
> define and initialize a struct plist_head.
>
> Add plist_for_each_continue() and plist_for_each_entry_continue(),
> equivalent to list_for_each_continue() and list_for_each_entry_continue(),
> to iterate over a plist continuing after the current position.
>
> Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
> and ->next, implemented by list_prev_entry() and list_next_entry(), to
> access the prev/next struct plist_node entry. These are needed because
> unlike struct list_head, direct access of the prev/next struct plist_node
> isn't possible; the list must be navigated via the contained struct list_head.
> e.g. instead of accessing the prev by list_prev_entry(node, node_list)
> it can be accessed by plist_prev(node).
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 3/4] plist: add plist_rotate
2014-05-02 19:02 ` [PATCHv2 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
2014-05-02 19:02 ` [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head Dan Streetman
2014-05-02 19:02 ` [PATCH 2/4] plist: add helper functions Dan Streetman
@ 2014-05-02 19:02 ` Dan Streetman
2014-05-06 2:18 ` Steven Rostedt
2014-05-02 19:02 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
4 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-02 19:02 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Rik van Riel,
Weijie Yang, Johannes Weiner, linux-mm, linux-kernel,
Paul Gortmaker, Steven Rostedt, Thomas Gleixner
Add plist_rotate(), which moves the specified plist_node after
all other same-priority plist_nodes in the list.
This is needed by swap, which has a plist of swap_info_structs
and needs to use each same-priority swap_info_struct equally.
Also add plist_test_rotate() test function, for use by plist_test()
to test plist_rotate() function.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
This patch is new to this patch set, and it's required by the next patch,
which needs a way to move a plist entry to the end of all following
same-priority entries. This is possible to do manually instead of adding
a new plist function, but having a common plist function instead of code
specific only to swap seems preferable.
include/linux/plist.h | 2 ++
lib/plist.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 50 insertions(+)
diff --git a/include/linux/plist.h b/include/linux/plist.h
index c815491..06e6925 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);
+extern void plist_rotate(struct plist_node *node, struct plist_head *head);
+
/**
* plist_for_each - iterate over the plist
* @pos: the type * to use as a loop counter
diff --git a/lib/plist.c b/lib/plist.c
index 1ebc95f..a8b54e5 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -134,6 +134,42 @@ void plist_del(struct plist_node *node, struct plist_head *head)
plist_check_head(head);
}
+/**
+ * plist_rotate - Rotate @node to end of same-prio entries.
+ *
+ * @node: &struct plist_node pointer - entry to be moved
+ * @head: &struct plist_head pointer - list head
+ */
+void plist_rotate(struct plist_node *node, struct plist_head *head)
+{
+ struct plist_node *next;
+ struct list_head *next_node_list = &head->node_list;
+
+ plist_check_head(head);
+ BUG_ON(plist_head_empty(head));
+ BUG_ON(plist_node_empty(node));
+
+ if (node == plist_last(head))
+ return;
+
+ next = plist_next(node);
+
+ if (node->prio != next->prio)
+ return;
+
+ plist_del(node, head);
+
+ plist_for_each_continue(next, head) {
+ if (node->prio != next->prio) {
+ next_node_list = &next->node_list;
+ break;
+ }
+ }
+ list_add_tail(&node->node_list, next_node_list);
+
+ plist_check_head(head);
+}
+
#ifdef CONFIG_DEBUG_PI_LIST
#include <linux/sched.h>
#include <linux/module.h>
@@ -170,6 +206,14 @@ static void __init plist_test_check(int nr_expect)
BUG_ON(prio_pos->prio_list.next != &first->prio_list);
}
+static void __init plist_test_rotate(struct plist_node *node)
+{
+ plist_rotate(node, &test_head);
+
+ if (node != plist_last(&test_head))
+ BUG_ON(node->prio == plist_next(node)->prio);
+}
+
static int __init plist_test(void)
{
int nr_expect = 0, i, loop;
@@ -193,6 +237,10 @@ static int __init plist_test(void)
nr_expect--;
}
plist_test_check(nr_expect);
+ if (!plist_node_empty(test_node + i)) {
+ plist_test_rotate(test_node + i);
+ plist_test_check(nr_expect);
+ }
}
for (i = 0; i < ARRAY_SIZE(test_node); i++) {
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 3/4] plist: add plist_rotate
2014-05-02 19:02 ` [PATCH 3/4] plist: add plist_rotate Dan Streetman
@ 2014-05-06 2:18 ` Steven Rostedt
2014-05-06 20:12 ` Dan Streetman
0 siblings, 1 reply; 45+ messages in thread
From: Steven Rostedt @ 2014-05-06 2:18 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
linux-mm, linux-kernel, Paul Gortmaker, Thomas Gleixner
On Fri, 2 May 2014 15:02:29 -0400
Dan Streetman <ddstreet@ieee.org> wrote:
> Add plist_rotate(), which moves the specified plist_node after
> all other same-priority plist_nodes in the list.
This is a little confusing? You mean it takes a plist_node from a plist
and simply moves it to the end of the list of all other nodes of the
same priority? Kind of like what a sched_yield() would do with a
SCHED_FIFO task? I wonder if we should call this "plist_yield()" then?
>
> This is needed by swap, which has a plist of swap_info_structs
> and needs to use each same-priority swap_info_struct equally.
"needs to use each same-priority swap_info_struct equally"
-ENOCOMPUTE
>
> Also add plist_test_rotate() test function, for use by plist_test()
> to test plist_rotate() function.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
>
> ---
>
> This patch is new to this patch set, and it's required by the next patch,
> which needs a way to move a plist entry to the end of all following
> same-priority entries. This is possible to do manually instead of adding
> a new plist function, but having a common plist function instead of code
> specific only to swap seems preferable.
>
> include/linux/plist.h | 2 ++
> lib/plist.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 50 insertions(+)
>
> diff --git a/include/linux/plist.h b/include/linux/plist.h
> index c815491..06e6925 100644
> --- a/include/linux/plist.h
> +++ b/include/linux/plist.h
> @@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
> extern void plist_add(struct plist_node *node, struct plist_head *head);
> extern void plist_del(struct plist_node *node, struct plist_head *head);
>
> +extern void plist_rotate(struct plist_node *node, struct plist_head *head);
> +
> /**
> * plist_for_each - iterate over the plist
> * @pos: the type * to use as a loop counter
> diff --git a/lib/plist.c b/lib/plist.c
> index 1ebc95f..a8b54e5 100644
> --- a/lib/plist.c
> +++ b/lib/plist.c
> @@ -134,6 +134,42 @@ void plist_del(struct plist_node *node, struct plist_head *head)
> plist_check_head(head);
> }
>
> +/**
> + * plist_rotate - Rotate @node to end of same-prio entries.
> + *
> + * @node: &struct plist_node pointer - entry to be moved
> + * @head: &struct plist_head pointer - list head
> + */
> +void plist_rotate(struct plist_node *node, struct plist_head *head)
> +{
> + struct plist_node *next;
> + struct list_head *next_node_list = &head->node_list;
Naming convention should be the same as plist_add() and call this
node_next instead of next_node_list.
-- Steve
> +
> + plist_check_head(head);
> + BUG_ON(plist_head_empty(head));
> + BUG_ON(plist_node_empty(node));
> +
> + if (node == plist_last(head))
> + return;
> +
> + next = plist_next(node);
> +
> + if (node->prio != next->prio)
> + return;
> +
> + plist_del(node, head);
> +
> + plist_for_each_continue(next, head) {
> + if (node->prio != next->prio) {
> + next_node_list = &next->node_list;
> + break;
> + }
> + }
> + list_add_tail(&node->node_list, next_node_list);
> +
> + plist_check_head(head);
> +}
> +
> #ifdef CONFIG_DEBUG_PI_LIST
> #include <linux/sched.h>
> #include <linux/module.h>
> @@ -170,6 +206,14 @@ static void __init plist_test_check(int nr_expect)
> BUG_ON(prio_pos->prio_list.next != &first->prio_list);
> }
>
> +static void __init plist_test_rotate(struct plist_node *node)
> +{
> + plist_rotate(node, &test_head);
> +
> + if (node != plist_last(&test_head))
> + BUG_ON(node->prio == plist_next(node)->prio);
> +}
> +
> static int __init plist_test(void)
> {
> int nr_expect = 0, i, loop;
> @@ -193,6 +237,10 @@ static int __init plist_test(void)
> nr_expect--;
> }
> plist_test_check(nr_expect);
> + if (!plist_node_empty(test_node + i)) {
> + plist_test_rotate(test_node + i);
> + plist_test_check(nr_expect);
> + }
> }
>
> for (i = 0; i < ARRAY_SIZE(test_node); i++) {
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 3/4] plist: add plist_rotate
2014-05-06 2:18 ` Steven Rostedt
@ 2014-05-06 20:12 ` Dan Streetman
2014-05-06 20:39 ` Steven Rostedt
0 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-06 20:12 UTC (permalink / raw)
To: Steven Rostedt
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
Linux-MM, linux-kernel, Paul Gortmaker, Thomas Gleixner
On Mon, May 5, 2014 at 10:18 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Fri, 2 May 2014 15:02:29 -0400
> Dan Streetman <ddstreet@ieee.org> wrote:
>
>> Add plist_rotate(), which moves the specified plist_node after
>> all other same-priority plist_nodes in the list.
>
> This is a little confusing? You mean it takes a plist_node from a plist
> and simply moves it to the end of the list of all other nodes of the
> same priority?
yes, exactly
> Kind of like what a sched_yield() would do with a
> SCHED_FIFO task? I wonder if we should call this "plist_yield()" then?
I suppose it is similar, yes...I'll rename it in a v2 patch.
>
>>
>> This is needed by swap, which has a plist of swap_info_structs
>> and needs to use each same-priority swap_info_struct equally.
>
> "needs to use each same-priority swap_info_struct equally"
>
> -ENOCOMPUTE
heh, yeah that needs a bit more explaining doesn't it :-)
by "equally", I mean as swap writes pages out to its swap devices, it
must write to any same-priority devices on a round-robin basis.
I'll update the comment in the v2 patch to try to explain clearer.
>
>>
>> Also add plist_test_rotate() test function, for use by plist_test()
>> to test plist_rotate() function.
>>
>> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
>> Cc: Steven Rostedt <rostedt@goodmis.org>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>
>> ---
>>
>> This patch is new to this patch set, and it's required by the next patch,
>> which needs a way to move a plist entry to the end of all following
>> same-priority entries. This is possible to do manually instead of adding
>> a new plist function, but having a common plist function instead of code
>> specific only to swap seems preferable.
>>
>> include/linux/plist.h | 2 ++
>> lib/plist.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 50 insertions(+)
>>
>> diff --git a/include/linux/plist.h b/include/linux/plist.h
>> index c815491..06e6925 100644
>> --- a/include/linux/plist.h
>> +++ b/include/linux/plist.h
>> @@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
>> extern void plist_add(struct plist_node *node, struct plist_head *head);
>> extern void plist_del(struct plist_node *node, struct plist_head *head);
>>
>> +extern void plist_rotate(struct plist_node *node, struct plist_head *head);
>> +
>> /**
>> * plist_for_each - iterate over the plist
>> * @pos: the type * to use as a loop counter
>> diff --git a/lib/plist.c b/lib/plist.c
>> index 1ebc95f..a8b54e5 100644
>> --- a/lib/plist.c
>> +++ b/lib/plist.c
>> @@ -134,6 +134,42 @@ void plist_del(struct plist_node *node, struct plist_head *head)
>> plist_check_head(head);
>> }
>>
>> +/**
>> + * plist_rotate - Rotate @node to end of same-prio entries.
>> + *
>> + * @node: &struct plist_node pointer - entry to be moved
>> + * @head: &struct plist_head pointer - list head
>> + */
>> +void plist_rotate(struct plist_node *node, struct plist_head *head)
>> +{
>> + struct plist_node *next;
>> + struct list_head *next_node_list = &head->node_list;
>
> Naming convention should be the same as plist_add() and call this
> node_next instead of next_node_list.
ok will do.
>
> -- Steve
>
>> +
>> + plist_check_head(head);
>> + BUG_ON(plist_head_empty(head));
>> + BUG_ON(plist_node_empty(node));
>> +
>> + if (node == plist_last(head))
>> + return;
>> +
>> + next = plist_next(node);
>> +
>> + if (node->prio != next->prio)
>> + return;
>> +
>> + plist_del(node, head);
>> +
>> + plist_for_each_continue(next, head) {
>> + if (node->prio != next->prio) {
>> + next_node_list = &next->node_list;
>> + break;
>> + }
>> + }
>> + list_add_tail(&node->node_list, next_node_list);
>> +
>> + plist_check_head(head);
>> +}
>> +
>> #ifdef CONFIG_DEBUG_PI_LIST
>> #include <linux/sched.h>
>> #include <linux/module.h>
>> @@ -170,6 +206,14 @@ static void __init plist_test_check(int nr_expect)
>> BUG_ON(prio_pos->prio_list.next != &first->prio_list);
>> }
>>
>> +static void __init plist_test_rotate(struct plist_node *node)
>> +{
>> + plist_rotate(node, &test_head);
>> +
>> + if (node != plist_last(&test_head))
>> + BUG_ON(node->prio == plist_next(node)->prio);
>> +}
>> +
>> static int __init plist_test(void)
>> {
>> int nr_expect = 0, i, loop;
>> @@ -193,6 +237,10 @@ static int __init plist_test(void)
>> nr_expect--;
>> }
>> plist_test_check(nr_expect);
>> + if (!plist_node_empty(test_node + i)) {
>> + plist_test_rotate(test_node + i);
>> + plist_test_check(nr_expect);
>> + }
>> }
>>
>> for (i = 0; i < ARRAY_SIZE(test_node); i++) {
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 3/4] plist: add plist_rotate
2014-05-06 20:12 ` Dan Streetman
@ 2014-05-06 20:39 ` Steven Rostedt
2014-05-06 21:47 ` Dan Streetman
0 siblings, 1 reply; 45+ messages in thread
From: Steven Rostedt @ 2014-05-06 20:39 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
Linux-MM, linux-kernel, Paul Gortmaker, Thomas Gleixner,
Peter Zijlstra
On Tue, 6 May 2014 16:12:54 -0400
Dan Streetman <ddstreet@ieee.org> wrote:
> On Mon, May 5, 2014 at 10:18 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> > On Fri, 2 May 2014 15:02:29 -0400
> > Dan Streetman <ddstreet@ieee.org> wrote:
> >
> >> Add plist_rotate(), which moves the specified plist_node after
> >> all other same-priority plist_nodes in the list.
> >
> > This is a little confusing? You mean it takes a plist_node from a plist
> > and simply moves it to the end of the list of all other nodes of the
> > same priority?
>
> yes, exactly
>
> > Kind of like what a sched_yield() would do with a
> > SCHED_FIFO task? I wonder if we should call this "plist_yield()" then?
>
> I suppose it is similar, yes...I'll rename it in a v2 patch.
I'm open to other suggestions as well. What else can give you the idea
that it's putting a node at the end of its priority?
I added Peter to the Cc list because I know how much he loves
sched_yield() :-)
>
> >
> >>
> >> This is needed by swap, which has a plist of swap_info_structs
> >> and needs to use each same-priority swap_info_struct equally.
> >
> > "needs to use each same-priority swap_info_struct equally"
> >
> > -ENOCOMPUTE
>
> heh, yeah that needs a bit more explaining doesn't it :-)
>
> by "equally", I mean as swap writes pages out to its swap devices, it
> must write to any same-priority devices on a round-robin basis.
OK, I think you are suffering from "being too involved to explain
clearly" syndrome. :)
I still don't see the connection between swap pages and plist, and even
more so, why something would already be in a plist and then needs to be
pushed to the end of its priority.
>
> I'll update the comment in the v2 patch to try to explain clearer.
>
Please do. But explain it to someone that has no idea how plists are
used by the swap subsystem, and why you need to move a node to the end
of its priority.
Thanks,
-- Steve
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 3/4] plist: add plist_rotate
2014-05-06 20:39 ` Steven Rostedt
@ 2014-05-06 21:47 ` Dan Streetman
2014-05-06 22:43 ` Steven Rostedt
0 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-06 21:47 UTC (permalink / raw)
To: Steven Rostedt
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
Linux-MM, linux-kernel, Paul Gortmaker, Thomas Gleixner,
Peter Zijlstra
On Tue, May 6, 2014 at 4:39 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Tue, 6 May 2014 16:12:54 -0400
> Dan Streetman <ddstreet@ieee.org> wrote:
>
>> On Mon, May 5, 2014 at 10:18 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> > On Fri, 2 May 2014 15:02:29 -0400
>> > Dan Streetman <ddstreet@ieee.org> wrote:
>> >
>> >> Add plist_rotate(), which moves the specified plist_node after
>> >> all other same-priority plist_nodes in the list.
>> >
>> > This is a little confusing? You mean it takes a plist_node from a plist
>> > and simply moves it to the end of the list of all other nodes of the
>> > same priority?
>>
>> yes, exactly
>>
>> > Kind of like what a sched_yield() would do with a
>> > SCHED_FIFO task? I wonder if we should call this "plist_yield()" then?
>>
>> I suppose it is similar, yes...I'll rename it in a v2 patch.
>
> I'm open to other suggestions as well. What else can give you the idea
> that it's putting a node at the end of its priority?
well the specific reason in swap's case is the need to use
same-priority entries in a round-robin basis, but I don't know if
plist_round_robin() is very clear.
Maybe plist_demote()? Although demote might imply actually changing priority.
plist_shuffle()? That might imply random shuffling though.
plist_readd() or plist_requeue()? That might make sense since
technically the function could be replicated by just plist_del() and
plist_add(), based on the implementation detail that plist_add()
inserts after all other same-priority entries, instead of before.
Or add priority into the name explicitly, like plist_priority_yield(),
or plist_priority_rotate(), plist_priority_requeue()?
>
> I added Peter to the Cc list because I know how much he loves
> sched_yield() :-)
>
>>
>> >
>> >>
>> >> This is needed by swap, which has a plist of swap_info_structs
>> >> and needs to use each same-priority swap_info_struct equally.
>> >
>> > "needs to use each same-priority swap_info_struct equally"
>> >
>> > -ENOCOMPUTE
>>
>> heh, yeah that needs a bit more explaining doesn't it :-)
>>
>> by "equally", I mean as swap writes pages out to its swap devices, it
>> must write to any same-priority devices on a round-robin basis.
>
> OK, I think you are suffering from "being too involved to explain
> clearly" syndrome. :)
>
> I still don't see the connection between swap pages and plist, and even
> more so, why something would already be in a plist and then needs to be
> pushed to the end of its priority.
>
>>
>> I'll update the comment in the v2 patch to try to explain clearer.
>>
>
> Please do. But explain it to someone that has no idea how plists are
> used by the swap subsystem, and why you need to move a node to the end
> of its priority.
Ok here's try 3, before I update the patch :) Does this make sense?
This is needed by the next patch in this series, which changes swap
from using regular lists to track its available swap devices
(partitions or files) to using plists. Each swap device has a
priority, and swap allocates pages from devices in priority order,
filling up the highest priority device first (and then removing it
from the available list), by allocating a page from the swap device
that is first in the priority-ordered list. With regular lists, swap
was managing the ordering by priority, while with plists the ordering
is automatically handled. However, swap requires special handling of
swap devices with the same priority; pages must be allocated from them
in round-robin order. To accomplish this with a plist, this new
function is used; when a page is allocated from the first swap device
in the plist, that entry is moved to the end of any same-priority
entries. Then the next time a page needs to be allocated, the next
swap device will be used, and so on.
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 3/4] plist: add plist_rotate
2014-05-06 21:47 ` Dan Streetman
@ 2014-05-06 22:43 ` Steven Rostedt
0 siblings, 0 replies; 45+ messages in thread
From: Steven Rostedt @ 2014-05-06 22:43 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
Linux-MM, linux-kernel, Paul Gortmaker, Thomas Gleixner,
Peter Zijlstra
On Tue, 6 May 2014 17:47:16 -0400
Dan Streetman <ddstreet@ieee.org> wrote:
> well the specific reason in swap's case is the need to use
> same-priority entries in a round-robin basis, but I don't know if
> plist_round_robin() is very clear.
No, that's not very clear.
>
> Maybe plist_demote()? Although demote might imply actually changing priority.
Agreed.
>
> plist_shuffle()? That might imply random shuffling though.
Yep.
>
> plist_readd() or plist_requeue()? That might make sense since
> technically the function could be replicated by just plist_del() and
> plist_add(), based on the implementation detail that plist_add()
> inserts after all other same-priority entries, instead of before.
plist_requeue() sounds like the best so far.
>
> Or add priority into the name explicitly, like plist_priority_yield(),
> or plist_priority_rotate(), plist_priority_requeue()?
No, even plist_yield() assumes priority is the same, thus adding
priority to a plist that means "priority list" is rather redundant.
I think its up between plist_yield() and plist_requeue(), where I'm
leaning towards plist_requeue().
Unless others have any better ideas or objections, lets go with
plist_requeue(). I think that's rather self explanatory and it sounds
just like what you said. It's basically an optimized version of
plist_del() followed by a plist_add().
> Ok here's try 3, before I update the patch :) Does this make sense?
>
> This is needed by the next patch in this series, which changes swap
> from using regular lists to track its available swap devices
> (partitions or files) to using plists. Each swap device has a
> priority, and swap allocates pages from devices in priority order,
> filling up the highest priority device first (and then removing it
> from the available list), by allocating a page from the swap device
> that is first in the priority-ordered list. With regular lists, swap
> was managing the ordering by priority, while with plists the ordering
> is automatically handled. However, swap requires special handling of
> swap devices with the same priority; pages must be allocated from them
> in round-robin order. To accomplish this with a plist, this new
> function is used; when a page is allocated from the first swap device
> in the plist, that entry is moved to the end of any same-priority
> entries. Then the next time a page needs to be allocated, the next
> swap device will be used, and so on.
OK, I read the above a few times and I think I know where my confusion
is coming from. I was thinking that the pages were being added to the
plist. I believe you are saying that the swap devices themselves are
added to the plist, and when the device is empty (no more pages left)
it is removed from the plist. When dealing with memory and swap one
thinks of managing pages. But here we are managing the devices.
Please state clearly at the beginning of your explanation that the swap
devices are being stored in the plist and stay there as long as they
still have pages left to be allocated from. In order to treat swap
devices of the same priority in a round robin fashion, after a device
has pages allocated from it, it needs to be requeued at the end of its
priority, behind other swap devices of the same priority in order to
make sure the next allocation comes from a different device (of same
priority).
-- Steve
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-02 19:02 ` [PATCHv2 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
` (2 preceding siblings ...)
2014-05-02 19:02 ` [PATCH 3/4] plist: add plist_rotate Dan Streetman
@ 2014-05-02 19:02 ` Dan Streetman
2014-05-05 15:51 ` Dan Streetman
` (2 more replies)
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
4 siblings, 3 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-02 19:02 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Rik van Riel,
Weijie Yang, Johannes Weiner, linux-mm, linux-kernel, Shaohua Li
Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
which both were intended to point to the highest priority active swap
target that was not full. The first patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; it starts scanning at the highest
priority entry each time, even if the entry is full.
Replace the manually ordered swap_list_head with a plist, renamed to
swap_active_head for clarity. Add a new plist, swap_avail_head.
The original swap_active_head plist contains all active swap_info_structs,
as before, while the new swap_avail_head plist contains only
swap_info_structs that are active and available, i.e. not full.
Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list.
Mel Gorman suggested using plists since they internally handle ordering
the list entries based on priority, which is exactly what swap was doing
manually. All the ordering code is now removed, and swap_info_struct
entries are simply added to their corresponding plist and automatically
ordered correctly.
Using a new plist for available swap_info_structs simplifies and
optimizes get_swap_page(), which no longer has to iterate over full
swap_info_structs. Using a new spinlock for swap_avail_head plist
allows each swap_info_struct to add or remove themselves from the
plist when they become full or not-full; previously they could not
do so because the swap_info_struct->lock is held when they change
from full<->not-full, and the swap_lock protecting the main
swap_active_head must be ordered before any swap_info_struct->lock.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
---
Mel, I tried moving the ordering and rotating code into common list functions
and I also tried plists, and you were right, using plists is much simpler and
more maintainable. The only required update to plist is the plist_rotate()
function, which is even simpler to use in get_swap_page() than the
list_rotate_left() function.
After looking more closely at plists, I don't see how they would reduce
performance, so I don't think there is any concern there, although Shaohua if
you have time it might be nice to check this updated patch set's performance.
I will note that if CONFIG_DEBUG_PI_LIST is set, there's quite a lot of list
checking going on for each list modification including rotate; that config is
set if "RT Mutex debugging, deadlock detection" is set, so I assume in that
case overall system performance is expected to be less than optimal.
Also, I might have over-commented in this patch; if so I can remove/reduce
some of it. :)
Changelog since v1 https://lkml.org/lkml/2014/4/12/73
-use plists instead of regular lists
-update/add comments
include/linux/swap.h | 3 +-
include/linux/swapfile.h | 2 +-
mm/frontswap.c | 6 +-
mm/swapfile.c | 142 +++++++++++++++++++++++++++++------------------
4 files changed, 94 insertions(+), 59 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8bb85d6..9155bcd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
- struct list_head list; /* entry in swap list */
+ struct plist_node list; /* entry in swap_active_head */
+ struct plist_node avail_list; /* entry in swap_avail_head */
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 2eab382..388293a 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
* want to expose them to the dozens of source files that include swap.h
*/
extern spinlock_t swap_lock;
-extern struct list_head swap_list_head;
+extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fae1160..c30eec5 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void)
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- list_for_each_entry(si, &swap_list_head, list)
+ plist_for_each_entry(si, &swap_active_head, list)
totalpages += atomic_read(&si->frontswap_pages);
return totalpages;
}
@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
unsigned long pages = 0, pages_to_unuse = 0;
assert_spin_locked(&swap_lock);
- list_for_each_entry(si, &swap_list_head, list) {
+ plist_for_each_entry(si, &swap_active_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list_head each time
+ * so restart scan from swap_active_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c95a8c..ec230e3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry ";
* all active swap_info_structs
* protected with swap_lock, and ordered by priority.
*/
-LIST_HEAD(swap_list_head);
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -594,6 +609,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -645,57 +663,60 @@ swp_entry_t get_swap_page(void)
{
struct swap_info_struct *si, *next;
pgoff_t offset;
- struct list_head *tmp;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- list_for_each(tmp, &swap_list_head) {
- si = list_entry(tmp, typeof(*si), list);
+ spin_lock(&swap_avail_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* rotate si to tail of same-priority siblings */
+ plist_rotate(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&si->lock);
- continue;
+ goto nextsi;
}
- /*
- * rotate the current swap_info that we're going to use
- * to after any other swap_info that have the same prio,
- * so that all equal-priority swap_info get used equally
- */
- next = si;
- list_for_each_entry_continue(next, &swap_list_head, list) {
- if (si->prio != next->prio)
- break;
- list_rotate_left(&si->list);
- next = si;
- }
-
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
return swp_entry(si->type, offset);
- spin_lock(&swap_lock);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map() can drop the si->lock, multiple
* callers probably all tried to get a page from the same si
- * and it filled up before we could get one. So we need to
- * try again. Since we dropped the swap_lock, there may now
- * be non-full higher priority swap_infos, and this si may have
- * even been removed from the list (although very unlikely).
- * Let's start over.
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
*/
- tmp = &swap_list_head;
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -798,8 +819,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1734,12 +1765,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- struct swap_info_struct *si;
-
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
@@ -1747,27 +1782,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
total_swap_pages += p->pages;
assert_spin_locked(&swap_lock);
- BUG_ON(!list_empty(&p->list));
- /*
- * insert into swap list; the list is in priority order,
- * so that get_swap_page() can get a page from the highest
- * priority swap_info_struct with available page(s), and
- * swapoff can adjust the auto-assigned (i.e. negative) prio
- * values for any lower-priority swap_info_structs when
- * removing a negative-prio swap_info_struct
- */
- list_for_each_entry(si, &swap_list_head, list) {
- if (p->prio >= si->prio) {
- list_add_tail(&p->list, &si->list);
- return;
- }
- }
/*
- * this covers two cases:
- * 1) p->prio is less than all existing prio
- * 2) the swap list is empty
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
*/
- list_add_tail(&p->list, &swap_list_head);
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1821,7 +1849,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mapping = victim->f_mapping;
spin_lock(&swap_lock);
- list_for_each_entry(p, &swap_list_head, list) {
+ plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping) {
found = 1;
@@ -1841,16 +1869,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
- list_for_each_entry_continue(si, &swap_list_head, list) {
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
}
least_priority++;
}
- list_del_init(&p->list);
+ plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -2115,7 +2148,8 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
- INIT_LIST_HEAD(&p->list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-02 19:02 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
@ 2014-05-05 15:51 ` Dan Streetman
2014-05-05 19:13 ` Steven Rostedt
2014-05-12 11:11 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Mel Gorman
2 siblings, 0 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-05 15:51 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Rik van Riel,
Weijie Yang, Johannes Weiner, Linux-MM, linux-kernel, Shaohua Li
On Fri, May 2, 2014 at 3:02 PM, Dan Streetman <ddstreet@ieee.org> wrote:
> Originally get_swap_page() started iterating through the singly-linked
> list of swap_info_structs using swap_list.next or highest_priority_index,
> which both were intended to point to the highest priority active swap
> target that was not full. The first patch in this series changed the
> singly-linked list to a doubly-linked list, and removed the logic to start
> at the highest priority non-full entry; it starts scanning at the highest
> priority entry each time, even if the entry is full.
>
> Replace the manually ordered swap_list_head with a plist, renamed to
> swap_active_head for clarity. Add a new plist, swap_avail_head.
> The original swap_active_head plist contains all active swap_info_structs,
> as before, while the new swap_avail_head plist contains only
> swap_info_structs that are active and available, i.e. not full.
> Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list.
>
> Mel Gorman suggested using plists since they internally handle ordering
> the list entries based on priority, which is exactly what swap was doing
> manually. All the ordering code is now removed, and swap_info_struct
entries are simply added to their corresponding plist and automatically
> ordered correctly.
>
> Using a new plist for available swap_info_structs simplifies and
> optimizes get_swap_page(), which no longer has to iterate over full
> swap_info_structs. Using a new spinlock for swap_avail_head plist
> allows each swap_info_struct to add or remove themselves from the
> plist when they become full or not-full; previously they could not
> do so because the swap_info_struct->lock is held when they change
> from full<->not-full, and the swap_lock protecting the main
> swap_active_head must be ordered before any swap_info_struct->lock.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Shaohua Li <shli@fusionio.com>
>
> ---
>
> Mel, I tried moving the ordering and rotating code into common list functions
> and I also tried plists, and you were right, using plists is much simpler and
> more maintainable. The only required update to plist is the plist_rotate()
> function, which is even simpler to use in get_swap_page() than the
> list_rotate_left() function.
>
> After looking more closely at plists, I don't see how they would reduce
> performance, so I don't think there is any concern there, although Shaohua if
> you have time it might be nice to check this updated patch set's performance.
> I will note that if CONFIG_DEBUG_PI_LIST is set, there's quite a lot of list
> checking going on for each list modification including rotate; that config is
> set if "RT Mutex debugging, deadlock detection" is set, so I assume in that
> case overall system performance is expected to be less than optimal.
>
> Also, I might have over-commented in this patch; if so I can remove/reduce
> some of it. :)
>
> Changelog since v1 https://lkml.org/lkml/2014/4/12/73
> -use plists instead of regular lists
> -update/add comments
>
> include/linux/swap.h | 3 +-
> include/linux/swapfile.h | 2 +-
> mm/frontswap.c | 6 +-
> mm/swapfile.c | 142 +++++++++++++++++++++++++++++------------------
> 4 files changed, 94 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 8bb85d6..9155bcd 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -214,7 +214,8 @@ struct percpu_cluster {
> struct swap_info_struct {
> unsigned long flags; /* SWP_USED etc: see above */
> signed short prio; /* swap priority of this type */
> - struct list_head list; /* entry in swap list */
> + struct plist_node list; /* entry in swap_active_head */
> + struct plist_node avail_list; /* entry in swap_avail_head */
> signed char type; /* strange name for an index */
> unsigned int max; /* extent of the swap_map */
> unsigned char *swap_map; /* vmalloc'ed array of usage counts */
> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> index 2eab382..388293a 100644
> --- a/include/linux/swapfile.h
> +++ b/include/linux/swapfile.h
> @@ -6,7 +6,7 @@
> * want to expose them to the dozens of source files that include swap.h
> */
> extern spinlock_t swap_lock;
> -extern struct list_head swap_list_head;
> +extern struct plist_head swap_active_head;
> extern struct swap_info_struct *swap_info[];
> extern int try_to_unuse(unsigned int, bool, unsigned long);
>
> diff --git a/mm/frontswap.c b/mm/frontswap.c
> index fae1160..c30eec5 100644
> --- a/mm/frontswap.c
> +++ b/mm/frontswap.c
> @@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void)
> struct swap_info_struct *si = NULL;
>
> assert_spin_locked(&swap_lock);
> - list_for_each_entry(si, &swap_list_head, list)
> + plist_for_each_entry(si, &swap_active_head, list)
> totalpages += atomic_read(&si->frontswap_pages);
> return totalpages;
> }
> @@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
> unsigned long pages = 0, pages_to_unuse = 0;
>
> assert_spin_locked(&swap_lock);
> - list_for_each_entry(si, &swap_list_head, list) {
> + plist_for_each_entry(si, &swap_active_head, list) {
> si_frontswap_pages = atomic_read(&si->frontswap_pages);
> if (total_pages_to_unuse < si_frontswap_pages) {
> pages = pages_to_unuse = total_pages_to_unuse;
> @@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
> /*
> * we don't want to hold swap_lock while doing a very
> * lengthy try_to_unuse, but swap_list may change
> - * so restart scan from swap_list_head each time
> + * so restart scan from swap_active_head each time
> */
> spin_lock(&swap_lock);
> ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 6c95a8c..ec230e3 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry ";
> * all active swap_info_structs
> * protected with swap_lock, and ordered by priority.
> */
> -LIST_HEAD(swap_list_head);
> +PLIST_HEAD(swap_active_head);
> +
> +/*
> + * all available (active, not full) swap_info_structs
> + * protected with swap_avail_lock, ordered by priority.
> + * This is used by get_swap_page() instead of swap_active_head
> + * because swap_active_head includes all swap_info_structs,
> + * but get_swap_page() doesn't need to look at full ones.
> + * This uses its own lock instead of swap_lock because when a
> + * swap_info_struct changes between not-full/full, it needs to
> + * add/remove itself to/from this list, but the swap_info_struct->lock
> + * is held and the locking order requires swap_lock to be taken
> + * before any swap_info_struct->lock.
> + */
> +static PLIST_HEAD(swap_avail_head);
> +static DEFINE_SPINLOCK(swap_avail_lock);
>
> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>
> @@ -594,6 +609,9 @@ checks:
> if (si->inuse_pages == si->pages) {
> si->lowest_bit = si->max;
> si->highest_bit = 0;
> + spin_lock(&swap_avail_lock);
> + plist_del(&si->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> }
> si->swap_map[offset] = usage;
> inc_cluster_info_page(si, si->cluster_info, offset);
> @@ -645,57 +663,60 @@ swp_entry_t get_swap_page(void)
> {
> struct swap_info_struct *si, *next;
> pgoff_t offset;
> - struct list_head *tmp;
>
> - spin_lock(&swap_lock);
> if (atomic_long_read(&nr_swap_pages) <= 0)
> goto noswap;
> atomic_long_dec(&nr_swap_pages);
>
> - list_for_each(tmp, &swap_list_head) {
> - si = list_entry(tmp, typeof(*si), list);
> + spin_lock(&swap_avail_lock);
> +start_over:
> + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
> + /* rotate si to tail of same-priority siblings */
> + plist_rotate(&si->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> spin_lock(&si->lock);
> if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
> + spin_lock(&swap_avail_lock);
> + if (plist_node_empty(&si->avail_list)) {
> + spin_unlock(&si->lock);
> + goto nextsi;
> + }
> + WARN(!si->highest_bit,
> + "swap_info %d in list but !highest_bit\n",
> + si->type);
> + WARN(!(si->flags & SWP_WRITEOK),
> + "swap_info %d in list but !SWP_WRITEOK\n",
> + si->type);
> + plist_del(&si->avail_list, &swap_avail_head);
> spin_unlock(&si->lock);
> - continue;
> + goto nextsi;
> }
>
> - /*
> - * rotate the current swap_info that we're going to use
> - * to after any other swap_info that have the same prio,
> - * so that all equal-priority swap_info get used equally
> - */
> - next = si;
> - list_for_each_entry_continue(next, &swap_list_head, list) {
> - if (si->prio != next->prio)
> - break;
> - list_rotate_left(&si->list);
> - next = si;
> - }
> -
> - spin_unlock(&swap_lock);
> /* This is called for allocating swap entry for cache */
> offset = scan_swap_map(si, SWAP_HAS_CACHE);
> spin_unlock(&si->lock);
> if (offset)
> return swp_entry(si->type, offset);
> - spin_lock(&swap_lock);
> + pr_debug("scan_swap_map of si %d failed to find offset\n",
> + si->type);
I forgot to mention I changed this from printk(KERN_DEBUG to pr_debug
between v1 and v2. Not sure if a scan_swap_map() failure should be
always printed or only during debug...
> + spin_lock(&swap_avail_lock);
> +nextsi:
> /*
> * if we got here, it's likely that si was almost full before,
> * and since scan_swap_map() can drop the si->lock, multiple
> * callers probably all tried to get a page from the same si
> - * and it filled up before we could get one. So we need to
> - * try again. Since we dropped the swap_lock, there may now
> - * be non-full higher priority swap_infos, and this si may have
> - * even been removed from the list (although very unlikely).
> - * Let's start over.
> + * and it filled up before we could get one; or, the si filled
> + * up between us dropping swap_avail_lock and taking si->lock.
> + * Since we dropped the swap_avail_lock, the swap_avail_head
> + * list may have been modified; so if next is still in the
> + * swap_avail_head list then try it, otherwise start over.
> */
> - tmp = &swap_list_head;
> + if (plist_node_empty(&next->avail_list))
> + goto start_over;
One note I want to point out...if we get here, we tried the highest
priority swap_info_struct, and it filled up; so the options are
either:
1. start over at the beginning
2. continue at next
Under most circumstances, the beginning == next. But, a higher
priority swap_info_struct may have freed page(s) and been added back
into the list.
The danger of continuing with next, if it's still in the list, appears
(to me) to be if next was the last swap_info_struct in the list, and
it also fills up and fails, then get_swap_page() will fail, even
though there may be higher priority swap_info_struct(s) that became
available.
However the danger of starting over in every case, I think, is
continuing to fail repeatedly in scan_swap_map() - for example under
heavy swap, if swap_info_struct(s) are bouncing between full and
not-full (I don't know how likely/common this is). Especially if
there are lower priority swap_info_struct(s) with plenty of room, that
may unnecessarily delay threads trying to get a swap page.
So I'm not sure if doing it this way, continuing with next, is better
than just always starting over.
One option may be to continue at next, but also after the
plist_for_each_entry_safe loop (i.e. complete failure), do:
if (!plist_head_empty(&swap_avail_head))
goto start_over;
so as to prevent a get_swap_page() failure when there actually are
still some pages available...of course, that probably would only ever
get reached if the system is very, very close to completely filling up
swap, so will it really matter if get_swap_page() fails slightly
before all swap is full or if threads battle until the bitter end when
there is actually no swap left...
> }
>
> atomic_long_inc(&nr_swap_pages);
> noswap:
> - spin_unlock(&swap_lock);
> return (swp_entry_t) {0};
> }
>
> @@ -798,8 +819,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
> dec_cluster_info_page(p, p->cluster_info, offset);
> if (offset < p->lowest_bit)
> p->lowest_bit = offset;
> - if (offset > p->highest_bit)
> + if (offset > p->highest_bit) {
> + bool was_full = !p->highest_bit;
> p->highest_bit = offset;
> + if (was_full && (p->flags & SWP_WRITEOK)) {
> + spin_lock(&swap_avail_lock);
> + WARN_ON(!plist_node_empty(&p->avail_list));
> + if (plist_node_empty(&p->avail_list))
> + plist_add(&p->avail_list,
> + &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> + }
> + }
> atomic_long_inc(&nr_swap_pages);
> p->inuse_pages--;
> frontswap_invalidate_page(p->type, offset);
> @@ -1734,12 +1765,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
> unsigned char *swap_map,
> struct swap_cluster_info *cluster_info)
> {
> - struct swap_info_struct *si;
> -
> if (prio >= 0)
> p->prio = prio;
> else
> p->prio = --least_priority;
> + /*
> + * the plist prio is negated because plist ordering is
> + * low-to-high, while swap ordering is high-to-low
> + */
> + p->list.prio = -p->prio;
> + p->avail_list.prio = -p->prio;
> p->swap_map = swap_map;
> p->cluster_info = cluster_info;
> p->flags |= SWP_WRITEOK;
> @@ -1747,27 +1782,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
> total_swap_pages += p->pages;
>
> assert_spin_locked(&swap_lock);
> - BUG_ON(!list_empty(&p->list));
> - /*
> - * insert into swap list; the list is in priority order,
> - * so that get_swap_page() can get a page from the highest
> - * priority swap_info_struct with available page(s), and
> - * swapoff can adjust the auto-assigned (i.e. negative) prio
> - * values for any lower-priority swap_info_structs when
> - * removing a negative-prio swap_info_struct
> - */
> - list_for_each_entry(si, &swap_list_head, list) {
> - if (p->prio >= si->prio) {
> - list_add_tail(&p->list, &si->list);
> - return;
> - }
> - }
> /*
> - * this covers two cases:
> - * 1) p->prio is less than all existing prio
> - * 2) the swap list is empty
> + * both lists are plists, and thus priority ordered.
> + * swap_active_head needs to be priority ordered for swapoff(),
> + * which on removal of any swap_info_struct with an auto-assigned
> + * (i.e. negative) priority increments the auto-assigned priority
> + * of any lower-priority swap_info_structs.
> + * swap_avail_head needs to be priority ordered for get_swap_page(),
> + * which allocates swap pages from the highest available priority
> + * swap_info_struct.
> */
> - list_add_tail(&p->list, &swap_list_head);
> + plist_add(&p->list, &swap_active_head);
> + spin_lock(&swap_avail_lock);
> + plist_add(&p->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> }
>
> static void enable_swap_info(struct swap_info_struct *p, int prio,
> @@ -1821,7 +1849,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>
> mapping = victim->f_mapping;
> spin_lock(&swap_lock);
> - list_for_each_entry(p, &swap_list_head, list) {
> + plist_for_each_entry(p, &swap_active_head, list) {
> if (p->flags & SWP_WRITEOK) {
> if (p->swap_file->f_mapping == mapping) {
> found = 1;
> @@ -1841,16 +1869,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> spin_unlock(&swap_lock);
> goto out_dput;
> }
> + spin_lock(&swap_avail_lock);
> + plist_del(&p->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> spin_lock(&p->lock);
> if (p->prio < 0) {
> struct swap_info_struct *si = p;
>
> - list_for_each_entry_continue(si, &swap_list_head, list) {
> + plist_for_each_entry_continue(si, &swap_active_head, list) {
> si->prio++;
> + si->list.prio--;
> + si->avail_list.prio--;
> }
> least_priority++;
> }
> - list_del_init(&p->list);
> + plist_del(&p->list, &swap_active_head);
> atomic_long_sub(p->pages, &nr_swap_pages);
> total_swap_pages -= p->pages;
> p->flags &= ~SWP_WRITEOK;
> @@ -2115,7 +2148,8 @@ static struct swap_info_struct *alloc_swap_info(void)
> */
> }
> INIT_LIST_HEAD(&p->first_swap_extent.list);
> - INIT_LIST_HEAD(&p->list);
> + plist_node_init(&p->list, 0);
> + plist_node_init(&p->avail_list, 0);
> p->flags = SWP_USED;
> spin_unlock(&swap_lock);
> spin_lock_init(&p->lock);
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-02 19:02 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
2014-05-05 15:51 ` Dan Streetman
@ 2014-05-05 19:13 ` Steven Rostedt
2014-05-05 19:38 ` Peter Zijlstra
2014-05-09 20:42 ` [PATCH] plist: make CONFIG_DEBUG_PI_LIST selectable Dan Streetman
2014-05-12 11:11 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Mel Gorman
2 siblings, 2 replies; 45+ messages in thread
From: Steven Rostedt @ 2014-05-05 19:13 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
linux-mm, linux-kernel, Shaohua Li, Peter Zijlstra
On Fri, May 02, 2014 at 03:02:30PM -0400, Dan Streetman wrote:
> Originally get_swap_page() started iterating through the singly-linked
> list of swap_info_structs using swap_list.next or highest_priority_index,
> which both were intended to point to the highest priority active swap
> target that was not full. The first patch in this series changed the
> singly-linked list to a doubly-linked list, and removed the logic to start
> at the highest priority non-full entry; it starts scanning at the highest
> priority entry each time, even if the entry is full.
>
> Replace the manually ordered swap_list_head with a plist, renamed to
> swap_active_head for clarity. Add a new plist, swap_avail_head.
> The original swap_active_head plist contains all active swap_info_structs,
> as before, while the new swap_avail_head plist contains only
> swap_info_structs that are active and available, i.e. not full.
> Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list.
>
> Mel Gorman suggested using plists since they internally handle ordering
> the list entries based on priority, which is exactly what swap was doing
> manually. All the ordering code is now removed, and swap_info_struct
> entries and simply added to their corresponding plist and automatically
> ordered correctly.
>
> Using a new plist for available swap_info_structs simplifies and
> optimizes get_swap_page(), which no longer has to iterate over full
> swap_info_structs. Using a new spinlock for swap_avail_head plist
> allows each swap_info_struct to add or remove themselves from the
> plist when they become full or not-full; previously they could not
> do so because the swap_info_struct->lock is held when they change
> from full<->not-full, and the swap_lock protecting the main
> swap_active_head must be ordered before any swap_info_struct->lock.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Shaohua Li <shli@fusionio.com>
>
> ---
>
> Mel, I tried moving the ordering and rotating code into common list functions
> and I also tried plists, and you were right, using plists is much simpler and
> more maintainable. The only required update to plist is the plist_rotate()
> function, which is even simpler to use in get_swap_page() than the
> list_rotate_left() function.
>
> After looking more closely at plists, I don't see how they would reduce
> performance, so I don't think there is any concern there, although Shaohua if
> you have time it might be nice to check this updated patch set's performance.
> I will note that if CONFIG_DEBUG_PI_LIST is set, there's quite a lot of list
> checking going on for each list modification including rotate; that config is
> set if "RT Mutex debugging, deadlock detection" is set, so I assume in that
> case overall system performance is expected to be less than optimal.
I know Peter Zijlstra was doing some work to convert the rtmutex code to use
rb-trees instead of plists. Peter is that moving forward?
If there are other users of plists we should remove the dependency from the
DEBUG_RT_MUTEX and DEBUG_PI_LIST.
>
> Also, I might have over-commented in this patch; if so I can remove/reduce
> some of it. :)
"over-commented"?? There's no such word ;-)
-- Steve
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-05 19:13 ` Steven Rostedt
@ 2014-05-05 19:38 ` Peter Zijlstra
2014-05-09 20:42 ` [PATCH] plist: make CONFIG_DEBUG_PI_LIST selectable Dan Streetman
1 sibling, 0 replies; 45+ messages in thread
From: Peter Zijlstra @ 2014-05-05 19:38 UTC (permalink / raw)
To: Steven Rostedt
Cc: Dan Streetman, Hugh Dickins, Andrew Morton, Mel Gorman,
Michal Hocko, Christian Ehrhardt, Rik van Riel, Weijie Yang,
Johannes Weiner, linux-mm, linux-kernel, Shaohua Li
[-- Attachment #1: Type: text/plain, Size: 590 bytes --]
On Mon, May 05, 2014 at 03:13:41PM -0400, Steven Rostedt wrote:
> On Fri, May 02, 2014 at 03:02:30PM -0400, Dan Streetman wrote:
> I know Peter Zijlstra was doing some work to convert the rtmutex code to use
> rb-trees instead of plists. Peter is that moving forward?
commit fb00aca474405f4fa8a8519c3179fed722eabd83
Author: Peter Zijlstra <peterz@infradead.org>
Date: Thu Nov 7 14:43:43 2013 +0100
rtmutex: Turn the plist into an rb-tree
That one you mean? Yeah it should be in:
# git describe --contains fb00aca474405f4fa8a8519c3179fed722eabd83 --match "v*"
v3.14-rc1~170^2~39
[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCH] plist: make CONFIG_DEBUG_PI_LIST selectable
2014-05-05 19:13 ` Steven Rostedt
2014-05-05 19:38 ` Peter Zijlstra
@ 2014-05-09 20:42 ` Dan Streetman
2014-05-09 21:17 ` Steven Rostedt
1 sibling, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-09 20:42 UTC (permalink / raw)
To: Steven Rostedt
Cc: Dan Streetman, Hugh Dickins, Andrew Morton, Mel Gorman,
Michal Hocko, Christian Ehrhardt, Rik van Riel, Weijie Yang,
Johannes Weiner, Shaohua Li, linux-mm, linux-kernel,
Peter Zijlstra
Change CONFIG_DEBUG_PI_LIST to be user-selectable, and add a
title and description. Remove the dependency on DEBUG_RT_MUTEXES
since they were changed to use rbtrees, and there are other users
of plists now.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
lib/Kconfig.debug | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e816930..a3b1d68 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -864,11 +864,6 @@ config DEBUG_RT_MUTEXES
This allows rt mutex semantics violations and rt mutex related
deadlocks (lockups) to be detected and reported automatically.
-config DEBUG_PI_LIST
- bool
- default y
- depends on DEBUG_RT_MUTEXES
-
config RT_MUTEX_TESTER
bool "Built-in scriptable tester for rt-mutexes"
depends on DEBUG_KERNEL && RT_MUTEXES
@@ -1094,6 +1089,16 @@ config DEBUG_LIST
If unsure, say N.
+config DEBUG_PI_LIST
+ bool "Debug priority linked list manipulation"
+ depends on DEBUG_KERNEL
+ help
+ Enable this to turn on extended checks in the priority-ordered
+ linked-list (plist) walking routines. This checks the entire
+ list multiple times during each manipulation.
+
+ If unsure, say N.
+
config DEBUG_SG
bool "Debug SG table operations"
depends on DEBUG_KERNEL
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCH] plist: make CONFIG_DEBUG_PI_LIST selectable
2014-05-09 20:42 ` [PATCH] plist: make CONFIG_DEBUG_PI_LIST selectable Dan Streetman
@ 2014-05-09 21:17 ` Steven Rostedt
0 siblings, 0 replies; 45+ messages in thread
From: Steven Rostedt @ 2014-05-09 21:17 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Mel Gorman, Michal Hocko,
Christian Ehrhardt, Rik van Riel, Weijie Yang, Johannes Weiner,
Shaohua Li, linux-mm, linux-kernel, Peter Zijlstra
On Fri, 9 May 2014 16:42:24 -0400
Dan Streetman <ddstreet@ieee.org> wrote:
> Change CONFIG_DEBUG_PI_LIST to be user-selectable, and add a
> title and description. Remove the dependency on DEBUG_RT_MUTEXES
> since they were changed to use rbtrees, and there are other users
> of plists now.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> ---
Acked-by: Steven Rostedt <rostedt@goodmis.org>
-- Steve
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-02 19:02 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
2014-05-05 15:51 ` Dan Streetman
2014-05-05 19:13 ` Steven Rostedt
@ 2014-05-12 11:11 ` Mel Gorman
2014-05-12 13:00 ` Dan Streetman
2 siblings, 1 reply; 45+ messages in thread
From: Mel Gorman @ 2014-05-12 11:11 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Michal Hocko, Christian Ehrhardt,
Rik van Riel, Weijie Yang, Johannes Weiner, linux-mm,
linux-kernel, Shaohua Li
On Fri, May 02, 2014 at 03:02:30PM -0400, Dan Streetman wrote:
> Originally get_swap_page() started iterating through the singly-linked
> list of swap_info_structs using swap_list.next or highest_priority_index,
> which both were intended to point to the highest priority active swap
> target that was not full. The first patch in this series changed the
> singly-linked list to a doubly-linked list, and removed the logic to start
> at the highest priority non-full entry; it starts scanning at the highest
> priority entry each time, even if the entry is full.
>
> Replace the manually ordered swap_list_head with a plist, renamed to
> swap_active_head for clarity. Add a new plist, swap_avail_head.
> The original swap_active_head plist contains all active swap_info_structs,
> as before, while the new swap_avail_head plist contains only
> swap_info_structs that are active and available, i.e. not full.
> Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list.
>
> Mel Gorman suggested using plists since they internally handle ordering
> the list entries based on priority, which is exactly what swap was doing
> manually. All the ordering code is now removed, and swap_info_struct
> entries are simply added to their corresponding plist and automatically
> ordered correctly.
>
> Using a new plist for available swap_info_structs simplifies and
> optimizes get_swap_page(), which no longer has to iterate over full
> swap_info_structs. Using a new spinlock for swap_avail_head plist
> allows each swap_info_struct to add or remove themselves from the
> plist when they become full or not-full; previously they could not
> do so because the swap_info_struct->lock is held when they change
> from full<->not-full, and the swap_lock protecting the main
> swap_active_head must be ordered before any swap_info_struct->lock.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Shaohua Li <shli@fusionio.com>
>
> ---
>
> Mel, I tried moving the ordering and rotating code into common list functions
> and I also tried plists, and you were right, using plists is much simpler and
> more maintainable. The only required update to plist is the plist_rotate()
> function, which is even simpler to use in get_swap_page() than the
> list_rotate_left() function.
>
> After looking more closely at plists, I don't see how they would reduce
> performance, so I don't think there is any concern there, although Shaohua if
> you have time it might be nice to check this updated patch set's performance.
> I will note that if CONFIG_DEBUG_PI_LIST is set, there's quite a lot of list
> checking going on for each list modification including rotate; that config is
> set if "RT Mutex debugging, deadlock detection" is set, so I assume in that
> case overall system performance is expected to be less than optimal.
>
> Also, I might have over-commented in this patch; if so I can remove/reduce
> some of it. :)
>
> Changelog since v1 https://lkml.org/lkml/2014/4/12/73
> -use plists instead of regular lists
> -update/add comments
>
> include/linux/swap.h | 3 +-
> include/linux/swapfile.h | 2 +-
> mm/frontswap.c | 6 +-
> mm/swapfile.c | 142 +++++++++++++++++++++++++++++------------------
> 4 files changed, 94 insertions(+), 59 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 8bb85d6..9155bcd 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -214,7 +214,8 @@ struct percpu_cluster {
> struct swap_info_struct {
> unsigned long flags; /* SWP_USED etc: see above */
> signed short prio; /* swap priority of this type */
> - struct list_head list; /* entry in swap list */
> + struct plist_node list; /* entry in swap_active_head */
> + struct plist_node avail_list; /* entry in swap_avail_head */
> signed char type; /* strange name for an index */
> unsigned int max; /* extent of the swap_map */
> unsigned char *swap_map; /* vmalloc'ed array of usage counts */
> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> index 2eab382..388293a 100644
> --- a/include/linux/swapfile.h
> +++ b/include/linux/swapfile.h
> @@ -6,7 +6,7 @@
> * want to expose them to the dozens of source files that include swap.h
> */
> extern spinlock_t swap_lock;
> -extern struct list_head swap_list_head;
> +extern struct plist_head swap_active_head;
> extern struct swap_info_struct *swap_info[];
> extern int try_to_unuse(unsigned int, bool, unsigned long);
>
> diff --git a/mm/frontswap.c b/mm/frontswap.c
> index fae1160..c30eec5 100644
> --- a/mm/frontswap.c
> +++ b/mm/frontswap.c
> @@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void)
> struct swap_info_struct *si = NULL;
>
> assert_spin_locked(&swap_lock);
> - list_for_each_entry(si, &swap_list_head, list)
> + plist_for_each_entry(si, &swap_active_head, list)
> totalpages += atomic_read(&si->frontswap_pages);
> return totalpages;
> }
> @@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
> unsigned long pages = 0, pages_to_unuse = 0;
>
> assert_spin_locked(&swap_lock);
> - list_for_each_entry(si, &swap_list_head, list) {
> + plist_for_each_entry(si, &swap_active_head, list) {
> si_frontswap_pages = atomic_read(&si->frontswap_pages);
> if (total_pages_to_unuse < si_frontswap_pages) {
> pages = pages_to_unuse = total_pages_to_unuse;
> @@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
> /*
> * we don't want to hold swap_lock while doing a very
> * lengthy try_to_unuse, but swap_list may change
> - * so restart scan from swap_list_head each time
> + * so restart scan from swap_active_head each time
> */
> spin_lock(&swap_lock);
> ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 6c95a8c..ec230e3 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry ";
> * all active swap_info_structs
> * protected with swap_lock, and ordered by priority.
> */
> -LIST_HEAD(swap_list_head);
> +PLIST_HEAD(swap_active_head);
> +
> +/*
> + * all available (active, not full) swap_info_structs
> + * protected with swap_avail_lock, ordered by priority.
> + * This is used by get_swap_page() instead of swap_active_head
> + * because swap_active_head includes all swap_info_structs,
> + * but get_swap_page() doesn't need to look at full ones.
> + * This uses its own lock instead of swap_lock because when a
> + * swap_info_struct changes between not-full/full, it needs to
> + * add/remove itself to/from this list, but the swap_info_struct->lock
> + * is held and the locking order requires swap_lock to be taken
> + * before any swap_info_struct->lock.
> + */
> +static PLIST_HEAD(swap_avail_head);
> +static DEFINE_SPINLOCK(swap_avail_lock);
>
> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>
> @@ -594,6 +609,9 @@ checks:
> if (si->inuse_pages == si->pages) {
> si->lowest_bit = si->max;
> si->highest_bit = 0;
> + spin_lock(&swap_avail_lock);
> + plist_del(&si->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> }
> si->swap_map[offset] = usage;
> inc_cluster_info_page(si, si->cluster_info, offset);
> @@ -645,57 +663,60 @@ swp_entry_t get_swap_page(void)
> {
> struct swap_info_struct *si, *next;
> pgoff_t offset;
> - struct list_head *tmp;
>
> - spin_lock(&swap_lock);
> if (atomic_long_read(&nr_swap_pages) <= 0)
> goto noswap;
> atomic_long_dec(&nr_swap_pages);
>
> - list_for_each(tmp, &swap_list_head) {
> - si = list_entry(tmp, typeof(*si), list);
> + spin_lock(&swap_avail_lock);
> +start_over:
> + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
> + /* rotate si to tail of same-priority siblings */
> + plist_rotate(&si->avail_list, &swap_avail_head);
> + spin_unlock(&swap_avail_lock);
> spin_lock(&si->lock);
> if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
> + spin_lock(&swap_avail_lock);
> + if (plist_node_empty(&si->avail_list)) {
> + spin_unlock(&si->lock);
> + goto nextsi;
> + }
It's a corner case but rather than dropping the swap_avail_lock early and
retaking it to remove an entry from avail_list, you could just drop it
after this check but before the scan_swap_map. It's not a big deal so
whether you change this or not
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 45+ messages in thread
* Re: [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-12 11:11 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Mel Gorman
@ 2014-05-12 13:00 ` Dan Streetman
0 siblings, 0 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 13:00 UTC (permalink / raw)
To: Mel Gorman
Cc: Hugh Dickins, Andrew Morton, Michal Hocko, Christian Ehrhardt,
Rik van Riel, Weijie Yang, Johannes Weiner, Linux-MM,
linux-kernel, Shaohua Li
On Mon, May 12, 2014 at 7:11 AM, Mel Gorman <mgorman@suse.de> wrote:
> On Fri, May 02, 2014 at 03:02:30PM -0400, Dan Streetman wrote:
>> Originally get_swap_page() started iterating through the singly-linked
>> list of swap_info_structs using swap_list.next or highest_priority_index,
>> which both were intended to point to the highest priority active swap
>> target that was not full. The first patch in this series changed the
>> singly-linked list to a doubly-linked list, and removed the logic to start
>> at the highest priority non-full entry; it starts scanning at the highest
>> priority entry each time, even if the entry is full.
>>
>> Replace the manually ordered swap_list_head with a plist, renamed to
>> swap_active_head for clarity. Add a new plist, swap_avail_head.
>> The original swap_active_head plist contains all active swap_info_structs,
>> as before, while the new swap_avail_head plist contains only
>> swap_info_structs that are active and available, i.e. not full.
>> Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list.
>>
>> Mel Gorman suggested using plists since they internally handle ordering
>> the list entries based on priority, which is exactly what swap was doing
>> manually. All the ordering code is now removed, and swap_info_struct
>> entries are simply added to their corresponding plist and automatically
>> ordered correctly.
>>
>> Using a new plist for available swap_info_structs simplifies and
>> optimizes get_swap_page(), which no longer has to iterate over full
>> swap_info_structs. Using a new spinlock for swap_avail_head plist
>> allows each swap_info_struct to add or remove themselves from the
>> plist when they become full or not-full; previously they could not
>> do so because the swap_info_struct->lock is held when they change
>> from full<->not-full, and the swap_lock protecting the main
>> swap_active_head must be ordered before any swap_info_struct->lock.
>>
>> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Shaohua Li <shli@fusionio.com>
>>
>> ---
>>
>> Mel, I tried moving the ordering and rotating code into common list functions
>> and I also tried plists, and you were right, using plists is much simpler and
>> more maintainable. The only required update to plist is the plist_rotate()
>> function, which is even simpler to use in get_swap_page() than the
>> list_rotate_left() function.
>>
>> After looking more closely at plists, I don't see how they would reduce
>> performance, so I don't think there is any concern there, although Shaohua if
>> you have time it might be nice to check this updated patch set's performance.
>> I will note that if CONFIG_DEBUG_PI_LIST is set, there's quite a lot of list
>> checking going on for each list modification including rotate; that config is
>> set if "RT Mutex debugging, deadlock detection" is set, so I assume in that
>> case overall system performance is expected to be less than optimal.
>>
>> Also, I might have over-commented in this patch; if so I can remove/reduce
>> some of it. :)
>>
>> Changelog since v1 https://lkml.org/lkml/2014/4/12/73
>> -use plists instead of regular lists
>> -update/add comments
>>
>> include/linux/swap.h | 3 +-
>> include/linux/swapfile.h | 2 +-
>> mm/frontswap.c | 6 +-
>> mm/swapfile.c | 142 +++++++++++++++++++++++++++++------------------
>> 4 files changed, 94 insertions(+), 59 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 8bb85d6..9155bcd 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -214,7 +214,8 @@ struct percpu_cluster {
>> struct swap_info_struct {
>> unsigned long flags; /* SWP_USED etc: see above */
>> signed short prio; /* swap priority of this type */
>> - struct list_head list; /* entry in swap list */
>> + struct plist_node list; /* entry in swap_active_head */
>> + struct plist_node avail_list; /* entry in swap_avail_head */
>> signed char type; /* strange name for an index */
>> unsigned int max; /* extent of the swap_map */
>> unsigned char *swap_map; /* vmalloc'ed array of usage counts */
>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
>> index 2eab382..388293a 100644
>> --- a/include/linux/swapfile.h
>> +++ b/include/linux/swapfile.h
>> @@ -6,7 +6,7 @@
>> * want to expose them to the dozens of source files that include swap.h
>> */
>> extern spinlock_t swap_lock;
>> -extern struct list_head swap_list_head;
>> +extern struct plist_head swap_active_head;
>> extern struct swap_info_struct *swap_info[];
>> extern int try_to_unuse(unsigned int, bool, unsigned long);
>>
>> diff --git a/mm/frontswap.c b/mm/frontswap.c
>> index fae1160..c30eec5 100644
>> --- a/mm/frontswap.c
>> +++ b/mm/frontswap.c
>> @@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void)
>> struct swap_info_struct *si = NULL;
>>
>> assert_spin_locked(&swap_lock);
>> - list_for_each_entry(si, &swap_list_head, list)
>> + plist_for_each_entry(si, &swap_active_head, list)
>> totalpages += atomic_read(&si->frontswap_pages);
>> return totalpages;
>> }
>> @@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
>> unsigned long pages = 0, pages_to_unuse = 0;
>>
>> assert_spin_locked(&swap_lock);
>> - list_for_each_entry(si, &swap_list_head, list) {
>> + plist_for_each_entry(si, &swap_active_head, list) {
>> si_frontswap_pages = atomic_read(&si->frontswap_pages);
>> if (total_pages_to_unuse < si_frontswap_pages) {
>> pages = pages_to_unuse = total_pages_to_unuse;
>> @@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
>> /*
>> * we don't want to hold swap_lock while doing a very
>> * lengthy try_to_unuse, but swap_list may change
>> - * so restart scan from swap_list_head each time
>> + * so restart scan from swap_active_head each time
>> */
>> spin_lock(&swap_lock);
>> ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 6c95a8c..ec230e3 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry ";
>> * all active swap_info_structs
>> * protected with swap_lock, and ordered by priority.
>> */
>> -LIST_HEAD(swap_list_head);
>> +PLIST_HEAD(swap_active_head);
>> +
>> +/*
>> + * all available (active, not full) swap_info_structs
>> + * protected with swap_avail_lock, ordered by priority.
>> + * This is used by get_swap_page() instead of swap_active_head
>> + * because swap_active_head includes all swap_info_structs,
>> + * but get_swap_page() doesn't need to look at full ones.
>> + * This uses its own lock instead of swap_lock because when a
>> + * swap_info_struct changes between not-full/full, it needs to
>> + * add/remove itself to/from this list, but the swap_info_struct->lock
>> + * is held and the locking order requires swap_lock to be taken
>> + * before any swap_info_struct->lock.
>> + */
>> +static PLIST_HEAD(swap_avail_head);
>> +static DEFINE_SPINLOCK(swap_avail_lock);
>>
>> struct swap_info_struct *swap_info[MAX_SWAPFILES];
>>
>> @@ -594,6 +609,9 @@ checks:
>> if (si->inuse_pages == si->pages) {
>> si->lowest_bit = si->max;
>> si->highest_bit = 0;
>> + spin_lock(&swap_avail_lock);
>> + plist_del(&si->avail_list, &swap_avail_head);
>> + spin_unlock(&swap_avail_lock);
>> }
>> si->swap_map[offset] = usage;
>> inc_cluster_info_page(si, si->cluster_info, offset);
>> @@ -645,57 +663,60 @@ swp_entry_t get_swap_page(void)
>> {
>> struct swap_info_struct *si, *next;
>> pgoff_t offset;
>> - struct list_head *tmp;
>>
>> - spin_lock(&swap_lock);
>> if (atomic_long_read(&nr_swap_pages) <= 0)
>> goto noswap;
>> atomic_long_dec(&nr_swap_pages);
>>
>> - list_for_each(tmp, &swap_list_head) {
>> - si = list_entry(tmp, typeof(*si), list);
>> + spin_lock(&swap_avail_lock);
>> +start_over:
>> + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
>> + /* rotate si to tail of same-priority siblings */
>> + plist_rotate(&si->avail_list, &swap_avail_head);
>> + spin_unlock(&swap_avail_lock);
>> spin_lock(&si->lock);
>> if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
>> + spin_lock(&swap_avail_lock);
>> + if (plist_node_empty(&si->avail_list)) {
>> + spin_unlock(&si->lock);
>> + goto nextsi;
>> + }
>
> It's a corner case but rather than dropping the swap_avail_lock early and
> retaking it to remove an entry from avail_list, you could just drop it
> after this check but before the scan_swap_map. It's not a big deal so
> whether you change this or not
Well, the locking order requires si->lock before swap_avail_lock;
that's the primary reason for using the new lock instead of swap_lock,
so that during swap_entry_free(), while holding the si->lock, the
swap_avail_lock can be taken and the si removed from swap_avail_head.
So the swap_avail_lock has to be released before taking the si->lock,
and a side effect of that is the si might be removed from
swap_avail_head list between when we release swap_avail_lock and take
si->lock, in which case we just move on to the next si.
And as you said, it's definitely a corner case.
>
> Acked-by: Mel Gorman <mgorman@suse.de>
Let me send one more rev (in the next hour or so) - I missed a
spin_unlock(&swap_avail_lock) in the total failure case (my bad).
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration
2014-05-02 19:02 ` [PATCHv2 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
` (3 preceding siblings ...)
2014-05-02 19:02 ` [PATCH 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
@ 2014-05-12 16:38 ` Dan Streetman
2014-05-12 16:38 ` [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head Dan Streetman
` (3 more replies)
4 siblings, 4 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 16:38 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, Bob Liu, linux-mm, linux-kernel
Changes since v2 https://lkml.org/lkml/2014/5/2/497
Update patches 3 and 4; the third patch is changed mainly to use
plist_requeue() instead of plist_rotate(), and the forth patch
adds a missing spin_unlock() in get_swap_page() in the total failure
case.
The logic controlling the singly-linked list of swap_info_struct entries
for all active, i.e. swapon'ed, swap targets is rather complex, because:
-it stores the entries in priority order
-there is a pointer to the highest priority entry
-there is a pointer to the highest priority not-full entry
-there is a highest_priority_index variable set outside the swap_lock
-swap entries of equal priority should be used equally
this complexity leads to bugs such as:
https://lkml.org/lkml/2014/2/13/181
where different priority swap targets are incorrectly used equally.
That bug probably could be solved with the existing singly-linked lists,
but I think it would only add more complexity to the already difficult
to understand get_swap_page() swap_list iteration logic.
The first patch changes from a singly-linked list to a doubly-linked
list using list_heads; the highest_priority_index and related code are
removed and get_swap_page() starts each iteration at the highest priority
swap_info entry, even if it's full. While this does introduce
unnecessary list iteration (i.e. Schlemiel the painter's algorithm)
in the case where one or more of the highest priority entries are full,
the iteration and manipulation code is much simpler and behaves
correctly re: the above bug; and the fourth patch removes the unnecessary
iteration.
The second patch adds some minor plist helper functions; nothing new
really, just functions to match existing regular list functions. These
are used by the next two patches.
The third patch adds plist_requeue(), which is used by get_swap_page()
in the next patch - it performs the requeueing of same-priority entries
(which moves the entry to the end of its priority in the plist), so that
all equal-priority swap_info_structs get used equally.
The fourth patch converts the main list into a plist, and adds a new plist
that contains only swap_info entries that are both active and not full.
As Mel suggested using plists allows removing all the ordering code from
swap - plists handle ordering automatically. The list naming is also
clarified now that there are two lists, with the original list changed
from swap_list_head to swap_active_head and the new list named
swap_avail_head. A new spinlock is also added for the new list, so
swap_info entries can be added or removed from the new list immediately
as they become full or not full.
Dan Streetman (4):
swap: change swap_info singly-linked list to list_head
plist: add helper functions
plist: add plist_requeue
swap: change swap_list_head to plist, add swap_avail_head
include/linux/plist.h | 45 ++++++++++
include/linux/swap.h | 8 +-
include/linux/swapfile.h | 2 +-
lib/plist.c | 52 +++++++++++
mm/frontswap.c | 13 +--
mm/swapfile.c | 224 +++++++++++++++++++++++++----------------------
6 files changed, 221 insertions(+), 123 deletions(-)
--
1.8.3.1
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
@ 2014-05-12 16:38 ` Dan Streetman
2014-05-12 16:38 ` [PATCH 2/4] plist: add helper functions Dan Streetman
` (2 subsequent siblings)
3 siblings, 0 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 16:38 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, Bob Liu, linux-mm, linux-kernel,
Shaohua Li
Replace the singly-linked list tracking active, i.e. swapon'ed,
swap_info_struct entries with a doubly-linked list using struct list_heads.
Simplify the logic iterating and manipulating the list of entries,
especially get_swap_page(), by using standard list_head functions,
and removing the highest priority iteration logic.
The change fixes the bug:
https://lkml.org/lkml/2014/2/13/181
in which different priority swap entries after the highest priority entry
are incorrectly used equally in pairs. The swap behavior is now as
advertised, i.e. different priority swap entries are used in order, and
equal priority swap targets are used concurrently.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
---
This is only a resend, this patch isn't changed in this patch set.
Previous is at https://lkml.org/lkml/2014/5/2/489
include/linux/swap.h | 7 +-
include/linux/swapfile.h | 2 +-
mm/frontswap.c | 13 ++--
mm/swapfile.c | 171 ++++++++++++++++++++---------------------------
4 files changed, 78 insertions(+), 115 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5a14b92..8bb85d6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,8 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
+ struct list_head list; /* entry in swap list */
signed char type; /* strange name for an index */
- signed char next; /* next type on the swap list */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +255,6 @@ struct swap_info_struct {
struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
};
-struct swap_list_t {
- int head; /* head of priority-ordered swapfile list */
- int next; /* swapfile to be used next */
-};
-
/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624..2eab382 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
* want to expose them to the dozens of source files that include swap.h
*/
extern spinlock_t swap_lock;
-extern struct swap_list_t swap_list;
+extern struct list_head swap_list_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdc..fae1160 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
- int type;
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ list_for_each_entry(si, &swap_list_head, list)
totalpages += atomic_read(&si->frontswap_pages);
- }
return totalpages;
}
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
- int type;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ list_for_each_entry(si, &swap_list_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
- *swapid = type;
+ *swapid = si->type;
ret = 0;
break;
}
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list.head each time
+ * so restart scan from swap_list_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4a7f7e6..6c95a8c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+LIST_HEAD(swap_list_head);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -640,66 +643,54 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- int hp_index;
+ struct list_head *tmp;
spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- hp_index = atomic_xchg(&highest_priority_index, -1);
- /*
- * highest_priority_index records current highest priority swap
- * type which just frees swap entries. If its priority is
- * higher than that of swap_list.next swap type, we use it. It
- * isn't protected by swap_lock, so it can be an invalid value
- * if the corresponding swap type is swapoff. We double check
- * the flags here. It's even possible the swap type is swapoff
- * and swapon again and its priority is changed. In such rare
- * case, low prority swap type might be used, but eventually
- * high priority swap will be used after several rounds of
- * swap.
- */
- if (hp_index != -1 && hp_index != type &&
- swap_info[type]->prio < swap_info[hp_index]->prio &&
- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
- type = hp_index;
- swap_list.next = type;
- }
-
- si = swap_info[type];
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next]->prio)) {
- next = swap_list.head;
- wrapped++;
- }
-
+ list_for_each(tmp, &swap_list_head) {
+ si = list_entry(tmp, typeof(*si), list);
spin_lock(&si->lock);
- if (!si->highest_bit) {
- spin_unlock(&si->lock);
- continue;
- }
- if (!(si->flags & SWP_WRITEOK)) {
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_unlock(&si->lock);
continue;
}
- swap_list.next = next;
+ /*
+ * rotate the current swap_info that we're going to use
+ * to after any other swap_info that have the same prio,
+ * so that all equal-priority swap_info get used equally
+ */
+ next = si;
+ list_for_each_entry_continue(next, &swap_list_head, list) {
+ if (si->prio != next->prio)
+ break;
+ list_rotate_left(&si->list);
+ next = si;
+ }
spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
- return swp_entry(type, offset);
+ return swp_entry(si->type, offset);
spin_lock(&swap_lock);
- next = swap_list.next;
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one. So we need to
+ * try again. Since we dropped the swap_lock, there may now
+ * be non-full higher priority swap_infos, and this si may have
+ * even been removed from the list (although very unlikely).
+ * Let's start over.
+ */
+ tmp = &swap_list_head;
}
atomic_long_inc(&nr_swap_pages);
@@ -766,27 +757,6 @@ out:
return NULL;
}
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
- int old_hp_index, new_hp_index;
-
- do {
- old_hp_index = atomic_read(&highest_priority_index);
- if (old_hp_index != -1 &&
- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
- break;
- new_hp_index = type;
- } while (atomic_cmpxchg(&highest_priority_index,
- old_hp_index, new_hp_index) != old_hp_index);
-}
-
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
- set_highest_priority_index(p->type);
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- int i, prev;
+ struct swap_info_struct *si;
if (prio >= 0)
p->prio = prio;
@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
+ assert_spin_locked(&swap_lock);
+ BUG_ON(!list_empty(&p->list));
+ /*
+ * insert into swap list; the list is in priority order,
+ * so that get_swap_page() can get a page from the highest
+ * priority swap_info_struct with available page(s), and
+ * swapoff can adjust the auto-assigned (i.e. negative) prio
+ * values for any lower-priority swap_info_structs when
+ * removing a negative-prio swap_info_struct
+ */
+ list_for_each_entry(si, &swap_list_head, list) {
+ if (p->prio >= si->prio) {
+ list_add_tail(&p->list, &si->list);
+ return;
+ }
}
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p->type;
- else
- swap_info[prev]->next = p->type;
+ /*
+ * this covers two cases:
+ * 1) p->prio is less than all existing prio
+ * 2) the swap list is empty
+ */
+ list_add_tail(&p->list, &swap_list_head);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int i, type, prev;
- int err;
+ int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
- p = swap_info[type];
+ list_for_each_entry(p, &swap_list_head, list) {
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping)
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0)
- swap_list.head = p->next;
- else
- swap_info[prev]->next = p->next;
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
spin_lock(&p->lock);
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i]->next)
- swap_info[i]->prio = p->prio--;
+ struct swap_info_struct *si = p;
+
+ list_for_each_entry_continue(si, &swap_list_head, list) {
+ si->prio++;
+ }
least_priority++;
}
+ list_del_init(&p->list);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(type);
+ frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account information */
- swap_cgroup_swapoff(type);
+ swap_cgroup_swapoff(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
+ INIT_LIST_HEAD(&p->list);
p->flags = SWP_USED;
- p->next = -1;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* [PATCH 2/4] plist: add helper functions
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
2014-05-12 16:38 ` [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head Dan Streetman
@ 2014-05-12 16:38 ` Dan Streetman
2014-05-12 16:38 ` [PATCHv2 3/4] plist: add plist_requeue Dan Streetman
2014-05-12 16:38 ` [PATCHv2 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
3 siblings, 0 replies; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 16:38 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, Bob Liu, linux-mm, linux-kernel,
Paul Gortmaker, Steven Rostedt, Thomas Gleixner
Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
define and initialize a struct plist_head.
Add plist_for_each_continue() and plist_for_each_entry_continue(),
equivalent to list_for_each_continue() and list_for_each_entry_continue(),
to iterate over a plist continuing after the current position.
Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
and ->next, implemented by list_prev_entry() and list_next_entry(), to
access the prev/next struct plist_node entry. These are needed because
unlike struct list_head, direct access of the prev/next struct plist_node
isn't possible; the list must be navigated via the contained struct list_head.
e.g. instead of accessing the prev by list_prev_entry(node, node_list)
it can be accessed by plist_prev(node).
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
This is only a resend, this patch isn't changed in this patch set.
Previous is at https://lkml.org/lkml/2014/5/2/498
include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb39..c815491 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
}
/**
+ * PLIST_HEAD - declare and init plist_head
+ * @head: name for struct plist_head variable
+ */
+#define PLIST_HEAD(head) \
+ struct plist_head head = PLIST_HEAD_INIT(head)
+
+/**
* PLIST_NODE_INIT - static struct plist_node initializer
* @node: struct plist_node variable name
* @__prio: initial node priority
@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, node_list)
/**
+ * plist_for_each_continue - continue iteration over the plist
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ *
+ * Continue to iterate over plist, continuing after the current position.
+ */
+#define plist_for_each_continue(pos, head) \
+ list_for_each_entry_continue(pos, &(head)->node_list, node_list)
+
+/**
* plist_for_each_safe - iterate safely over a plist of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
list_for_each_entry(pos, &(head)->node_list, mem.node_list)
/**
+ * plist_for_each_entry_continue - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor
+ * @head: the head for your list
+ * @m: the name of the list_struct within the struct
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define plist_for_each_entry_continue(pos, head, m) \
+ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
+
+/**
* plist_for_each_entry_safe - iterate safely over list of given type
* @pos: the type * to use as a loop counter
* @n: another type * to use as temporary storage
@@ -229,6 +258,20 @@ static inline int plist_node_empty(const struct plist_node *node)
#endif
/**
+ * plist_next - get the next entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_next(pos) \
+ list_next_entry(pos, node_list)
+
+/**
+ * plist_prev - get the prev entry in list
+ * @pos: the type * to cursor
+ */
+#define plist_prev(pos) \
+ list_prev_entry(pos, node_list)
+
+/**
* plist_first - return the first node (and thus, highest priority)
* @head: the &struct plist_head pointer
*
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* [PATCHv2 3/4] plist: add plist_requeue
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
2014-05-12 16:38 ` [PATCHv2 1/4] swap: change swap_info singly-linked list to list_head Dan Streetman
2014-05-12 16:38 ` [PATCH 2/4] plist: add helper functions Dan Streetman
@ 2014-05-12 16:38 ` Dan Streetman
2014-05-13 10:33 ` Mel Gorman
2014-05-12 16:38 ` [PATCHv2 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
3 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 16:38 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, Bob Liu, linux-mm, linux-kernel,
Steven Rostedt, Peter Zijlstra, Paul Gortmaker, Thomas Gleixner
Add plist_requeue(), which moves the specified plist_node after
all other same-priority plist_nodes in the list. This is
essentially an optimized plist_del() followed by plist_add().
This is needed by swap, which (with the next patch in this set)
uses a plist of available swap devices. When a swap device
(either a swap partition or swap file) is added to the system
with swapon(), the device is added to a plist, ordered by
the swap device's priority. When swap needs to allocate a page
from one of the swap devices, it takes the page from the first swap
device on the plist, which is the highest priority swap device.
The swap device is left in the plist until all its pages are
used, and then removed from the plist when it becomes full.
However, as described in man 2 swapon, swap must allocate pages
from swap devices with the same priority in round-robin order;
to do this, on each swap page allocation, swap uses a page from
the first swap device in the plist, and then calls plist_requeue()
to move that swap device entry to after any other same-priority
swap devices. The next swap page allocation will again use a
page from the first swap device in the plist and requeue it,
and so on, resulting in round-robin usage of equal-priority
swap devices.
Also add plist_test_requeue() test function, for use by plist_test()
to test plist_requeue() function.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
Changes since v1 https://lkml.org/lkml/2014/5/2/494
-rename plist_rotate() to plist_requeue()
-change plist_requeue() local var naming to match plist_add() naming
-update/expand commit log to better explain why swap needs it
include/linux/plist.h | 2 ++
lib/plist.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/include/linux/plist.h b/include/linux/plist.h
index c815491..8b6c970 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);
+extern void plist_requeue(struct plist_node *node, struct plist_head *head);
+
/**
* plist_for_each - iterate over the plist
* @pos: the type * to use as a loop counter
diff --git a/lib/plist.c b/lib/plist.c
index b883a7a..d408e77 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
plist_check_head(head);
}
+/**
+ * plist_requeue - Requeue @node at end of same-prio entries.
+ *
+ * This is essentially an optimized plist_del() followed by
+ * plist_add(). It moves an entry already in the plist to
+ * after any other same-priority entries.
+ *
+ * @node: &struct plist_node pointer - entry to be moved
+ * @head: &struct plist_head pointer - list head
+ */
+void plist_requeue(struct plist_node *node, struct plist_head *head)
+{
+ struct plist_node *iter;
+ struct list_head *node_next = &head->node_list;
+
+ plist_check_head(head);
+ BUG_ON(plist_head_empty(head));
+ BUG_ON(plist_node_empty(node));
+
+ if (node == plist_last(head))
+ return;
+
+ iter = plist_next(node);
+
+ if (node->prio != iter->prio)
+ return;
+
+ plist_del(node, head);
+
+ plist_for_each_continue(iter, head) {
+ if (node->prio != iter->prio) {
+ node_next = &iter->node_list;
+ break;
+ }
+ }
+ list_add_tail(&node->node_list, node_next);
+
+ plist_check_head(head);
+}
+
#ifdef CONFIG_DEBUG_PI_LIST
#include <linux/sched.h>
#include <linux/module.h>
@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
BUG_ON(prio_pos->prio_list.next != &first->prio_list);
}
+static void __init plist_test_requeue(struct plist_node *node)
+{
+ plist_requeue(node, &test_head);
+
+ if (node != plist_last(&test_head))
+ BUG_ON(node->prio == plist_next(node)->prio);
+}
+
static int __init plist_test(void)
{
int nr_expect = 0, i, loop;
@@ -193,6 +241,10 @@ static int __init plist_test(void)
nr_expect--;
}
plist_test_check(nr_expect);
+ if (!plist_node_empty(test_node + i)) {
+ plist_test_requeue(test_node + i);
+ plist_test_check(nr_expect);
+ }
}
for (i = 0; i < ARRAY_SIZE(test_node); i++) {
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCHv2 3/4] plist: add plist_requeue
2014-05-12 16:38 ` [PATCHv2 3/4] plist: add plist_requeue Dan Streetman
@ 2014-05-13 10:33 ` Mel Gorman
0 siblings, 0 replies; 45+ messages in thread
From: Mel Gorman @ 2014-05-13 10:33 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Michal Hocko, Christian Ehrhardt,
Weijie Yang, Rik van Riel, Johannes Weiner, Bob Liu, linux-mm,
linux-kernel, Steven Rostedt, Peter Zijlstra, Paul Gortmaker,
Thomas Gleixner
On Mon, May 12, 2014 at 12:38:19PM -0400, Dan Streetman wrote:
> Add plist_requeue(), which moves the specified plist_node after
> all other same-priority plist_nodes in the list. This is
> essentially an optimized plist_del() followed by plist_add().
>
> This is needed by swap, which (with the next patch in this set)
> uses a plist of available swap devices. When a swap device
> (either a swap partition or swap file) is added to the system
> with swapon(), the device is added to a plist, ordered by
> the swap device's priority. When swap needs to allocate a page
> from one of the swap devices, it takes the page from the first swap
> device on the plist, which is the highest priority swap device.
> The swap device is left in the plist until all its pages are
> used, and then removed from the plist when it becomes full.
>
> However, as described in man 2 swapon, swap must allocate pages
> from swap devices with the same priority in round-robin order;
> to do this, on each swap page allocation, swap uses a page from
> the first swap device in the plist, and then calls plist_requeue()
> to move that swap device entry to after any other same-priority
> swap devices. The next swap page allocation will again use a
> page from the first swap device in the plist and requeue it,
> and so on, resulting in round-robin usage of equal-priority
> swap devices.
>
> Also add plist_test_requeue() test function, for use by plist_test()
> to test plist_requeue() function.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 45+ messages in thread
* [PATCHv2 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-12 16:38 ` [PATCHv3 0/4] swap: simplify/fix swap_list handling and iteration Dan Streetman
` (2 preceding siblings ...)
2014-05-12 16:38 ` [PATCHv2 3/4] plist: add plist_requeue Dan Streetman
@ 2014-05-12 16:38 ` Dan Streetman
2014-05-13 10:34 ` Mel Gorman
3 siblings, 1 reply; 45+ messages in thread
From: Dan Streetman @ 2014-05-12 16:38 UTC (permalink / raw)
To: Hugh Dickins, Andrew Morton, Mel Gorman
Cc: Dan Streetman, Michal Hocko, Christian Ehrhardt, Weijie Yang,
Rik van Riel, Johannes Weiner, Bob Liu, linux-mm, linux-kernel,
Shaohua Li, Steven Rostedt, Peter Zijlstra
Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
which both were intended to point to the highest priority active swap
target that was not full. The first patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; it starts scanning at the highest
priority entry each time, even if the entry is full.
Replace the manually ordered swap_list_head with a plist, swap_active_head.
Add a new plist, swap_avail_head. The original swap_active_head plist
contains all active swap_info_structs, as before, while the new
swap_avail_head plist contains only swap_info_structs that are active and
available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect
the swap_avail_head list.
Mel Gorman suggested using plists since they internally handle ordering
the list entries based on priority, which is exactly what swap was doing
manually. All the ordering code is now removed, and swap_info_struct
entries are simply added to their corresponding plist and automatically
ordered correctly.
Using a new plist for available swap_info_structs simplifies and
optimizes get_swap_page(), which no longer has to iterate over full
swap_info_structs. Using a new spinlock for swap_avail_head plist
allows each swap_info_struct to add or remove themselves from the
plist when they become full or not-full; previously they could not
do so because the swap_info_struct->lock is held when they change
from full<->not-full, and the swap_lock protecting the main
swap_active_head must be ordered before any swap_info_struct->lock.
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
Changes since v1 https://lkml.org/lkml/2014/5/2/510
-add missing spin_unlock(&swap_avail_head) in get_swap_page()
in total failure case
-plist_rotate() call changed to plist_requeue()
include/linux/swap.h | 3 +-
include/linux/swapfile.h | 2 +-
mm/frontswap.c | 6 +-
mm/swapfile.c | 145 +++++++++++++++++++++++++++++------------------
4 files changed, 97 insertions(+), 59 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8bb85d6..9155bcd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ struct percpu_cluster {
struct swap_info_struct {
unsigned long flags; /* SWP_USED etc: see above */
signed short prio; /* swap priority of this type */
- struct list_head list; /* entry in swap list */
+ struct plist_node list; /* entry in swap_active_head */
+ struct plist_node avail_list; /* entry in swap_avail_head */
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 2eab382..388293a 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
* want to expose them to the dozens of source files that include swap.h
*/
extern spinlock_t swap_lock;
-extern struct list_head swap_list_head;
+extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index fae1160..c30eec5 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void)
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- list_for_each_entry(si, &swap_list_head, list)
+ plist_for_each_entry(si, &swap_active_head, list)
totalpages += atomic_read(&si->frontswap_pages);
return totalpages;
}
@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
unsigned long pages = 0, pages_to_unuse = 0;
assert_spin_locked(&swap_lock);
- list_for_each_entry(si, &swap_list_head, list) {
+ plist_for_each_entry(si, &swap_active_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list_head each time
+ * so restart scan from swap_active_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c95a8c..beeeef8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry ";
* all active swap_info_structs
* protected with swap_lock, and ordered by priority.
*/
-LIST_HEAD(swap_list_head);
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -594,6 +609,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void)
{
struct swap_info_struct *si, *next;
pgoff_t offset;
- struct list_head *tmp;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- list_for_each(tmp, &swap_list_head) {
- si = list_entry(tmp, typeof(*si), list);
+ spin_lock(&swap_avail_lock);
+
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&si->lock);
- continue;
+ goto nextsi;
}
- /*
- * rotate the current swap_info that we're going to use
- * to after any other swap_info that have the same prio,
- * so that all equal-priority swap_info get used equally
- */
- next = si;
- list_for_each_entry_continue(next, &swap_list_head, list) {
- if (si->prio != next->prio)
- break;
- list_rotate_left(&si->list);
- next = si;
- }
-
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
return swp_entry(si->type, offset);
- spin_lock(&swap_lock);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map() can drop the si->lock, multiple
* callers probably all tried to get a page from the same si
- * and it filled up before we could get one. So we need to
- * try again. Since we dropped the swap_lock, there may now
- * be non-full higher priority swap_infos, and this si may have
- * even been removed from the list (although very unlikely).
- * Let's start over.
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
*/
- tmp = &swap_list_head;
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
+ spin_unlock(&swap_avail_lock);
+
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- struct swap_info_struct *si;
-
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
total_swap_pages += p->pages;
assert_spin_locked(&swap_lock);
- BUG_ON(!list_empty(&p->list));
- /*
- * insert into swap list; the list is in priority order,
- * so that get_swap_page() can get a page from the highest
- * priority swap_info_struct with available page(s), and
- * swapoff can adjust the auto-assigned (i.e. negative) prio
- * values for any lower-priority swap_info_structs when
- * removing a negative-prio swap_info_struct
- */
- list_for_each_entry(si, &swap_list_head, list) {
- if (p->prio >= si->prio) {
- list_add_tail(&p->list, &si->list);
- return;
- }
- }
/*
- * this covers two cases:
- * 1) p->prio is less than all existing prio
- * 2) the swap list is empty
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
*/
- list_add_tail(&p->list, &swap_list_head);
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mapping = victim->f_mapping;
spin_lock(&swap_lock);
- list_for_each_entry(p, &swap_list_head, list) {
+ plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping) {
found = 1;
@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
- list_for_each_entry_continue(si, &swap_list_head, list) {
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
}
least_priority++;
}
- list_del_init(&p->list);
+ plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
- INIT_LIST_HEAD(&p->list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 45+ messages in thread
* Re: [PATCHv2 4/4] swap: change swap_list_head to plist, add swap_avail_head
2014-05-12 16:38 ` [PATCHv2 4/4] swap: change swap_list_head to plist, add swap_avail_head Dan Streetman
@ 2014-05-13 10:34 ` Mel Gorman
0 siblings, 0 replies; 45+ messages in thread
From: Mel Gorman @ 2014-05-13 10:34 UTC (permalink / raw)
To: Dan Streetman
Cc: Hugh Dickins, Andrew Morton, Michal Hocko, Christian Ehrhardt,
Weijie Yang, Rik van Riel, Johannes Weiner, Bob Liu, linux-mm,
linux-kernel, Shaohua Li, Steven Rostedt, Peter Zijlstra
On Mon, May 12, 2014 at 12:38:20PM -0400, Dan Streetman wrote:
> Originally get_swap_page() started iterating through the singly-linked
> list of swap_info_structs using swap_list.next or highest_priority_index,
> which both were intended to point to the highest priority active swap
> target that was not full. The first patch in this series changed the
> singly-linked list to a doubly-linked list, and removed the logic to start
> at the highest priority non-full entry; it starts scanning at the highest
> priority entry each time, even if the entry is full.
>
> Replace the manually ordered swap_list_head with a plist, swap_active_head.
> Add a new plist, swap_avail_head. The original swap_active_head plist
> contains all active swap_info_structs, as before, while the new
> swap_avail_head plist contains only swap_info_structs that are active and
> available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect
> the swap_avail_head list.
>
> Mel Gorman suggested using plists since they internally handle ordering
> the list entries based on priority, which is exactly what swap was doing
> manually. All the ordering code is now removed, and swap_info_struct
> entries are simply added to their corresponding plist and automatically
> ordered correctly.
>
> Using a new plist for available swap_info_structs simplifies and
> optimizes get_swap_page(), which no longer has to iterate over full
> swap_info_structs. Using a new spinlock for swap_avail_head plist
> allows each swap_info_struct to add or remove themselves from the
> plist when they become full or not-full; previously they could not
> do so because the swap_info_struct->lock is held when they change
> from full<->not-full, and the swap_lock protecting the main
> swap_active_head must be ordered before any swap_info_struct->lock.
>
> Signed-off-by: Dan Streetman <ddstreet@ieee.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Shaohua Li <shli@fusionio.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 45+ messages in thread