* [PATCH v11 01/25] fs: bump inode and dentry counters to long
From: Glauber Costa @ 2013-06-06 20:34 UTC
To: akpm
Cc: linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Glauber Costa,
Dave Chinner
There are situations on very large machines in which we can have a large
quantity of dirty inodes, unused dentries, etc. This is particularly
true when umounting a filesystem, since every live object will
eventually be discarded.
Dave Chinner reported a problem with this while experimenting with the
shrinker revamp patchset, so we believe it is time for a change. This
patch just converts the counters from int to long. Machines where this
matters will have a 64-bit long anyway.
Signed-off-by: Glauber Costa <glommer@openvz.org>
CC: Dave Chinner <dchinner@redhat.com>
---
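[ note: a minimal stand-alone sketch of the summation pattern these
counters use, with a hypothetical counter name. The point of the
conversion: a 32-bit int sum wraps once a machine caches more than
~2.1 billion objects, while a long on a 64-bit machine does not. ]
#include <linux/percpu.h>
#include <linux/cpumask.h>
static DEFINE_PER_CPU(long, nr_objs);	/* hypothetical per-cpu counter */
static long get_nr_objs(void)
{
	int i;
	long sum = 0;	/* as an int, this sum could overflow */
	for_each_possible_cpu(i)
		sum += per_cpu(nr_objs, i);
	/* per-cpu skew can make the sum transiently negative; clamp it */
	return sum < 0 ? 0 : sum;
}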
fs/dcache.c | 8 ++++----
fs/inode.c | 18 +++++++++---------
fs/internal.h | 2 +-
include/linux/dcache.h | 10 +++++-----
include/linux/fs.h | 4 ++--
include/uapi/linux/fs.h | 6 +++---
kernel/sysctl.c | 6 +++---
7 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b908..aca4e4b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -117,13 +117,13 @@ struct dentry_stat_t dentry_stat = {
.age_limit = 45,
};
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+static long get_nr_dentry(void)
{
int i;
- int sum = 0;
+ long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry, i);
return sum < 0 ? 0 : sum;
@@ -133,7 +133,7 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3..ff29765 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -70,33 +70,33 @@ EXPORT_SYMBOL(empty_aops);
*/
struct inodes_stat_t inodes_stat;
-static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static DEFINE_PER_CPU(unsigned int, nr_unused);
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
static struct kmem_cache *inode_cachep __read_mostly;
-static int get_nr_inodes(void)
+static long get_nr_inodes(void)
{
int i;
- int sum = 0;
+ long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_inodes, i);
return sum < 0 ? 0 : sum;
}
-static inline int get_nr_inodes_unused(void)
+static inline long get_nr_inodes_unused(void)
{
int i;
- int sum = 0;
+ long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_unused, i);
return sum < 0 ? 0 : sum;
}
-int get_nr_dirty_inodes(void)
+long get_nr_dirty_inodes(void)
{
/* not actually dirty inodes, but a wild approximation */
- int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
return nr_dirty > 0 ? nr_dirty : 0;
}
@@ -109,7 +109,7 @@ int proc_nr_inodes(ctl_table *table, int write,
{
inodes_stat.nr_inodes = get_nr_inodes();
inodes_stat.nr_unused = get_nr_inodes_unused();
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
diff --git a/fs/internal.h b/fs/internal.h
index eaa75f7..cd5009f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -117,7 +117,7 @@ extern void inode_add_lru(struct inode *inode);
*/
extern void inode_wb_list_del(struct inode *inode);
-extern int get_nr_dirty_inodes(void);
+extern long get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1a6bb81..1a82bdb 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -54,11 +54,11 @@ struct qstr {
#define hashlen_len(hashlen) ((u32)((hashlen) >> 32))
struct dentry_stat_t {
- int nr_dentry;
- int nr_unused;
- int age_limit; /* age in seconds */
- int want_pages; /* pages requested by system */
- int dummy[2];
+ long nr_dentry;
+ long nr_unused;
+ long age_limit; /* age in seconds */
+ long want_pages; /* pages requested by system */
+ long dummy[2];
};
extern struct dentry_stat_t dentry_stat;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a4c9fbe..ad3eb76 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1266,12 +1266,12 @@ struct super_block {
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
struct list_head s_dentry_lru; /* unused dentry lru */
- int s_nr_dentry_unused; /* # of dentry on lru */
+ long s_nr_dentry_unused; /* # of dentry on lru */
/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp;
struct list_head s_inode_lru; /* unused inode lru */
- int s_nr_inodes_unused; /* # of inodes on lru */
+ long s_nr_inodes_unused; /* # of inodes on lru */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a4ed56c..6c28b61 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -49,9 +49,9 @@ struct files_stat_struct {
};
struct inodes_stat_t {
- int nr_inodes;
- int nr_unused;
- int dummy[5]; /* padding for sysctl ABI compatibility */
+ long nr_inodes;
+ long nr_unused;
+ long dummy[5]; /* padding for sysctl ABI compatibility */
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf45..fb90f7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1456,14 +1456,14 @@ static struct ctl_table fs_table[] = {
{
.procname = "inode-nr",
.data = &inodes_stat,
- .maxlen = 2*sizeof(int),
+ .maxlen = 2*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
{
.procname = "inode-state",
.data = &inodes_stat,
- .maxlen = 7*sizeof(int),
+ .maxlen = 7*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
@@ -1493,7 +1493,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "dentry-state",
.data = &dentry_stat,
- .maxlen = 6*sizeof(int),
+ .maxlen = 6*sizeof(long),
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
--
1.8.1.4
* [PATCH v11 03/25] dcache: convert dentry_stat.nr_unused to per-cpu counters
From: Glauber Costa @ 2013-06-06 20:34 UTC
To: akpm
Cc: linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Dave Chinner,
Glauber Costa
From: Dave Chinner <dchinner@redhat.com>
Before we split up the dcache_lru_lock, the unused dentry counter
needs to be made independent of the global dcache_lru_lock. Convert
it to per-cpu counters to do this.
[ v11: updated comments about the handcrafted percpu implementation ]
[ v5: comment about possible cpus ]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mel Gorman <mgorman@suse.de>
---
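[ note: a sketch of the update side of the conversion, with hypothetical
names. The old code bumped the global dentry_stat.nr_unused under
dcache_lru_lock; this_cpu_inc()/this_cpu_dec() touch only the local
cpu's counter and need no lock at all. ]
#include <linux/percpu.h>
static DEFINE_PER_CPU(long, nr_foo_unused);	/* hypothetical counter */
static void foo_lru_add(void)
{
	/* was: dentry_stat.nr_unused++ under the global lru lock */
	this_cpu_inc(nr_foo_unused);
}
static void foo_lru_del(void)
{
	this_cpu_dec(nr_foo_unused);
}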
fs/dcache.c | 30 +++++++++++++++++++++++++++---
1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index aca4e4b..0466dbd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -118,8 +118,22 @@ struct dentry_stat_t dentry_stat = {
};
static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
static long get_nr_dentry(void)
{
int i;
@@ -129,10 +143,20 @@ static long get_nr_dentry(void)
return sum < 0 ? 0 : sum;
}
+static long get_nr_dentry_unused(void)
+{
+ int i;
+ long sum = 0;
+ for_each_possible_cpu(i)
+ sum += per_cpu(nr_dentry_unused, i);
+ return sum < 0 ? 0 : sum;
+}
+
int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
+ dentry_stat.nr_unused = get_nr_dentry_unused();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
@@ -312,7 +336,7 @@ static void dentry_lru_add(struct dentry *dentry)
spin_lock(&dcache_lru_lock);
list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
+ this_cpu_inc(nr_dentry_unused);
spin_unlock(&dcache_lru_lock);
}
}
@@ -322,7 +346,7 @@ static void __dentry_lru_del(struct dentry *dentry)
list_del_init(&dentry->d_lru);
dentry->d_flags &= ~DCACHE_SHRINK_LIST;
dentry->d_sb->s_nr_dentry_unused--;
- dentry_stat.nr_unused--;
+ this_cpu_dec(nr_dentry_unused);
}
/*
@@ -343,7 +367,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
if (list_empty(&dentry->d_lru)) {
list_add_tail(&dentry->d_lru, list);
dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
+ this_cpu_inc(nr_dentry_unused);
} else {
list_move_tail(&dentry->d_lru, list);
}
--
1.8.1.4
* [PATCH v11 04/25] dentry: move to per-sb LRU locks
From: Glauber Costa @ 2013-06-06 20:34 UTC
To: akpm
Cc: linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Dave Chinner,
Glauber Costa
From: Dave Chinner <dchinner@redhat.com>
With the dentry LRUs being per-sb structures, there is no real need for
a global dentry_lru_lock. The locking can be made more fine-grained by
moving to a per-sb LRU lock, isolating the LRU operations of different
filesystems completely from each other. The need for this is independent
of any performance consideration that may arise: in the interest of
abstracting the lru operations away, it is mandatory that each lru be
protected by its own lock instead of a global lock shared by all of them.
[ glommer: updated changelog ]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mel Gorman <mgorman@suse.de>
---
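[ note: a sketch of the resulting locking pattern, using the fields
this patch introduces; the function name is hypothetical. Because the
lock now lives in the super_block, LRU traffic on one filesystem never
contends with LRU traffic on another. ]
static void example_lru_add(struct dentry *dentry)
{
	struct super_block *sb = dentry->d_sb;
	spin_lock(&sb->s_dentry_lru_lock);	/* per-sb, not global */
	list_add(&dentry->d_lru, &sb->s_dentry_lru);
	sb->s_nr_dentry_unused++;		/* protected by the same lock */
	spin_unlock(&sb->s_dentry_lru_lock);
}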
fs/dcache.c | 33 ++++++++++++++++-----------------
fs/super.c | 1 +
include/linux/fs.h | 4 +++-
3 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 0466dbd..0a49669 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -48,7 +48,7 @@
* - the dcache hash table
* s_anon bl list spinlock protects:
* - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
* - the dcache lru lists and counters
* d_lock protects:
* - d_flags
@@ -63,7 +63,7 @@
* Ordering:
* dentry->d_inode->i_lock
* dentry->d_lock
- * dcache_lru_lock
+ * dentry->d_sb->s_dentry_lru_lock
* dcache_hash_bucket lock
* s_anon lock
*
@@ -81,7 +81,6 @@
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
@@ -333,11 +332,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
static void dentry_lru_add(struct dentry *dentry)
{
if (list_empty(&dentry->d_lru)) {
- spin_lock(&dcache_lru_lock);
+ spin_lock(&dentry->d_sb->s_dentry_lru_lock);
list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
dentry->d_sb->s_nr_dentry_unused++;
this_cpu_inc(nr_dentry_unused);
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
}
}
@@ -355,15 +354,15 @@ static void __dentry_lru_del(struct dentry *dentry)
static void dentry_lru_del(struct dentry *dentry)
{
if (!list_empty(&dentry->d_lru)) {
- spin_lock(&dcache_lru_lock);
+ spin_lock(&dentry->d_sb->s_dentry_lru_lock);
__dentry_lru_del(dentry);
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
}
}
static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
{
- spin_lock(&dcache_lru_lock);
+ spin_lock(&dentry->d_sb->s_dentry_lru_lock);
if (list_empty(&dentry->d_lru)) {
list_add_tail(&dentry->d_lru, list);
dentry->d_sb->s_nr_dentry_unused++;
@@ -371,7 +370,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
} else {
list_move_tail(&dentry->d_lru, list);
}
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
}
/**
@@ -851,14 +850,14 @@ void prune_dcache_sb(struct super_block *sb, int count)
LIST_HEAD(tmp);
relock:
- spin_lock(&dcache_lru_lock);
+ spin_lock(&sb->s_dentry_lru_lock);
while (!list_empty(&sb->s_dentry_lru)) {
dentry = list_entry(sb->s_dentry_lru.prev,
struct dentry, d_lru);
BUG_ON(dentry->d_sb != sb);
if (!spin_trylock(&dentry->d_lock)) {
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&sb->s_dentry_lru_lock);
cpu_relax();
goto relock;
}
@@ -874,11 +873,11 @@ relock:
if (!--count)
break;
}
- cond_resched_lock(&dcache_lru_lock);
+ cond_resched_lock(&sb->s_dentry_lru_lock);
}
if (!list_empty(&referenced))
list_splice(&referenced, &sb->s_dentry_lru);
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&sb->s_dentry_lru_lock);
shrink_dentry_list(&tmp);
}
@@ -894,14 +893,14 @@ void shrink_dcache_sb(struct super_block *sb)
{
LIST_HEAD(tmp);
- spin_lock(&dcache_lru_lock);
+ spin_lock(&sb->s_dentry_lru_lock);
while (!list_empty(&sb->s_dentry_lru)) {
list_splice_init(&sb->s_dentry_lru, &tmp);
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&sb->s_dentry_lru_lock);
shrink_dentry_list(&tmp);
- spin_lock(&dcache_lru_lock);
+ spin_lock(&sb->s_dentry_lru_lock);
}
- spin_unlock(&dcache_lru_lock);
+ spin_unlock(&sb->s_dentry_lru_lock);
}
EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/super.c b/fs/super.c
index 2a37fd6..0be75fb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -182,6 +182,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
INIT_LIST_HEAD(&s->s_dentry_lru);
+ spin_lock_init(&s->s_dentry_lru_lock);
INIT_LIST_HEAD(&s->s_inode_lru);
spin_lock_init(&s->s_inode_lru_lock);
INIT_LIST_HEAD(&s->s_mounts);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ad3eb76..41cbe7a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1264,7 +1264,9 @@ struct super_block {
struct list_head s_files;
#endif
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
- /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
+
+ /* s_dentry_lru_lock protects s_dentry_lru and s_nr_dentry_unused */
+ spinlock_t s_dentry_lru_lock ____cacheline_aligned_in_smp;
struct list_head s_dentry_lru; /* unused dentry lru */
long s_nr_dentry_unused; /* # of dentry on lru */
--
1.8.1.4
* [PATCH v11 05/25] dcache: remove dentries from LRU before putting on dispose list
From: Glauber Costa @ 2013-06-06 20:34 UTC
To: akpm
Cc: linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Dave Chinner,
Glauber Costa
From: Dave Chinner <dchinner@redhat.com>
One of the big problems with modifying the way the dcache shrinker
and LRU implementation works is that the LRU is abused in several
ways. One of these is shrink_dentry_list().
Basically, we can move a dentry off the LRU onto a different list
without doing any accounting changes, and then use dentry_lru_prune()
to remove it from whatever list it is now on to do the LRU
accounting at that point.
This makes it -really hard- to change the LRU implementation. The
use of the per-sb LRU lock serialises movement of the dentries
between the different lists and the removal of them, and this is the
only reason that it works. If we want to break up the dentry LRU
lock and lists into, say, per-node lists, we remove the only
serialisation that allows this lru list/dispose list abuse to work.
To make this work effectively, the dispose list has to be isolated
from the LRU list - dentries have to be removed from the LRU
*before* being placed on the dispose list. This means that the LRU
accounting and isolation is completed before disposal is started,
and that means we can change the LRU implementation freely in
future.
This means that dentries *must* be marked with DCACHE_SHRINK_LIST
when they are placed on the dispose list so that we don't think that
parent dentries found in try_prune_one_dentry() are on the LRU when
they are actually on the dispose list. This would result in
accounting the dentry to the LRU a second time. Hence
dentry_lru_del() has to handle the DCACHE_SHRINK_LIST case
differently because the dentry isn't on the LRU list.
[ v2: don't decrement nr unused twice, spotted by Sha Zhengju ]
[ v7: (dchinner)
- shrink list leaks dentries when inode/parent can't be locked in
dentry_kill().
- remove the readdition of dentry_lru_prune(). ]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@openvz.org>
---
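[ note: a sketch of the invariant the patch establishes, with a
hypothetical helper name. LRU accounting is settled at isolation time,
so once a dentry sits on a dispose list it carries no LRU accounting,
and DCACHE_SHRINK_LIST records where it really is. ]
static void isolate_one_dentry(struct dentry *dentry,
			       struct list_head *dispose)
{
	/* caller holds dentry->d_lock and the per-sb lru lock */
	list_move(&dentry->d_lru, dispose);
	dentry->d_flags |= DCACHE_SHRINK_LIST;	/* on dispose list, not LRU */
	dentry->d_sb->s_nr_dentry_unused--;	/* account at isolation time */
	this_cpu_dec(nr_dentry_unused);
}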
fs/dcache.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 77 insertions(+), 21 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 0a49669..16b599e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -327,7 +327,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
}
/*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * dentry_lru_(add|del|move_list) must be called with d_lock held.
*/
static void dentry_lru_add(struct dentry *dentry)
{
@@ -343,16 +343,25 @@ static void dentry_lru_add(struct dentry *dentry)
static void __dentry_lru_del(struct dentry *dentry)
{
list_del_init(&dentry->d_lru);
- dentry->d_flags &= ~DCACHE_SHRINK_LIST;
dentry->d_sb->s_nr_dentry_unused--;
this_cpu_dec(nr_dentry_unused);
}
/*
* Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
*/
static void dentry_lru_del(struct dentry *dentry)
{
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ list_del_init(&dentry->d_lru);
+ dentry->d_flags &= ~DCACHE_SHRINK_LIST;
+ return;
+ }
+
if (!list_empty(&dentry->d_lru)) {
spin_lock(&dentry->d_sb->s_dentry_lru_lock);
__dentry_lru_del(dentry);
@@ -362,13 +371,15 @@ static void dentry_lru_del(struct dentry *dentry)
static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
{
+ BUG_ON(dentry->d_flags & DCACHE_SHRINK_LIST);
+
spin_lock(&dentry->d_sb->s_dentry_lru_lock);
if (list_empty(&dentry->d_lru)) {
list_add_tail(&dentry->d_lru, list);
- dentry->d_sb->s_nr_dentry_unused++;
- this_cpu_inc(nr_dentry_unused);
} else {
list_move_tail(&dentry->d_lru, list);
+ dentry->d_sb->s_nr_dentry_unused--;
+ this_cpu_dec(nr_dentry_unused);
}
spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
}
@@ -466,7 +477,8 @@ EXPORT_SYMBOL(d_drop);
* If ref is non-zero, then decrement the refcount too.
* Returns dentry requiring refcount drop, or NULL if we're done.
*/
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int ref, int unlock_on_failure)
__releases(dentry->d_lock)
{
struct inode *inode;
@@ -475,8 +487,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
inode = dentry->d_inode;
if (inode && !spin_trylock(&inode->i_lock)) {
relock:
- spin_unlock(&dentry->d_lock);
- cpu_relax();
+ if (unlock_on_failure) {
+ spin_unlock(&dentry->d_lock);
+ cpu_relax();
+ }
return dentry; /* try again with same dentry */
}
if (IS_ROOT(dentry))
@@ -563,7 +577,7 @@ repeat:
return;
kill_it:
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry, 1, 1);
if (dentry)
goto repeat;
}
@@ -762,12 +776,12 @@ EXPORT_SYMBOL(d_prune_aliases);
*
* This may fail if locks cannot be acquired no problem, just try again.
*/
-static void try_prune_one_dentry(struct dentry *dentry)
+static struct dentry * try_prune_one_dentry(struct dentry *dentry)
__releases(dentry->d_lock)
{
struct dentry *parent;
- parent = dentry_kill(dentry, 0);
+ parent = dentry_kill(dentry, 0, 0);
/*
* If dentry_kill returns NULL, we have nothing more to do.
* if it returns the same dentry, trylocks failed. In either
@@ -779,9 +793,9 @@ static void try_prune_one_dentry(struct dentry *dentry)
* fragmentation.
*/
if (!parent)
- return;
+ return NULL;
if (parent == dentry)
- return;
+ return dentry;
/* Prune ancestors. */
dentry = parent;
@@ -790,10 +804,11 @@ static void try_prune_one_dentry(struct dentry *dentry)
if (dentry->d_count > 1) {
dentry->d_count--;
spin_unlock(&dentry->d_lock);
- return;
+ return NULL;
}
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry, 1, 1);
}
+ return NULL;
}
static void shrink_dentry_list(struct list_head *list)
@@ -812,21 +827,31 @@ static void shrink_dentry_list(struct list_head *list)
}
/*
+ * The dispose list is isolated and dentries are not accounted
+ * to the LRU here, so we can simply remove it from the list
+ * here regardless of whether it is referenced or not.
+ */
+ list_del_init(&dentry->d_lru);
+ dentry->d_flags &= ~DCACHE_SHRINK_LIST;
+
+ /*
* We found an inuse dentry which was not removed from
- * the LRU because of laziness during lookup. Do not free
- * it - just keep it off the LRU list.
+ * the LRU because of laziness during lookup. Do not free it.
*/
if (dentry->d_count) {
- dentry_lru_del(dentry);
spin_unlock(&dentry->d_lock);
continue;
}
-
rcu_read_unlock();
- try_prune_one_dentry(dentry);
+ dentry = try_prune_one_dentry(dentry);
rcu_read_lock();
+ if (dentry) {
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ list_add(&dentry->d_lru, list);
+ spin_unlock(&dentry->d_lock);
+ }
}
rcu_read_unlock();
}
@@ -867,8 +892,10 @@ relock:
list_move(&dentry->d_lru, &referenced);
spin_unlock(&dentry->d_lock);
} else {
- list_move_tail(&dentry->d_lru, &tmp);
+ list_move(&dentry->d_lru, &tmp);
dentry->d_flags |= DCACHE_SHRINK_LIST;
+ this_cpu_dec(nr_dentry_unused);
+ sb->s_nr_dentry_unused--;
spin_unlock(&dentry->d_lock);
if (!--count)
break;
@@ -882,6 +909,27 @@ relock:
shrink_dentry_list(&tmp);
}
+/*
+ * Mark all the dentries as on being the dispose list so we don't think they are
+ * still on the LRU if we try to kill them from ascending the parent chain in
+ * try_prune_one_dentry() rather than directly from the dispose list.
+ */
+static void
+shrink_dcache_list(
+ struct list_head *dispose)
+{
+ struct dentry *dentry;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dentry, dispose, d_lru) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ spin_unlock(&dentry->d_lock);
+ }
+ rcu_read_unlock();
+ shrink_dentry_list(dispose);
+}
+
/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
@@ -895,9 +943,17 @@ void shrink_dcache_sb(struct super_block *sb)
spin_lock(&sb->s_dentry_lru_lock);
while (!list_empty(&sb->s_dentry_lru)) {
+ /*
+ * account for removal here so we don't need to handle it later
+ * even though the dentry is no longer on the lru list.
+ */
list_splice_init(&sb->s_dentry_lru, &tmp);
+ this_cpu_sub(nr_dentry_unused, sb->s_nr_dentry_unused);
+ sb->s_nr_dentry_unused = 0;
spin_unlock(&sb->s_dentry_lru_lock);
- shrink_dentry_list(&tmp);
+
+ shrink_dcache_list(&tmp);
+
spin_lock(&sb->s_dentry_lru_lock);
}
spin_unlock(&sb->s_dentry_lru_lock);
--
1.8.1.4
* [PATCH v11 06/25] mm: new shrinker API
From: Glauber Costa @ 2013-06-06 20:34 UTC
To: akpm
Cc: linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Dave Chinner,
Glauber Costa
From: Dave Chinner <dchinner@redhat.com>
The current shrinker callout API uses a single shrinker call for
multiple functions. To determine the function, a special magical
value is passed in a parameter to change the behaviour. This
complicates the implementation and return value specification for
the different behaviours.
Separate the two different behaviours into separate operations, one
to return a count of freeable objects in the cache, and another to
scan a certain number of objects in the cache for freeing. In
defining these new operations, ensure the return values and
resultant behaviours are clearly defined and documented.
Modify shrink_slab() to use the new API and implement the callouts
for all the existing shrinkers.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Mel Gorman <mgorman@suse.de>
---
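[ note: a sketch of a cache registering against the new API; the foo_*
helpers and foo_lock are hypothetical. count_objects() reports how many
objects are freeable, scan_objects() frees up to sc->nr_to_scan of them,
and SHRINK_STOP tells reclaim to stop calling this shrinker in the
current context. ]
static unsigned long foo_count(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	return foo_cache_count();	/* 0 when nothing is freeable */
}
static unsigned long foo_scan(struct shrinker *shrink,
			      struct shrink_control *sc)
{
	unsigned long freed;
	if (!mutex_trylock(&foo_lock))
		return SHRINK_STOP;	/* deadlock risk: give up this pass */
	freed = foo_cache_trim(sc->nr_to_scan);
	mutex_unlock(&foo_lock);
	return freed;			/* objects actually freed */
}
static struct shrinker foo_shrinker = {
	.count_objects	= foo_count,
	.scan_objects	= foo_scan,
	.seeks		= DEFAULT_SEEKS,
};
/* registered with register_shrinker(&foo_shrinker), as before */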
include/linux/shrinker.h | 38 ++++++++++++++++++++++--------
mm/vmscan.c | 60 ++++++++++++++++++++++++++++++++----------------
2 files changed, 69 insertions(+), 29 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index ac6b8ee..884e762 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,12 @@
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extention later.
+ *
+ * The 'gfpmask' refers to the allocation we are currently trying to
+ * fulfil.
+ *
+ * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
+ * querying the cache size, so a fastpath for that case is appropriate.
*/
struct shrink_control {
gfp_t gfp_mask;
@@ -12,23 +18,37 @@ struct shrink_control {
unsigned long nr_to_scan;
};
+#define SHRINK_STOP (~0UL)
/*
* A callback you can register to apply pressure to ageable caches.
*
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'. It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up. It should return
- * the number of objects which remain in the cache. If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
+ * @shrink() should look through the least-recently-used 'nr_to_scan' entries
+ * and attempt to free them up. It should return the number of objects which
+ * remain in the cache. If it returns -1, it means it cannot do any scanning at
+ * this time (eg. there is a risk of deadlock).
*
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
+ * @count_objects should return the number of freeable items in the cache. If
+ * there are no objects to free or the number of freeable items cannot be
+ * determined, it should return 0. No deadlock checks should be done during the
+ * count callback - the shrinker relies on aggregating scan counts that couldn't
+ * be executed due to potential deadlocks to be run at a later call when the
+ * deadlock condition is no longer pending.
*
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
+ * @scan_objects will only be called if @count_objects returned a non-zero
+ * value for the number of freeable objects. The callout should scan the cache
+ * and attempt to free items from the cache. It should then return the number
+ * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
+ * due to potential deadlocks. If SHRINK_STOP is returned, then no further
+ * attempts to call the @scan_objects will be made from the current reclaim
+ * context.
*/
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
+ unsigned long (*count_objects)(struct shrinker *,
+ struct shrink_control *sc);
+ unsigned long (*scan_objects)(struct shrinker *,
+ struct shrink_control *sc);
+
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b96faea..dfc5685 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -205,19 +205,24 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
- unsigned long ret = 0;
+ unsigned long freed = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
- /* Assume we'll be able to shrink next time */
- ret = 1;
+ /*
+ * If we would return 0, our callers would understand that we
+ * have nothing else to shrink and give up trying. By returning
+ * 1 we keep it going and assume we'll be able to shrink next
+ * time.
+ */
+ freed = 1;
goto out;
}
@@ -225,14 +230,16 @@ unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long long delta;
long total_scan;
long max_pass;
- int shrink_ret = 0;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- max_pass = do_shrinker_shrink(shrinker, shrink, 0);
- if (max_pass <= 0)
+ if (shrinker->count_objects)
+ max_pass = shrinker->count_objects(shrinker, shrinkctl);
+ else
+ max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ if (max_pass == 0)
continue;
/*
@@ -248,8 +255,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
- printk(KERN_ERR "shrink_slab: %pF negative objects to "
- "delete nr=%ld\n",
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->shrink, total_scan);
total_scan = max_pass;
}
@@ -277,20 +284,33 @@ unsigned long shrink_slab(struct shrink_control *shrink,
if (total_scan > max_pass * 2)
total_scan = max_pass * 2;
- trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
while (total_scan >= batch_size) {
- int nr_before;
- nr_before = do_shrinker_shrink(shrinker, shrink, 0);
- shrink_ret = do_shrinker_shrink(shrinker, shrink,
- batch_size);
- if (shrink_ret == -1)
- break;
- if (shrink_ret < nr_before)
- ret += nr_before - shrink_ret;
+ if (shrinker->scan_objects) {
+ unsigned long ret;
+ shrinkctl->nr_to_scan = batch_size;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+ } else {
+ int nr_before;
+ long ret;
+
+ nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ ret = do_shrinker_shrink(shrinker, shrinkctl,
+ batch_size);
+ if (ret == -1)
+ break;
+ if (ret < nr_before)
+ freed += nr_before - ret;
+ }
+
count_vm_events(SLABS_SCANNED, batch_size);
total_scan -= batch_size;
@@ -308,12 +328,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
else
new_nr = atomic_long_read(&shrinker->nr_in_batch);
- trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+ trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
cond_resched();
- return ret;
+ return freed;
}
static inline int is_page_cache_freeable(struct page *page)
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
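To make the new contract concrete, here is a minimal sketch of a shrinker
converted to the count/scan callouts. The cache itself is hypothetical -
demo_cache_count(), demo_cache_evict_one() and the other demo_* names are
invented for illustration; only SHRINK_STOP, struct shrink_control,
register_shrinker() and the two callback signatures come from the API above.

static unsigned long demo_count_objects(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* No deadlock checks here: just report how many objects are freeable. */
	return demo_cache_count();	/* returning 0 means "nothing to scan" */
}

static unsigned long demo_scan_objects(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	unsigned long freed = 0;

	/* Deadlock avoidance belongs in the scan callout, not the count. */
	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;

	while (freed < sc->nr_to_scan && demo_cache_evict_one())
		freed++;

	return freed;	/* objects actually freed, not objects scanned */
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count_objects,
	.scan_objects	= demo_scan_objects,
	.seeks		= DEFAULT_SEEKS,
};

After register_shrinker(&demo_shrinker), shrink_slab() drives the pair as in
the hunk above; the legacy ->shrink callout keeps working during the
transition because shrink_slab() only uses ->count_objects/->scan_objects
when they are set, and falls back to do_shrinker_shrink() otherwise.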
* [PATCH v11 07/25] shrinker: convert superblock shrinkers to new API
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Convert superblock shrinker to use the new count/scan API, and
propagate the API changes through to the filesystem callouts. The
filesystem callouts already use a count/scan API, so it's just
changing counters to longs to match the VM API.
This requires the dentry and inode shrinker callouts to be converted
to the count/scan API. This is mainly a mechanical change.
[ v8: fix super_cache_count() return value ]
[ glommer: use mult_frac for fractional proportions, build fixes ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
fs/dcache.c | 10 ++++---
fs/inode.c | 7 +++--
fs/internal.h | 2 ++
fs/super.c | 78 ++++++++++++++++++++++++++++++++---------------------
fs/xfs/xfs_icache.c | 4 +--
fs/xfs/xfs_icache.h | 2 +-
fs/xfs/xfs_super.c | 8 +++---
include/linux/fs.h | 8 ++----
8 files changed, 69 insertions(+), 50 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 16b599e..d7609a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -868,11 +868,12 @@ static void shrink_dentry_list(struct list_head *list)
* This function may fail to free any resources if all the dentries are in
* use.
*/
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan)
{
struct dentry *dentry;
LIST_HEAD(referenced);
LIST_HEAD(tmp);
+ long freed = 0;
relock:
spin_lock(&sb->s_dentry_lru_lock);
@@ -897,7 +898,8 @@ relock:
this_cpu_dec(nr_dentry_unused);
sb->s_nr_dentry_unused--;
spin_unlock(&dentry->d_lock);
- if (!--count)
+ freed++;
+ if (!--nr_to_scan)
break;
}
cond_resched_lock(&sb->s_dentry_lru_lock);
@@ -907,6 +909,7 @@ relock:
spin_unlock(&sb->s_dentry_lru_lock);
shrink_dentry_list(&tmp);
+ return freed;
}
/*
@@ -1294,9 +1297,8 @@ rename_retry:
void shrink_dcache_parent(struct dentry * parent)
{
LIST_HEAD(dispose);
- int found;
- while ((found = select_parent(parent, &dispose)) != 0) {
+ while (select_parent(parent, &dispose)) {
shrink_dentry_list(&dispose);
cond_resched();
}
diff --git a/fs/inode.c b/fs/inode.c
index ff29765..1ddaa2e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -704,10 +704,11 @@ static int can_unuse(struct inode *inode)
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
-void prune_icache_sb(struct super_block *sb, int nr_to_scan)
+long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan)
{
LIST_HEAD(freeable);
- int nr_scanned;
+ long nr_scanned;
+ long freed = 0;
unsigned long reap = 0;
spin_lock(&sb->s_inode_lru_lock);
@@ -777,6 +778,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
list_move(&inode->i_lru, &freeable);
sb->s_nr_inodes_unused--;
this_cpu_dec(nr_unused);
+ freed++;
}
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -787,6 +789,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
current->reclaim_state->reclaimed_slab += reap;
dispose_list(&freeable);
+ return freed;
}
static void __wait_on_freeing_inode(struct inode *inode);
diff --git a/fs/internal.h b/fs/internal.h
index cd5009f..ea43c89 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
* inode.c
*/
extern spinlock_t inode_sb_list_lock;
+extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan);
extern void inode_add_lru(struct inode *inode);
/*
@@ -125,6 +126,7 @@ extern int invalidate_inodes(struct super_block *, bool);
* dcache.c
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
+extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan);
/*
* read_write.c
diff --git a/fs/super.c b/fs/super.c
index 0be75fb..86801eb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -53,11 +53,14 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct super_block *sb;
- int fs_objects = 0;
- int total_objects;
+ long fs_objects = 0;
+ long total_objects;
+ long freed = 0;
+ long dentries;
+ long inodes;
sb = container_of(shrink, struct super_block, s_shrink);
@@ -65,11 +68,11 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
* Deadlock avoidance. We may hold various FS locks, and we don't want
* to recurse into the FS that called us in clear_inode() and friends.
*/
- if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
- return -1;
+ if (!(sc->gfp_mask & __GFP_FS))
+ return SHRINK_STOP;
if (!grab_super_passive(sb))
- return -1;
+ return SHRINK_STOP;
if (sb->s_op && sb->s_op->nr_cached_objects)
fs_objects = sb->s_op->nr_cached_objects(sb);
@@ -77,33 +80,45 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
total_objects = sb->s_nr_dentry_unused +
sb->s_nr_inodes_unused + fs_objects + 1;
- if (sc->nr_to_scan) {
- int dentries;
- int inodes;
-
- /* proportion the scan between the caches */
- dentries = mult_frac(sc->nr_to_scan, sb->s_nr_dentry_unused,
- total_objects);
- inodes = mult_frac(sc->nr_to_scan, sb->s_nr_inodes_unused,
- total_objects);
- if (fs_objects)
- fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
- total_objects);
- /*
- * prune the dcache first as the icache is pinned by it, then
- * prune the icache, followed by the filesystem specific caches
- */
- prune_dcache_sb(sb, dentries);
- prune_icache_sb(sb, inodes);
+ /* proportion the scan between the caches */
+ dentries = mult_frac(sc->nr_to_scan, sb->s_nr_dentry_unused,
+ total_objects);
+ inodes = mult_frac(sc->nr_to_scan, sb->s_nr_inodes_unused,
+ total_objects);
- if (fs_objects && sb->s_op->free_cached_objects) {
- sb->s_op->free_cached_objects(sb, fs_objects);
- fs_objects = sb->s_op->nr_cached_objects(sb);
- }
- total_objects = sb->s_nr_dentry_unused +
- sb->s_nr_inodes_unused + fs_objects;
+ /*
+ * prune the dcache first as the icache is pinned by it, then
+ * prune the icache, followed by the filesystem specific caches
+ */
+ freed = prune_dcache_sb(sb, dentries);
+ freed += prune_icache_sb(sb, inodes);
+
+ if (fs_objects) {
+ fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
+ total_objects);
+ freed += sb->s_op->free_cached_objects(sb, fs_objects);
}
+ drop_super(sb);
+ return freed;
+}
+
+static long super_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct super_block *sb;
+ long total_objects = 0;
+
+ sb = container_of(shrink, struct super_block, s_shrink);
+
+ if (!grab_super_passive(sb))
+ return 0;
+
+ if (sb->s_op && sb->s_op->nr_cached_objects)
+ total_objects = sb->s_op->nr_cached_objects(sb);
+
+ total_objects += sb->s_nr_dentry_unused;
+ total_objects += sb->s_nr_inodes_unused;
+
total_objects = vfs_pressure_ratio(total_objects);
drop_super(sb);
return total_objects;
@@ -217,7 +232,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->cleancache_poolid = -1;
s->s_shrink.seeks = DEFAULT_SEEKS;
- s->s_shrink.shrink = prune_super;
+ s->s_shrink.scan_objects = super_cache_scan;
+ s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
}
out:
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e..b35c311 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1164,7 +1164,7 @@ xfs_reclaim_inodes(
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
*/
-void
+long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
int nr_to_scan)
@@ -1173,7 +1173,7 @@ xfs_reclaim_inodes_nr(
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}
/*
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c..2d6d2d3 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -31,7 +31,7 @@ void xfs_reclaim_worker(struct work_struct *work);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5..443a8bc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1534,19 +1534,19 @@ xfs_fs_mount(
return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
}
-static int
+static long
xfs_fs_nr_cached_objects(
struct super_block *sb)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
-static void
+static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- int nr_to_scan)
+ long nr_to_scan)
{
- xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+ return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
}
static const struct super_operations xfs_super_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 41cbe7a..2913d3b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1327,10 +1327,6 @@ struct super_block {
int s_readonly_remount;
};
-/* superblock cache pruning functions */
-extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
-extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
-
extern struct timespec current_fs_time(struct super_block *sb);
/*
@@ -1617,8 +1613,8 @@ struct super_operations {
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
- int (*nr_cached_objects)(struct super_block *);
- void (*free_cached_objects)(struct super_block *, int);
+ long (*nr_cached_objects)(struct super_block *);
+ long (*free_cached_objects)(struct super_block *, long);
};
/*
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
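A worked example of the proportioning above (all numbers invented): suppose a
superblock has 600 unused dentries, 300 unused inodes and 100
filesystem-private objects, and super_cache_scan() is entered with
sc->nr_to_scan = 128. Then:

	total_objects = 600 + 300 + 100 + 1 = 1001
	dentries   = mult_frac(128, 600, 1001) = 76
	inodes     = mult_frac(128, 300, 1001) = 38
	fs_objects = mult_frac(128, 100, 1001) = 12

mult_frac(x, n, d) computes x * n / d without overflowing the intermediate
product, which is why it replaces an open-coded multiply-then-divide; the
"+ 1" in total_objects keeps the divisor non-zero when all the caches are
empty. The scan pressure is thus split between the dcache, the icache and the
per-filesystem cache in proportion to their sizes.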
* [PATCH v11 08/25] list: add a new LRU list type
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Several subsystems use the same construct for LRU lists - a list
head, a spin lock and an item count. They also use exactly the same
code for adding and removing items from the LRU. Create a generic
type for these LRU lists.
This is the beginning of generic, node aware LRUs for shrinkers to
work with.
[ glommer: enum defined constants for lru. Suggested by gthelen,
don't relock over retry ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Reviewed-by: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
include/linux/list_lru.h | 115 ++++++++++++++++++++++++++++++++++++++++++++++
mm/Makefile | 2 +-
mm/list_lru.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 233 insertions(+), 1 deletion(-)
create mode 100644 include/linux/list_lru.h
create mode 100644 mm/list_lru.c
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
new file mode 100644
index 0000000..1a548b0
--- /dev/null
+++ b/include/linux/list_lru.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#ifndef _LRU_LIST_H
+#define _LRU_LIST_H
+
+#include <linux/list.h>
+
+/* list_lru_walk_cb has to always return one of those */
+enum lru_status {
+ LRU_REMOVED, /* item removed from list */
+ LRU_ROTATE, /* item referenced, give another pass */
+ LRU_SKIP, /* item cannot be locked, skip */
+ LRU_RETRY, /* item not freeable. May drop the lock
+ internally, but has to return locked. */
+};
+
+struct list_lru {
+ spinlock_t lock;
+ struct list_head list;
+ /* kept as signed so we can catch imbalance bugs */
+ long nr_items;
+};
+
+int list_lru_init(struct list_lru *lru);
+
+/**
+ * list_lru_add: add an element to the lru list's tail
+ * @list_lru: the lru pointer
+ * @item: the item to be added.
+ *
+ * If the element is already part of a list, this function returns without
+ * doing anything. Therefore the caller does not need to keep state about
+ * whether or not the element already belongs to the list and is allowed to
+ * update it lazily. Note however that this is valid for *a* list, not *this*
+ * list: if the caller organizes itself in a way that elements can be in more
+ * than one type of list, it is up to the caller to fully remove the item from
+ * the previous list (with list_lru_del() for instance) before moving it to
+ * @list_lru.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add(struct list_lru *lru, struct list_head *item);
+
+/**
+ * list_lru_del: delete an element from the lru list
+ * @list_lru: the lru pointer
+ * @item: the item to be deleted.
+ *
+ * This function works analogously to list_lru_add in terms of list
+ * manipulation. The comments about an element already belonging to
+ * a list are also valid for list_lru_del.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_del(struct list_lru *lru, struct list_head *item);
+
+/**
+ * list_lru_count: return the number of objects currently held by @lru
+ * @lru: the lru pointer.
+ *
+ * Always return a non-negative number, 0 for empty lists. There is no
+ * guarantee that the list is not updated while the count is being computed.
+ * Callers that want such a guarantee need to provide an outer lock.
+ */
+static inline unsigned long list_lru_count(struct list_lru *lru)
+{
+ return lru->nr_items;
+}
+
+typedef enum lru_status
+(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
+/**
+ * list_lru_walk: walk a list_lru, isolating and disposing freeable items.
+ * @lru: the lru pointer.
+ * @isolate: callback function that is responsible for deciding what to do with
+ * the item currently being scanned
+ * @cb_arg: opaque type that will be passed to @isolate
+ * @nr_to_walk: how many items to scan.
+ *
+ * This function will scan all elements in a particular list_lru, calling the
+ * @isolate callback for each of those items, along with the current list
+ * spinlock and a caller-provided opaque argument. The @isolate callback may
+ * drop the lock internally, but *must* return with the lock held. The callback
+ * will return an enum lru_status telling the list_lru infrastructure what to
+ * do with the object being scanned.
+ *
+ * Please note that nr_to_walk does not mean how many objects will be freed,
+ * just how many objects will be scanned.
+ *
+ * Return value: the number of objects effectively removed from the LRU.
+ */
+unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long nr_to_walk);
+
+typedef void (*list_lru_dispose_cb)(struct list_head *dispose_list);
+/**
+ * list_lru_dispose_all: forcibly flush all elements in an @lru
+ * @lru: the lru pointer
+ * @dispose: callback function to be called for each lru list.
+ *
+ * This function will forcibly isolate all elements into the dispose list, and
+ * call the @dispose callback to flush the list. Please note that the callback
+ * should expect items in any state, clean or dirty, and be able to flush all of
+ * them.
+ *
+ * Return value: how many objects were freed. It should be equal to the total
+ * number of objects in the list_lru.
+ */
+unsigned long
+list_lru_dispose_all(struct list_lru *lru, list_lru_dispose_cb dispose);
+#endif /* _LRU_LIST_H */
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb..db430a4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
- interval_tree.o $(mmu-y)
+ interval_tree.o list_lru.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/list_lru.c b/mm/list_lru.c
new file mode 100644
index 0000000..dd74c54
--- /dev/null
+++ b/mm/list_lru.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list_lru.h>
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+ spin_lock(&lru->lock);
+ if (list_empty(item)) {
+ list_add_tail(item, &lru->list);
+ lru->nr_items++;
+ spin_unlock(&lru->lock);
+ return true;
+ }
+ spin_unlock(&lru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item)
+{
+ spin_lock(&lru->lock);
+ if (!list_empty(item)) {
+ list_del_init(item);
+ lru->nr_items--;
+ spin_unlock(&lru->lock);
+ return true;
+ }
+ spin_unlock(&lru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del);
+
+unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long nr_to_walk)
+{
+ struct list_head *item, *n;
+ unsigned long removed = 0;
+ /*
+ * If we don't keep track of which pass we are on, we can loop forever on
+ * LRU_RETRY, since we have no guarantee that the caller will be able
+ * to do something other than retry on the next pass. We handle this by
+ * allowing at most one retry per object. This should not be altered
+ * by any condition other than LRU_RETRY.
+ */
+ bool first_pass = true;
+
+ spin_lock(&lru->lock);
+restart:
+ list_for_each_safe(item, n, &lru->list) {
+ enum lru_status ret;
+ ret = isolate(item, &lru->lock, cb_arg);
+ switch (ret) {
+ case LRU_REMOVED:
+ lru->nr_items--;
+ removed++;
+ break;
+ case LRU_ROTATE:
+ list_move_tail(item, &lru->list);
+ break;
+ case LRU_SKIP:
+ break;
+ case LRU_RETRY:
+ if (!first_pass) {
+ first_pass = true;
+ break;
+ }
+ first_pass = false;
+ goto restart;
+ default:
+ BUG();
+ }
+
+ if (nr_to_walk-- == 0)
+ break;
+
+ }
+ spin_unlock(&lru->lock);
+ return removed;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk);
+
+unsigned long list_lru_dispose_all(struct list_lru *lru,
+ list_lru_dispose_cb dispose)
+{
+ unsigned long disposed = 0;
+ LIST_HEAD(dispose_list);
+
+ spin_lock(&lru->lock);
+ while (!list_empty(&lru->list)) {
+ list_splice_init(&lru->list, &dispose_list);
+ disposed += lru->nr_items;
+ lru->nr_items = 0;
+ spin_unlock(&lru->lock);
+
+ dispose(&dispose_list);
+
+ spin_lock(&lru->lock);
+ }
+ spin_unlock(&lru->lock);
+ return disposed;
+}
+
+int list_lru_init(struct list_lru *lru)
+{
+ spin_lock_init(&lru->lock);
+ INIT_LIST_HEAD(&lru->list);
+ lru->nr_items = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(list_lru_init);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
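As a usage sketch of the new type (the demo_* structure and helpers are
invented; the list_lru calls and the lru_status contract are the ones
documented above), a cache built on a list_lru might look like:

struct demo_object {
	spinlock_t lock;
	bool in_use;
	struct list_head lru;	/* linked into a struct list_lru */
};

static enum lru_status demo_isolate(struct list_head *item,
				    spinlock_t *lru_lock, void *cb_arg)
{
	struct list_head *freeable = cb_arg;
	struct demo_object *obj = container_of(item, struct demo_object, lru);

	/*
	 * lru_lock is held on entry; take the object lock with a trylock
	 * so we don't invert the usual object lock -> lru lock order.
	 */
	if (!spin_trylock(&obj->lock))
		return LRU_SKIP;

	if (obj->in_use) {
		spin_unlock(&obj->lock);
		return LRU_ROTATE;	/* still referenced, one more pass */
	}

	/* Move it off the LRU; LRU_REMOVED makes the walker drop nr_items. */
	list_move(item, freeable);
	spin_unlock(&obj->lock);
	return LRU_REMOVED;
}

static unsigned long demo_prune(struct list_lru *lru, unsigned long nr_to_scan)
{
	LIST_HEAD(freeable);
	unsigned long freed;

	freed = list_lru_walk(lru, demo_isolate, &freeable, nr_to_scan);
	demo_dispose_list(&freeable);	/* hypothetical: frees isolated objects */
	return freed;
}

This mirrors the shape the next patch gives the inode cache, where
prune_icache_sb() becomes a wrapper around list_lru_walk() with
inode_lru_isolate() as the callback; at unmount time a cache can flush
everything unconditionally with list_lru_dispose_all().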
* [PATCH v11 09/25] inode: convert inode lru list to generic lru list code.
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
[ glommer: adapted for new LRU return codes ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
fs/inode.c | 175 +++++++++++++++++++++--------------------------------
fs/super.c | 12 ++--
include/linux/fs.h | 6 +-
3 files changed, 77 insertions(+), 116 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 1ddaa2e..5d85521 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
+#include <linux/list_lru.h>
#include "internal.h"
/*
@@ -24,7 +25,7 @@
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
- * inode->i_sb->s_inode_lru_lock protects:
+ * Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
@@ -37,7 +38,7 @@
*
* inode_sb_list_lock
* inode->i_lock
- * inode->i_sb->s_inode_lru_lock
+ * Inode LRU list locks
*
* bdi->wb.list_lock
* inode->i_lock
@@ -399,13 +400,8 @@ EXPORT_SYMBOL(ihold);
static void inode_lru_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_lru_lock);
- if (list_empty(&inode->i_lru)) {
- list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
- inode->i_sb->s_nr_inodes_unused++;
+ if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- }
- spin_unlock(&inode->i_sb->s_inode_lru_lock);
}
/*
@@ -423,13 +419,9 @@ void inode_add_lru(struct inode *inode)
static void inode_lru_list_del(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_lru_lock);
- if (!list_empty(&inode->i_lru)) {
- list_del_init(&inode->i_lru);
- inode->i_sb->s_nr_inodes_unused--;
+
+ if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
- }
- spin_unlock(&inode->i_sb->s_inode_lru_lock);
}
/**
@@ -673,24 +665,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
return busy;
}
-static int can_unuse(struct inode *inode)
-{
- if (inode->i_state & ~I_REFERENCED)
- return 0;
- if (inode_has_buffers(inode))
- return 0;
- if (atomic_read(&inode->i_count))
- return 0;
- if (inode->i_data.nrpages)
- return 0;
- return 1;
-}
-
/*
- * Walk the superblock inode LRU for freeable inodes and attempt to free them.
- * This is called from the superblock shrinker function with a number of inodes
- * to trim from the LRU. Inodes to be freed are moved to a temporary list and
- * then are freed outside inode_lock by dispose_list().
+ * Isolate the inode from the LRU in preparation for freeing it.
*
* Any inodes which are pinned purely because of attached pagecache have their
* pagecache removed. If the inode has metadata buffers attached to
@@ -704,90 +680,79 @@ static int can_unuse(struct inode *inode)
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan)
+static enum lru_status
+inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
{
- LIST_HEAD(freeable);
- long nr_scanned;
- long freed = 0;
- unsigned long reap = 0;
+ struct list_head *freeable = arg;
+ struct inode *inode = container_of(item, struct inode, i_lru);
- spin_lock(&sb->s_inode_lru_lock);
- for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
- struct inode *inode;
+ /*
+ * we are inverting the lru lock/inode->i_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&inode->i_lock))
+ return LRU_SKIP;
- if (list_empty(&sb->s_inode_lru))
- break;
+ /*
+ * Referenced or dirty inodes are still in use. Give them another pass
+ * through the LRU as we cannot reclaim them now.
+ */
+ if (atomic_read(&inode->i_count) ||
+ (inode->i_state & ~I_REFERENCED)) {
+ list_del_init(&inode->i_lru);
+ spin_unlock(&inode->i_lock);
+ this_cpu_dec(nr_unused);
+ return LRU_REMOVED;
+ }
- inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
+ /* recently referenced inodes get one more pass */
+ if (inode->i_state & I_REFERENCED) {
+ inode->i_state &= ~I_REFERENCED;
+ spin_unlock(&inode->i_lock);
+ return LRU_ROTATE;
+ }
- /*
- * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
- * so use a trylock. If we fail to get the lock, just move the
- * inode to the back of the list so we don't spin on it.
- */
- if (!spin_trylock(&inode->i_lock)) {
- list_move(&inode->i_lru, &sb->s_inode_lru);
- continue;
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(lru_lock);
+ if (remove_inode_buffers(inode)) {
+ unsigned long reap;
+ reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_INODESTEAL, reap);
+ else
+ __count_vm_events(PGINODESTEAL, reap);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += reap;
}
+ iput(inode);
+ spin_lock(lru_lock);
+ return LRU_RETRY;
+ }
- /*
- * Referenced or dirty inodes are still in use. Give them
- * another pass through the LRU as we canot reclaim them now.
- */
- if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
- list_del_init(&inode->i_lru);
- spin_unlock(&inode->i_lock);
- sb->s_nr_inodes_unused--;
- this_cpu_dec(nr_unused);
- continue;
- }
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state |= I_FREEING;
+ spin_unlock(&inode->i_lock);
- /* recently referenced inodes get one more pass */
- if (inode->i_state & I_REFERENCED) {
- inode->i_state &= ~I_REFERENCED;
- list_move(&inode->i_lru, &sb->s_inode_lru);
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_lru_lock);
- if (remove_inode_buffers(inode))
- reap += invalidate_mapping_pages(&inode->i_data,
- 0, -1);
- iput(inode);
- spin_lock(&sb->s_inode_lru_lock);
-
- if (inode != list_entry(sb->s_inode_lru.next,
- struct inode, i_lru))
- continue; /* wrong inode or list_empty */
- /* avoid lock inversions with trylock */
- if (!spin_trylock(&inode->i_lock))
- continue;
- if (!can_unuse(inode)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- }
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state |= I_FREEING;
- spin_unlock(&inode->i_lock);
+ list_move(&inode->i_lru, freeable);
+ this_cpu_dec(nr_unused);
+ return LRU_REMOVED;
+}
- list_move(&inode->i_lru, &freeable);
- sb->s_nr_inodes_unused--;
- this_cpu_dec(nr_unused);
- freed++;
- }
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_INODESTEAL, reap);
- else
- __count_vm_events(PGINODESTEAL, reap);
- spin_unlock(&sb->s_inode_lru_lock);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += reap;
+/*
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
+ */
+long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ long freed;
+ freed = list_lru_walk(&sb->s_inode_lru, inode_lru_isolate,
+ &freeable, nr_to_scan);
dispose_list(&freeable);
return freed;
}
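For readers new to the list_lru API, the contract between the generic walker and the isolate callback above comes down to the three lru_status codes. Here is a minimal userspace model of that contract (toy list, invented names, no locking; a sketch of the control flow, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the kernel types, simplified for illustration. */
enum lru_status { LRU_REMOVED, LRU_ROTATE, LRU_SKIP };

struct item {
        int id;
        int referenced;         /* models the I_REFERENCED bit */
        struct item *next;
};

static struct item *head, *tail;

static void push_tail(struct item *it)
{
        it->next = NULL;
        if (tail)
                tail->next = it;
        else
                head = it;
        tail = it;
}

/* Referenced items get one more pass through the list; everything else
 * is reclaimed, the same second-chance policy as inode_lru_isolate(). */
static enum lru_status isolate(struct item *it)
{
        if (it->referenced) {
                it->referenced = 0;
                return LRU_ROTATE;
        }
        return LRU_REMOVED;
}

/* A miniature list_lru_walk(): scan at most nr_to_walk items from the
 * head and dispatch on the callback's lru_status return code. */
static unsigned long walk(unsigned long nr_to_walk)
{
        unsigned long removed = 0;

        while (head && nr_to_walk--) {
                struct item *it = head;

                head = it->next;
                if (!head)
                        tail = NULL;

                switch (isolate(it)) {
                case LRU_REMOVED:
                        removed++;
                        free(it);
                        break;
                case LRU_ROTATE:        /* give it another pass */
                case LRU_SKIP:          /* toy model: requeue it */
                        push_tail(it);
                        break;
                }
        }
        return removed;
}

int main(void)
{
        for (int i = 0; i < 6; i++) {
                struct item *it = calloc(1, sizeof(*it));

                it->id = i;
                it->referenced = i & 1; /* odd items start referenced */
                push_tail(it);
        }
        printf("freed %lu\n", walk(6)); /* 3: referenced items survive */
        printf("freed %lu\n", walk(6)); /* 3: second pass reclaims them */
        return 0;
}

The first pass frees the unreferenced half and strips the reference bit from the rest; the second pass then reclaims those too, which is the second-chance behaviour the LRU_ROTATE arm buys.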
diff --git a/fs/super.c b/fs/super.c
index 86801eb..fea5c44 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -77,14 +77,13 @@ static long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
if (sb->s_op && sb->s_op->nr_cached_objects)
fs_objects = sb->s_op->nr_cached_objects(sb);
- total_objects = sb->s_nr_dentry_unused +
- sb->s_nr_inodes_unused + fs_objects + 1;
+ inodes = list_lru_count(&sb->s_inode_lru);
+ total_objects = sb->s_nr_dentry_unused + inodes + fs_objects + 1;
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, sb->s_nr_dentry_unused,
total_objects);
- inodes = mult_frac(sc->nr_to_scan, sb->s_nr_inodes_unused,
- total_objects);
+ inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
/*
* prune the dcache first as the icache is pinned by it, then
@@ -117,7 +116,7 @@ static long super_cache_count(struct shrinker *shrink, struct shrink_control *sc
total_objects = sb->s_op->nr_cached_objects(sb);
total_objects += sb->s_nr_dentry_unused;
- total_objects += sb->s_nr_inodes_unused;
+ total_objects += list_lru_count(&sb->s_inode_lru);
total_objects = vfs_pressure_ratio(total_objects);
drop_super(sb);
@@ -198,8 +197,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_LIST_HEAD(&s->s_inodes);
INIT_LIST_HEAD(&s->s_dentry_lru);
spin_lock_init(&s->s_dentry_lru_lock);
- INIT_LIST_HEAD(&s->s_inode_lru);
- spin_lock_init(&s->s_inode_lru_lock);
+ list_lru_init(&s->s_inode_lru);
INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2913d3b..a50f175 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
+#include <linux/list_lru.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/init.h>
@@ -1270,10 +1271,7 @@ struct super_block {
struct list_head s_dentry_lru; /* unused dentry lru */
long s_nr_dentry_unused; /* # of dentry on lru */
- /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
- spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp;
- struct list_head s_inode_lru; /* unused inode lru */
- long s_nr_inodes_unused; /* # of inodes on lru */
+ struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 10/25] dcache: convert to use new lru list infrastructure
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
[ glommer: don't reintroduce double decrement of nr_unused_dentries,
adapted for new LRU return codes ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
fs/dcache.c | 165 ++++++++++++++++++++++++-----------------------------
fs/super.c | 11 ++--
include/linux/fs.h | 15 +++--
3 files changed, 87 insertions(+), 104 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index d7609a0..00722b3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
+#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"
@@ -331,20 +332,8 @@ static void dentry_unlink_inode(struct dentry * dentry)
*/
static void dentry_lru_add(struct dentry *dentry)
{
- if (list_empty(&dentry->d_lru)) {
- spin_lock(&dentry->d_sb->s_dentry_lru_lock);
- list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
- dentry->d_sb->s_nr_dentry_unused++;
+ if (list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru))
this_cpu_inc(nr_dentry_unused);
- spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
- }
-}
-
-static void __dentry_lru_del(struct dentry *dentry)
-{
- list_del_init(&dentry->d_lru);
- dentry->d_sb->s_nr_dentry_unused--;
- this_cpu_dec(nr_dentry_unused);
}
/*
@@ -362,26 +351,8 @@ static void dentry_lru_del(struct dentry *dentry)
return;
}
- if (!list_empty(&dentry->d_lru)) {
- spin_lock(&dentry->d_sb->s_dentry_lru_lock);
- __dentry_lru_del(dentry);
- spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
- }
-}
-
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
-{
- BUG_ON(dentry->d_flags & DCACHE_SHRINK_LIST);
-
- spin_lock(&dentry->d_sb->s_dentry_lru_lock);
- if (list_empty(&dentry->d_lru)) {
- list_add_tail(&dentry->d_lru, list);
- } else {
- list_move_tail(&dentry->d_lru, list);
- dentry->d_sb->s_nr_dentry_unused--;
+ if (list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru))
this_cpu_dec(nr_dentry_unused);
- }
- spin_unlock(&dentry->d_sb->s_dentry_lru_lock);
}
/**
@@ -856,12 +827,72 @@ static void shrink_dentry_list(struct list_head *list)
rcu_read_unlock();
}
+static enum lru_status
+dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *freeable = arg;
+ struct dentry *dentry = container_of(item, struct dentry, d_lru);
+
+
+ /*
+ * we are inverting the lru lock/dentry->d_lock here,
+ * so use a trylock. If we fail to get the lock, just skip
+ * it
+ */
+ if (!spin_trylock(&dentry->d_lock))
+ return LRU_SKIP;
+
+ /*
+ * Referenced dentries are still in use. If they have active
+ * counts, just remove them from the LRU. Otherwise give them
+ * another pass through the LRU.
+ */
+ if (dentry->d_count) {
+ list_del_init(&dentry->d_lru);
+ spin_unlock(&dentry->d_lock);
+ return LRU_REMOVED;
+ }
+
+ if (dentry->d_flags & DCACHE_REFERENCED) {
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ spin_unlock(&dentry->d_lock);
+
+ /*
+ * The list move itself will be made by the common LRU code. At
+ * this point, we've dropped the dentry->d_lock but keep the
+ * lru lock. This is safe to do, since every list movement is
+ * protected by the lru lock even if both locks are held.
+ *
+ * This is guaranteed by the fact that all LRU management
+ * functions are intermediated by the LRU API calls like
+ * list_lru_add and list_lru_del. List movement in this file
+ * only ever occurs through these functions or through callbacks
+ * like this one, which are called from the LRU API.
+ *
+ * The only exceptions to this are functions like
+ * shrink_dentry_list, and code that first checks for the
+ * DCACHE_SHRINK_LIST flag. Those are guaranteed to be
+ * operating only with stack provided lists after they are
+ * properly isolated from the main list. It is thus always a
+ * local access.
+ */
+ return LRU_ROTATE;
+ }
+
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ list_move_tail(&dentry->d_lru, freeable);
+ this_cpu_dec(nr_dentry_unused);
+ spin_unlock(&dentry->d_lock);
+
+ return LRU_REMOVED;
+}
+
/**
* prune_dcache_sb - shrink the dcache
* @sb: superblock
- * @count: number of entries to try to free
+ * @nr_to_scan: number of entries to try to free
*
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
* done when we need more memory and called from the superblock shrinker
* function.
*
@@ -870,45 +901,12 @@ static void shrink_dentry_list(struct list_head *list)
*/
long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan)
{
- struct dentry *dentry;
- LIST_HEAD(referenced);
- LIST_HEAD(tmp);
- long freed = 0;
-
-relock:
- spin_lock(&sb->s_dentry_lru_lock);
- while (!list_empty(&sb->s_dentry_lru)) {
- dentry = list_entry(sb->s_dentry_lru.prev,
- struct dentry, d_lru);
- BUG_ON(dentry->d_sb != sb);
-
- if (!spin_trylock(&dentry->d_lock)) {
- spin_unlock(&sb->s_dentry_lru_lock);
- cpu_relax();
- goto relock;
- }
-
- if (dentry->d_flags & DCACHE_REFERENCED) {
- dentry->d_flags &= ~DCACHE_REFERENCED;
- list_move(&dentry->d_lru, &referenced);
- spin_unlock(&dentry->d_lock);
- } else {
- list_move(&dentry->d_lru, &tmp);
- dentry->d_flags |= DCACHE_SHRINK_LIST;
- this_cpu_dec(nr_dentry_unused);
- sb->s_nr_dentry_unused--;
- spin_unlock(&dentry->d_lock);
- freed++;
- if (!--nr_to_scan)
- break;
- }
- cond_resched_lock(&sb->s_dentry_lru_lock);
- }
- if (!list_empty(&referenced))
- list_splice(&referenced, &sb->s_dentry_lru);
- spin_unlock(&sb->s_dentry_lru_lock);
+ LIST_HEAD(dispose);
+ long freed;
- shrink_dentry_list(&tmp);
+ freed = list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate,
+ &dispose, nr_to_scan);
+ shrink_dentry_list(&dispose);
return freed;
}
@@ -942,24 +940,10 @@ shrink_dcache_list(
*/
void shrink_dcache_sb(struct super_block *sb)
{
- LIST_HEAD(tmp);
-
- spin_lock(&sb->s_dentry_lru_lock);
- while (!list_empty(&sb->s_dentry_lru)) {
- /*
- * account for removal here so we don't need to handle it later
- * even though the dentry is no longer on the lru list.
- */
- list_splice_init(&sb->s_dentry_lru, &tmp);
- this_cpu_sub(nr_dentry_unused, sb->s_nr_dentry_unused);
- sb->s_nr_dentry_unused = 0;
- spin_unlock(&sb->s_dentry_lru_lock);
+ long disposed;
- shrink_dcache_list(&tmp);
-
- spin_lock(&sb->s_dentry_lru_lock);
- }
- spin_unlock(&sb->s_dentry_lru_lock);
+ disposed = list_lru_dispose_all(&sb->s_dentry_lru, shrink_dcache_list);
+ this_cpu_sub(nr_dentry_unused, disposed);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1232,7 +1216,8 @@ resume:
if (dentry->d_count) {
dentry_lru_del(dentry);
} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- dentry_lru_move_list(dentry, dispose);
+ dentry_lru_del(dentry);
+ list_add_tail(&dentry->d_lru, dispose);
dentry->d_flags |= DCACHE_SHRINK_LIST;
found++;
}
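The trylock in dentry_lru_isolate() is the standard answer to acquiring locks against their usual order: the callback already holds the lru lock, so blocking on d_lock could deadlock against a path that holds d_lock while waiting for the lru lock, and contention is therefore reported as LRU_SKIP. A self-contained pthreads sketch of the pattern (invented names, plain POSIX mutexes standing in for spinlocks):

#include <pthread.h>
#include <stdio.h>

/*
 * Usual order: object lock first, then lru lock.  A walker that already
 * holds the lru lock may only *try* the object lock; sleeping on it
 * could deadlock against a thread holding the object lock and waiting
 * for the lru lock.
 */
static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with lru_lock held; returns 1 if the object was processed. */
static int isolate_one(void)
{
        if (pthread_mutex_trylock(&obj_lock) != 0)
                return 0;       /* contended: skip, like LRU_SKIP */
        /* ... examine and unlink the object here ... */
        pthread_mutex_unlock(&obj_lock);
        return 1;
}

int main(void)
{
        pthread_mutex_lock(&lru_lock);
        printf("isolated: %d\n", isolate_one());
        pthread_mutex_unlock(&lru_lock);
        return 0;
}

Build with cc -pthread; in this uncontended single-threaded run the trylock always succeeds, but under contention the walker simply moves on rather than deadlocking.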
diff --git a/fs/super.c b/fs/super.c
index fea5c44..7fe934d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -78,11 +78,11 @@ static long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
fs_objects = sb->s_op->nr_cached_objects(sb);
inodes = list_lru_count(&sb->s_inode_lru);
- total_objects = sb->s_nr_dentry_unused + inodes + fs_objects + 1;
+ dentries = list_lru_count(&sb->s_dentry_lru);
+ total_objects = dentries + inodes + fs_objects + 1;
/* proportion the scan between the caches */
- dentries = mult_frac(sc->nr_to_scan, sb->s_nr_dentry_unused,
- total_objects);
+ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
/*
@@ -115,7 +115,7 @@ static long super_cache_count(struct shrinker *shrink, struct shrink_control *sc
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb);
- total_objects += sb->s_nr_dentry_unused;
+ total_objects += list_lru_count(&sb->s_dentry_lru);
total_objects += list_lru_count(&sb->s_inode_lru);
total_objects = vfs_pressure_ratio(total_objects);
@@ -195,8 +195,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
- INIT_LIST_HEAD(&s->s_dentry_lru);
- spin_lock_init(&s->s_dentry_lru_lock);
+ list_lru_init(&s->s_dentry_lru);
list_lru_init(&s->s_inode_lru);
INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a50f175..976258f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1265,14 +1265,6 @@ struct super_block {
struct list_head s_files;
#endif
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
-
- /* s_dentry_lru_lock protects s_dentry_lru and s_nr_dentry_unused */
- spinlock_t s_dentry_lru_lock ____cacheline_aligned_in_smp;
- struct list_head s_dentry_lru; /* unused dentry lru */
- long s_nr_dentry_unused; /* # of dentry on lru */
-
- struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
-
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
@@ -1323,6 +1315,13 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+
+ /*
+ * Keep the lru lists last in the structure so they always sit on their
+ * own individual cachelines.
+ */
+ struct list_lru s_dentry_lru ____cacheline_aligned_in_smp;
+ struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
};
extern struct timespec current_fs_time(struct super_block *sb);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
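A note on the proportioning in super_cache_scan(): mult_frac(x, n, d) computes x * n / d without overflowing the intermediate product, by splitting x into quotient and remainder with respect to d. The sketch below reproduces it in essentially the include/linux/kernel.h form (statement expressions, so it builds with gcc) and runs the dentry/inode split for one example shrink_control:

#include <stdio.h>

/* Simplified from include/linux/kernel.h: x * numer / denom without
 * overflowing the intermediate multiplication when x is large. */
#define mult_frac(x, numer, denom) ({                           \
        typeof(x) quot = (x) / (denom);                         \
        typeof(x) rem  = (x) % (denom);                         \
        (quot * (numer)) + ((rem * (numer)) / (denom));         \
})

int main(void)
{
        unsigned long nr_to_scan = 128; /* sc->nr_to_scan */
        unsigned long dentries = 600, inodes = 300, fs_objects = 0;
        unsigned long total = dentries + inodes + fs_objects + 1;

        /* the same proportioning done in super_cache_scan() */
        printf("scan %lu dentries, %lu inodes\n",
               mult_frac(nr_to_scan, dentries, total),  /* 85 */
               mult_frac(nr_to_scan, inodes, total));   /* 42 */
        return 0;
}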
* [PATCH v11 11/25] list_lru: per-node list infrastructure
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Now that we have an LRU list API, we can start to enhance the
implementation. This splits the single LRU list into per-node lists
and locks to enhance scalability. Items are placed on lists
according to the node the memory belongs to. To make scanning the
lists efficient, also track whether the per-node lists have entries
in them in an active nodemask.
Note:
We use a fixed-size array for the node LRU, so this struct can be very big
if MAX_NUMNODES is big. If this becomes a problem this is fixable by
turning this into a pointer and dynamically allocating this to
nr_node_ids. This quantity is firmware-provided, and still would provide
room for all nodes at the cost of a pointer lookup and an extra
allocation. Because that allocation will most likely come from a
different slab cache than the main structure holding this structure, we
may very well fail.
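The dynamically allocated alternative the note describes would look roughly like the userspace sketch below (stand-in types and values; a real conversion would also have to propagate allocation failure to every list_lru_init() caller):

#include <stdlib.h>

struct list_lru_node {
        long nr_items;          /* stands in for lock/list/count */
};

struct list_lru {
        struct list_lru_node *node;     /* nr_node_ids entries */
};

/* stand-in for the firmware-provided possible-node count */
static const int nr_node_ids = 4;

static int list_lru_init(struct list_lru *lru)
{
        /*
         * This allocation can fail, which is the cost the note weighs
         * against the memory saved when MAX_NUMNODES far exceeds
         * nr_node_ids.
         */
        lru->node = calloc(nr_node_ids, sizeof(*lru->node));
        return lru->node ? 0 : -1;
}

int main(void)
{
        struct list_lru lru;

        return list_lru_init(&lru);
}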
[ glommer: fixed warnings, added note about node lru ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Reviewed-by: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
include/linux/list_lru.h | 23 ++++++--
mm/list_lru.c | 146 +++++++++++++++++++++++++++++++++++------------
2 files changed, 129 insertions(+), 40 deletions(-)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 1a548b0..f4d4cb6 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -8,6 +8,7 @@
#define _LRU_LIST_H
#include <linux/list.h>
+#include <linux/nodemask.h>
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
@@ -18,11 +19,26 @@ enum lru_status {
internally, but has to return locked. */
};
-struct list_lru {
+struct list_lru_node {
spinlock_t lock;
struct list_head list;
/* kept as signed so we can catch imbalance bugs */
long nr_items;
+} ____cacheline_aligned_in_smp;
+
+struct list_lru {
+ /*
+ * Because we use a fixed-size array, this struct can be very big if
+ * MAX_NUMNODES is big. If this becomes a problem this is fixable by
+ * turning this into a pointer and dynamically allocating this to
+ * nr_node_ids. This quantity is firmware-provided, and still would
+ * provide room for all nodes at the cost of a pointer lookup and an
+ * extra allocation. Because that allocation will most likely come from
+ * a different slab cache than the main structure holding this
+ * structure, we may very well fail.
+ */
+ struct list_lru_node node[MAX_NUMNODES];
+ nodemask_t active_nodes;
};
int list_lru_init(struct list_lru *lru);
@@ -66,10 +82,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
-static inline unsigned long list_lru_count(struct list_lru *lru)
-{
- return lru->nr_items;
-}
+unsigned long list_lru_count(struct list_lru *lru);
typedef enum lru_status
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
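The interplay between the per-node counters and active_nodes is the heart of this patch: a node's bit is set when its first item arrives and cleared when its last item leaves, so counting and walking only visit populated nodes. A compact userspace model of that bookkeeping, with a 64-bit word standing in for nodemask_t:

#include <stdio.h>
#include <stdint.h>

#define MAX_NODES 64

static long nr_items[MAX_NODES];        /* per-node item counts */
static uint64_t active_nodes;           /* stand-in for nodemask_t */

static void lru_add(int nid)
{
        if (nr_items[nid]++ == 0)       /* first item: node_set() */
                active_nodes |= 1ULL << nid;
}

static void lru_del(int nid)
{
        if (--nr_items[nid] == 0)       /* last item: node_clear() */
                active_nodes &= ~(1ULL << nid);
}

static long lru_count(void)
{
        long count = 0;

        /* only populated nodes are visited, as with for_each_node_mask */
        for (int nid = 0; nid < MAX_NODES; nid++)
                if (active_nodes & (1ULL << nid))
                        count += nr_items[nid];
        return count;
}

int main(void)
{
        lru_add(0);
        lru_add(0);
        lru_add(3);
        lru_del(0);
        printf("count=%ld active=%#llx\n", lru_count(),
               (unsigned long long)active_nodes); /* count=2 active=0x9 */
        return 0;
}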
diff --git a/mm/list_lru.c b/mm/list_lru.c
index dd74c54..f2d1d6e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -6,41 +6,73 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/mm.h>
#include <linux/list_lru.h>
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
- spin_lock(&lru->lock);
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ BUG_ON(nlru->nr_items < 0);
if (list_empty(item)) {
- list_add_tail(item, &lru->list);
- lru->nr_items++;
- spin_unlock(&lru->lock);
+ list_add_tail(item, &nlru->list);
+ if (nlru->nr_items++ == 0)
+ node_set(nid, lru->active_nodes);
+ spin_unlock(&nlru->lock);
return true;
}
- spin_unlock(&lru->lock);
+ spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_add);
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
- spin_lock(&lru->lock);
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
if (!list_empty(item)) {
list_del_init(item);
- lru->nr_items--;
- spin_unlock(&lru->lock);
+ if (--nlru->nr_items == 0)
+ node_clear(nid, lru->active_nodes);
+ BUG_ON(nlru->nr_items < 0);
+ spin_unlock(&nlru->lock);
return true;
}
- spin_unlock(&lru->lock);
+ spin_unlock(&nlru->lock);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_del);
-unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long nr_to_walk)
+unsigned long list_lru_count(struct list_lru *lru)
{
+ unsigned long count = 0;
+ int nid;
+
+ for_each_node_mask(nid, lru->active_nodes) {
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ BUG_ON(nlru->nr_items < 0);
+ count += nlru->nr_items;
+ spin_unlock(&nlru->lock);
+ }
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(list_lru_count);
+
+static unsigned long
+list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long *nr_to_walk)
+{
+
+ struct list_lru_node *nlru = &lru->node[nid];
struct list_head *item, *n;
- unsigned long removed = 0;
+ unsigned long isolated = 0;
/*
* If we don't keep track of which pass we are on, we can loop at
* LRU_RETRY, since we have no guarantees that the caller will be able
@@ -50,18 +82,20 @@ unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
*/
bool first_pass = true;
- spin_lock(&lru->lock);
+ spin_lock(&nlru->lock);
restart:
- list_for_each_safe(item, n, &lru->list) {
+ list_for_each_safe(item, n, &nlru->list) {
enum lru_status ret;
- ret = isolate(item, &lru->lock, cb_arg);
+ ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED:
- lru->nr_items--;
- removed++;
+ if (--nlru->nr_items == 0)
+ node_clear(nid, lru->active_nodes);
+ BUG_ON(nlru->nr_items < 0);
+ isolated++;
break;
case LRU_ROTATE:
- list_move_tail(item, &lru->list);
+ list_move_tail(item, &nlru->list);
break;
case LRU_SKIP:
break;
@@ -76,42 +110,84 @@ restart:
BUG();
}
- if (nr_to_walk-- == 0)
+ if ((*nr_to_walk)-- == 0)
break;
}
- spin_unlock(&lru->lock);
- return removed;
+
+ spin_unlock(&nlru->lock);
+ return isolated;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_node);
+
+unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long nr_to_walk)
+{
+ unsigned long isolated = 0;
+ int nid;
+
+ for_each_node_mask(nid, lru->active_nodes) {
+ isolated += list_lru_walk_node(lru, nid, isolate,
+ cb_arg, &nr_to_walk);
+ if (nr_to_walk <= 0)
+ break;
+ }
+ return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk);
-unsigned long list_lru_dispose_all(struct list_lru *lru,
- list_lru_dispose_cb dispose)
+static unsigned long list_lru_dispose_all_node(struct list_lru *lru, int nid,
+ list_lru_dispose_cb dispose)
{
- unsigned long disposed = 0;
+ struct list_lru_node *nlru = &lru->node[nid];
LIST_HEAD(dispose_list);
+ unsigned long disposed = 0;
- spin_lock(&lru->lock);
- while (!list_empty(&lru->list)) {
- list_splice_init(&lru->list, &dispose_list);
- disposed += lru->nr_items;
- lru->nr_items = 0;
- spin_unlock(&lru->lock);
+ spin_lock(&nlru->lock);
+ while (!list_empty(&nlru->list)) {
+ list_splice_init(&nlru->list, &dispose_list);
+ disposed += nlru->nr_items;
+ nlru->nr_items = 0;
+ node_clear(nid, lru->active_nodes);
+ spin_unlock(&nlru->lock);
dispose(&dispose_list);
- spin_lock(&lru->lock);
+ spin_lock(&nlru->lock);
}
- spin_unlock(&lru->lock);
+ spin_unlock(&nlru->lock);
return disposed;
}
+unsigned long list_lru_dispose_all(struct list_lru *lru,
+ list_lru_dispose_cb dispose)
+{
+ unsigned long disposed;
+ unsigned long total = 0;
+ int nid;
+
+ do {
+ disposed = 0;
+ for_each_node_mask(nid, lru->active_nodes) {
+ disposed += list_lru_dispose_all_node(lru, nid,
+ dispose);
+ }
+ total += disposed;
+ } while (disposed != 0);
+
+ return total;
+}
+
int list_lru_init(struct list_lru *lru)
{
- spin_lock_init(&lru->lock);
- INIT_LIST_HEAD(&lru->list);
- lru->nr_items = 0;
+ int i;
+ nodes_clear(lru->active_nodes);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ spin_lock_init(&lru->node[i].lock);
+ INIT_LIST_HEAD(&lru->node[i].list);
+ lru->node[i].nr_items = 0;
+ }
return 0;
}
EXPORT_SYMBOL_GPL(list_lru_init);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
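One subtlety worth spelling out: list_lru_dispose_all() wraps the per-node pass in a do/while that repeats until a full pass disposes nothing, because dispose() runs with the per-node lock dropped and items can be queued again in the meantime. Its control flow, reduced to a toy sketch:

#include <stdio.h>

#define NODES 4

static long pending[NODES] = { 3, 0, 2, 0 };    /* items per node */

/* stands in for list_lru_dispose_all_node(): drain one node's list
 * and report how many items were freed */
static long dispose_node(int nid)
{
        long n = pending[nid];

        pending[nid] = 0;
        return n;
}

static long dispose_all(void)
{
        long total = 0, disposed;

        /* repeat until an entire pass over the nodes frees nothing */
        do {
                disposed = 0;
                for (int nid = 0; nid < NODES; nid++)
                        disposed += dispose_node(nid);
                total += disposed;
        } while (disposed);
        return total;
}

int main(void)
{
        printf("disposed %ld\n", dispose_all());        /* prints 5 */
        return 0;
}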
* [PATCH v11 13/25] shrinker: add node awareness
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Pass the node of the current zone being reclaimed to shrink_slab(),
allowing the shrinker control nodemask to be set appropriately for
node-aware shrinkers.
[ v3: update ashmem ]
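For illustration, this is the pattern the memory-failure hunk below
adopts (a sketch of this patch's own usage, not additional API): a
reclaim path populates the nodemask before calling shrink_slab():

	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};

	/* restrict slab reclaim to the node holding page p */
	node_set(page_to_nid(p), shrink.nodes_to_scan);
	shrink_slab(&shrink, 1000, 1000);

Paths that have no meaningful node, such as drop_caches, set the
whole mask with nodes_setall() instead.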
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
drivers/staging/android/ashmem.c | 3 +++
fs/drop_caches.c | 1 +
include/linux/shrinker.h | 3 +++
mm/memory-failure.c | 2 ++
mm/vmscan.c | 11 ++++++++---
5 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 21a3f72..65f36d7 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -692,6 +692,9 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
.gfp_mask = GFP_KERNEL,
.nr_to_scan = 0,
};
+
+ nodes_setall(sc.nodes_to_scan);
+
ret = ashmem_shrink(&ashmem_shrinker, &sc);
sc.nr_to_scan = ret;
ashmem_shrink(&ashmem_shrinker, &sc);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055..9fd702f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -44,6 +44,7 @@ static void drop_slab(void)
.gfp_mask = GFP_KERNEL,
};
+ nodes_setall(shrink.nodes_to_scan);
do {
nr_objects = shrink_slab(&shrink, 1000, 1000);
} while (nr_objects > 10);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 884e762..76f520c 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -16,6 +16,9 @@ struct shrink_control {
/* How many slab objects shrinker() should scan and try to reclaim */
unsigned long nr_to_scan;
+
+ /* shrink from these nodes */
+ nodemask_t nodes_to_scan;
};
#define SHRINK_STOP (~0UL)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2c13aa7..09ae111 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -248,10 +248,12 @@ void shake_page(struct page *p, int access)
*/
if (access) {
int nr;
+ int nid = page_to_nid(p);
do {
struct shrink_control shrink = {
.gfp_mask = GFP_KERNEL,
};
+ node_set(nid, shrink.nodes_to_scan);
nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dfc5685..f39cae0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2385,12 +2385,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
+
+ nodes_clear(shrink->nodes_to_scan);
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone),
+ shrink->nodes_to_scan);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2847,6 +2851,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
return true;
shrink_zone(zone, sc);
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
@@ -3555,10 +3561,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
- *
- * Note that shrink_slab will free memory on all zones and may
- * take a long time.
*/
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 14/25] vmscan: per-node deferred work
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Glauber Costa, Dave Chinner
The list_lru infrastructure already keeps per-node LRU lists in its
node-specific list_lru_node arrays and provides us with a per-node API, and
the shrinkers are properly equipped with node information. This means that
we can now focus our shrinking effort on a single node, but the work that is
deferred from one run to another is kept global in nr_in_batch. Work can be
deferred, for instance, during direct reclaim under a GFP_NOFS allocation,
in which situation all the filesystem shrinkers will be prevented from
running and will accumulate in nr_in_batch the amount of work they should
have done, but could not.
This creates an impedance problem: upon node pressure, deferred work
accumulates and ends up being flushed on other nodes. The problem we
describe is particularly harmful on big machines, where many nodes can
accumulate work at the same time, all adding to the global counter
nr_in_batch. As we accumulate more and more, we start asking the caches to
flush ever bigger numbers. The result is that the caches are depleted and
never stabilize. To achieve stable steady-state behavior, we need to tackle
this differently.
In this patch we keep the deferred count per node, in the new array
nr_deferred[] (the name is also a bit more descriptive), and never let it
accumulate onto other nodes.
[ v11: simplified numa awareness handling ]
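For a shrinker author, the visible changes are the new flags field
and the fact that registration can now fail. A minimal sketch
(my_count and my_scan are assumed callbacks; the count_objects and
scan_objects hooks come from an earlier patch in this series):

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE,
	};

	/* register_shrinker() now allocates nr_deferred[] and
	 * can return -ENOMEM */
	if (register_shrinker(&my_shrinker))
		goto out_error;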
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Cc: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
include/linux/shrinker.h | 14 ++-
mm/vmscan.c | 241 +++++++++++++++++++++++++++--------------------
2 files changed, 152 insertions(+), 103 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76f520c..8f80f24 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -19,6 +19,8 @@ struct shrink_control {
/* shrink from these nodes */
nodemask_t nodes_to_scan;
+ /* current node being shrunk (for NUMA aware shrinkers) */
+ int nid;
};
#define SHRINK_STOP (~0UL)
@@ -44,6 +46,8 @@ struct shrink_control {
* due to potential deadlocks. If SHRINK_STOP is returned, then no further
* attempts to call the @scan_objects will be made from the current reclaim
* context.
+ *
+ * @flags determines the shrinker's abilities, like NUMA awareness
*/
struct shrinker {
int (*shrink)(struct shrinker *, struct shrink_control *sc);
@@ -54,12 +58,18 @@ struct shrinker {
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
+ unsigned long flags;
/* These are for internal use */
struct list_head list;
- atomic_long_t nr_in_batch; /* objs pending delete */
+ /* objs pending delete, per node */
+ atomic_long_t *nr_deferred;
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE (1 << 0)
+
+extern int register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f39cae0..22ac8de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
}
/*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
*/
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
{
- atomic_long_set(&shrinker->nr_in_batch, 0);
+ size_t size = sizeof(*shrinker->nr_deferred);
+
+ /*
+ * If we only have one possible node in the system anyway, save
+ * ourselves the trouble and disable NUMA aware behavior. This way we
+ * will save memory and some small loop time later.
+ */
+ if (nr_node_ids == 1)
+ shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+ if (shrinker->flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ return -ENOMEM;
+
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
+ return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
}
#define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long max_pass;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+ if (shrinker->count_objects)
+ max_pass = shrinker->count_objects(shrinker, shrinkctl);
+ else
+ max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ if (max_pass == 0)
+ return 0;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
+ }
+
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimated number of
+ * freeable entries.
+ */
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
+
+ while (total_scan >= batch_size) {
+
+ if (shrinker->scan_objects) {
+ unsigned long ret;
+ shrinkctl->nr_to_scan = batch_size;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+ } else {
+ int nr_before;
+ long ret;
+
+ nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ ret = do_shrinker_shrink(shrinker, shrinkctl,
+ batch_size);
+ if (ret == -1)
+ break;
+ if (ret < nr_before)
+ freed += nr_before - ret;
+ }
+
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
+
+ cond_resched();
+ }
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_deferred[nid]);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+ trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ return freed;
+}
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- unsigned long long delta;
- long total_scan;
- long max_pass;
- long nr;
- long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
-
- if (shrinker->count_objects)
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- else
- max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
- if (max_pass == 0)
- continue;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
- total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
- do_div(delta, lru_pages + 1);
- total_scan += delta;
- if (total_scan < 0) {
- printk(KERN_ERR
- "shrink_slab: %pF negative objects to delete nr=%ld\n",
- shrinker->shrink, total_scan);
- total_scan = max_pass;
- }
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
-
- trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
-
- while (total_scan >= batch_size) {
-
- if (shrinker->scan_objects) {
- unsigned long ret;
- shrinkctl->nr_to_scan = batch_size;
- ret = shrinker->scan_objects(shrinker, shrinkctl);
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (!node_online(shrinkctl->nid))
+ continue;
- if (ret == SHRINK_STOP)
- break;
- freed += ret;
- } else {
- int nr_before;
- long ret;
-
- nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
- ret = do_shrinker_shrink(shrinker, shrinkctl,
- batch_size);
- if (ret == -1)
- break;
- if (ret < nr_before)
- freed += nr_before - ret;
- }
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+ (shrinkctl->nid != 0))
+ break;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
- cond_resched();
}
-
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
- */
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
- &shrinker->nr_in_batch);
- else
- new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
- trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 15/25] fs: convert inode and dentry shrinking to be node aware
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Now that the shrinker is passing a node in the scan control
structure, we can pass this to the generic LRU list code to
isolate reclaim to the lists on matching nodes.
v7: refactoring of the LRU list API in a separate patch
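The resulting flow, sketched from the super.c hunks below, is that
the node id carried in the shrink_control is threaded straight down
to the per-node LRU primitives:

	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
	/* proportion the scan between the caches, then prune each */
	freed = prune_dcache_sb(sb, dentries, sc->nid);
	freed += prune_icache_sb(sb, inodes, sc->nid);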
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
fs/dcache.c | 8 +++++---
fs/inode.c | 7 ++++---
fs/internal.h | 6 ++++--
fs/super.c | 23 ++++++++++++++---------
fs/xfs/xfs_super.c | 6 ++++--
include/linux/fs.h | 4 ++--
6 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 00722b3..d3feea1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -891,6 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* prune_dcache_sb - shrink the dcache
* @sb: superblock
* @nr_to_scan : number of entries to try to free
+ * @nid: which node to scan for freeable entities
*
* Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
* done when we need more memory and called from the superblock shrinker
@@ -899,13 +900,14 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* This function may fail to free any resources if all the dentries are in
* use.
*/
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+ int nid)
{
LIST_HEAD(dispose);
long freed;
- freed = list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate,
- &dispose, nr_to_scan);
+ freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
+ &dispose, &nr_to_scan);
shrink_dentry_list(&dispose);
return freed;
}
diff --git a/fs/inode.c b/fs/inode.c
index 5d85521..00b804e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -746,13 +746,14 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* to trim from the LRU. Inodes to be freed are moved to a temporary list and
* then are freed outside inode_lock by dispose_list().
*/
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan)
+long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
+ int nid)
{
LIST_HEAD(freeable);
long freed;
- freed = list_lru_walk(&sb->s_inode_lru, inode_lru_isolate,
- &freeable, nr_to_scan);
+ freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
+ &freeable, &nr_to_scan);
dispose_list(&freeable);
return freed;
}
diff --git a/fs/internal.h b/fs/internal.h
index ea43c89..8902d56 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,7 +110,8 @@ extern int open_check_o_direct(struct file *f);
* inode.c
*/
extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan);
+extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
+ int nid);
extern void inode_add_lru(struct inode *inode);
/*
@@ -126,7 +127,8 @@ extern int invalidate_inodes(struct super_block *, bool);
* dcache.c
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan);
+extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+ int nid);
/*
* read_write.c
diff --git a/fs/super.c b/fs/super.c
index 7fe934d..85a6104 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -75,10 +75,10 @@ static long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
return SHRINK_STOP;
if (sb->s_op && sb->s_op->nr_cached_objects)
- fs_objects = sb->s_op->nr_cached_objects(sb);
+ fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
- inodes = list_lru_count(&sb->s_inode_lru);
- dentries = list_lru_count(&sb->s_dentry_lru);
+ inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
+ dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
total_objects = dentries + inodes + fs_objects + 1;
/* proportion the scan between the caches */
@@ -89,13 +89,14 @@ static long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
* prune the dcache first as the icache is pinned by it, then
* prune the icache, followed by the filesystem specific caches
*/
- freed = prune_dcache_sb(sb, dentries);
- freed += prune_icache_sb(sb, inodes);
+ freed = prune_dcache_sb(sb, dentries, sc->nid);
+ freed += prune_icache_sb(sb, inodes, sc->nid);
if (fs_objects) {
fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
total_objects);
- freed += sb->s_op->free_cached_objects(sb, fs_objects);
+ freed += sb->s_op->free_cached_objects(sb, fs_objects,
+ sc->nid);
}
drop_super(sb);
@@ -113,10 +114,13 @@ static long super_cache_count(struct shrinker *shrink, struct shrink_control *sc
return 0;
if (sb->s_op && sb->s_op->nr_cached_objects)
- total_objects = sb->s_op->nr_cached_objects(sb);
+ total_objects = sb->s_op->nr_cached_objects(sb,
+ sc->nid);
- total_objects += list_lru_count(&sb->s_dentry_lru);
- total_objects += list_lru_count(&sb->s_inode_lru);
+ total_objects += list_lru_count_node(&sb->s_dentry_lru,
+ sc->nid);
+ total_objects += list_lru_count_node(&sb->s_inode_lru,
+ sc->nid);
total_objects = vfs_pressure_ratio(total_objects);
drop_super(sb);
@@ -232,6 +236,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
+ s->s_shrink.flags = SHRINKER_NUMA_AWARE;
}
out:
return s;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 443a8bc..ec2b267 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1536,7 +1536,8 @@ xfs_fs_mount(
static long
xfs_fs_nr_cached_objects(
- struct super_block *sb)
+ struct super_block *sb,
+ int nid)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
@@ -1544,7 +1545,8 @@ xfs_fs_nr_cached_objects(
static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- long nr_to_scan)
+ long nr_to_scan,
+ int nid)
{
return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 976258f..610955f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1610,8 +1610,8 @@ struct super_operations {
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
- long (*nr_cached_objects)(struct super_block *);
- long (*free_cached_objects)(struct super_block *, long);
+ long (*nr_cached_objects)(struct super_block *, int);
+ long (*free_cached_objects)(struct super_block *, long, int);
};
/*
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 16/25] xfs: convert buftarg LRU to generic code
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Glauber Costa, Dave Chinner
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Convert the buftarg LRU to use the new generic LRU list and take
advantage of the functionality it supplies to make the buffer cache
shrinker node-aware.
* v7: Add NUMA aware flag
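One detail worth calling out (taken from the hunks below, not new
behavior added here): list_lru_add() and list_lru_del() return true
only when the item actually changed lists, which lets the buffer
hold count be managed without a private LRU lock:

	if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
		bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
		atomic_inc(&bp->b_hold);
	}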
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
fs/xfs/xfs_buf.c | 170 ++++++++++++++++++++++++++-----------------------------
fs/xfs/xfs_buf.h | 5 +-
2 files changed, 82 insertions(+), 93 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1b2472a..b19b8a4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -85,20 +85,14 @@ xfs_buf_vmap_len(
* The LRU takes a new reference to the buffer so that it will only be freed
* once the shrinker takes the buffer off the LRU.
*/
-STATIC void
+static void
xfs_buf_lru_add(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
-
- spin_lock(&btp->bt_lru_lock);
- if (list_empty(&bp->b_lru)) {
- atomic_inc(&bp->b_hold);
- list_add_tail(&bp->b_lru, &btp->bt_lru);
- btp->bt_lru_nr++;
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
+ atomic_inc(&bp->b_hold);
}
- spin_unlock(&btp->bt_lru_lock);
}
/*
@@ -107,24 +101,13 @@ xfs_buf_lru_add(
* The unlocked check is safe here because it only occurs when there are no
* b_lru_ref counts left on the inode under the pag->pag_buf_lock. It is there
* to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
+ * xfs_buf_free().
*/
-STATIC void
+static void
xfs_buf_lru_del(
struct xfs_buf *bp)
{
- struct xfs_buftarg *btp = bp->b_target;
-
- if (list_empty(&bp->b_lru))
- return;
-
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- }
- spin_unlock(&btp->bt_lru_lock);
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
}
/*
@@ -151,18 +134,10 @@ xfs_buf_stale(
bp->b_flags &= ~_XBF_DELWRI_Q;
atomic_set(&(bp)->b_lru_ref, 0);
- if (!list_empty(&bp->b_lru)) {
- struct xfs_buftarg *btp = bp->b_target;
-
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru) &&
- !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- atomic_dec(&bp->b_hold);
- }
- spin_unlock(&btp->bt_lru_lock);
- }
+ if (!(bp->b_lru_flags & _XBF_LRU_DISPOSE) &&
+ (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ atomic_dec(&bp->b_hold);
+
ASSERT(atomic_read(&bp->b_hold) >= 1);
}
@@ -1501,83 +1476,97 @@ xfs_buf_iomove(
* returned. These buffers will have an elevated hold count, so wait on those
* while freeing all the buffers only held by the LRU.
*/
-void
-xfs_wait_buftarg(
- struct xfs_buftarg *btp)
+static enum lru_status
+xfs_buftarg_wait_rele(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+
{
- struct xfs_buf *bp;
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
-restart:
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
- if (atomic_read(&bp->b_hold) > 1) {
- trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
- list_move_tail(&bp->b_lru, &btp->bt_lru);
- spin_unlock(&btp->bt_lru_lock);
- delay(100);
- goto restart;
- }
+ if (atomic_read(&bp->b_hold) > 1) {
+ /* need to wait */
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ spin_unlock(lru_lock);
+ delay(100);
+ } else {
/*
* clear the LRU reference count so the buffer doesn't get
* ignored in xfs_buf_rele().
*/
atomic_set(&bp->b_lru_ref, 0);
- spin_unlock(&btp->bt_lru_lock);
+ spin_unlock(lru_lock);
xfs_buf_rele(bp);
- spin_lock(&btp->bt_lru_lock);
}
- spin_unlock(&btp->bt_lru_lock);
+
+ spin_lock(lru_lock);
+ return LRU_RETRY;
}
-int
-xfs_buftarg_shrink(
+void
+xfs_wait_buftarg(
+ struct xfs_buftarg *btp)
+{
+ while (list_lru_count(&btp->bt_lru))
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ NULL, LONG_MAX);
+}
+
+static enum lru_status
+xfs_buftarg_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0))
+ return LRU_ROTATE;
+
+ bp->b_lru_flags |= _XBF_LRU_DISPOSE;
+ list_move(item, dispose);
+ return LRU_REMOVED;
+}
+
+static long
+xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
- struct xfs_buf *bp;
- int nr_to_scan = sc->nr_to_scan;
LIST_HEAD(dispose);
+ long freed;
+ unsigned long nr_to_scan = sc->nr_to_scan;
- if (!nr_to_scan)
- return btp->bt_lru_nr;
-
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- if (nr_to_scan-- <= 0)
- break;
-
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
- /*
- * Decrement the b_lru_ref count unless the value is already
- * zero. If the value is already zero, we need to reclaim the
- * buffer, otherwise it gets another trip through the LRU.
- */
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- list_move_tail(&bp->b_lru, &btp->bt_lru);
- continue;
- }
-
- /*
- * remove the buffer from the LRU now to avoid needing another
- * lock round trip inside xfs_buf_rele().
- */
- list_move(&bp->b_lru, &dispose);
- btp->bt_lru_nr--;
- bp->b_lru_flags |= _XBF_LRU_DISPOSE;
- }
- spin_unlock(&btp->bt_lru_lock);
+ freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+ &dispose, &nr_to_scan);
while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}
- return btp->bt_lru_nr;
+ return freed;
+}
+
+static long
+xfs_buftarg_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ return list_lru_count_node(&btp->bt_lru, sc->nid);
}
void
@@ -1659,12 +1648,13 @@ xfs_alloc_buftarg(
if (!btp->bt_bdi)
goto error;
- INIT_LIST_HEAD(&btp->bt_lru);
- spin_lock_init(&btp->bt_lru_lock);
+ list_lru_init(&btp->bt_lru);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+ btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&btp->bt_shrinker);
return btp;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 433a12e..5ec7d35 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
+#include <linux/list_lru.h>
/*
* Base types
@@ -92,9 +93,7 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
- struct list_head bt_lru;
- spinlock_t bt_lru_lock;
- unsigned int bt_lru_nr;
+ struct list_lru bt_lru;
} xfs_buftarg_t;
struct xfs_buf;
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 17/25] xfs: rework buffer dispose list tracking
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
In converting the buffer lru lists to use the generic code, the
locking for marking the buffers as on the dispose list was lost.
This results in confusion in LRU buffer tracking and accounting,
leading to reference counts being mucked up and the filesystem
becoming unmountable.
To fix this, introduce an internal buffer spinlock to protect the
state field that holds the dispose list information. Because there
is now locking needed around xfs_buf_lru_add/del, and they are used
in exactly one place each two lines apart, get rid of the wrappers
and code the logic directly in place.
Further, the LRU emptying code used on unmount is less than optimal.
Convert it to use a dispose list as per a normal shrinker walk, and
repeat the walk that fills the dispose list until the LRU is empty.
This avoids needing to drop and regain the LRU lock for every item
being freed, and allows the same logic as the shrinker isolate call
to be used. Simpler, easier to understand.
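In outline, the isolate pattern this introduces looks like the sketch
below. This is an illustration only -- "isolate_cb" is a placeholder
name, and the real callbacks are in the diff that follows:

/*
 * The walk holds the LRU lock, while xfs_buf_rele() takes bp->b_lock
 * before touching the LRU, so the callback must trylock bp->b_lock to
 * avoid inverting the lock order.
 */
static enum lru_status
isolate_cb(
	struct list_head	*item,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;	/* try again on a later walk */

	/* mark the buffer as ours and move it to the private dispose list */
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_move(item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}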
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
fs/xfs/xfs_buf.c | 125 +++++++++++++++++++++++++++++++------------------------
fs/xfs/xfs_buf.h | 12 ++++--
2 files changed, 79 insertions(+), 58 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b19b8a4..c3f8ea9 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,37 +80,6 @@ xfs_buf_vmap_len(
}
/*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-static void
-xfs_buf_lru_add(
- struct xfs_buf *bp)
-{
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
- bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
- atomic_inc(&bp->b_hold);
- }
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free().
- */
-static void
-xfs_buf_lru_del(
- struct xfs_buf *bp)
-{
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-}
-
-/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -133,12 +102,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- atomic_set(&(bp)->b_lru_ref, 0);
- if (!(bp->b_lru_flags & _XBF_LRU_DISPOSE) &&
+ spin_lock(&bp->b_lock);
+ atomic_set(&bp->b_lru_ref, 0);
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
atomic_dec(&bp->b_hold);
ASSERT(atomic_read(&bp->b_hold) >= 1);
+ spin_unlock(&bp->b_lock);
}
static int
@@ -202,6 +173,7 @@ _xfs_buf_alloc(
INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
+ spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
bp->b_target = target;
bp->b_flags = flags;
@@ -891,12 +863,33 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- if (!(bp->b_flags & XBF_STALE) &&
- atomic_read(&bp->b_lru_ref)) {
- xfs_buf_lru_add(bp);
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new
+ * reference to the buffer for the LRU and clear the
+ * (now stale) dispose list state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&bp->b_lock);
spin_unlock(&pag->pag_buf_lock);
} else {
- xfs_buf_lru_del(bp);
+ /*
+ * most of the time buffers will already be removed from
+ * the LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the
+ * buffer was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+ spin_unlock(&bp->b_lock);
+
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
@@ -1484,33 +1477,48 @@ xfs_buftarg_wait_rele(
{
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
if (atomic_read(&bp->b_hold) > 1) {
- /* need to wait */
+ /* need to wait, so skip it this pass */
trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
- spin_unlock(lru_lock);
- delay(100);
- } else {
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- spin_unlock(lru_lock);
- xfs_buf_rele(bp);
+ return LRU_SKIP;
}
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
- spin_lock(lru_lock);
- return LRU_RETRY;
+ /*
+ * clear the LRU reference count so the buffer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
}
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
- while (list_lru_count(&btp->bt_lru))
+ LIST_HEAD(dispose);
+ int loop = 0;
+
+ /* loop until there is nothing left on the lru list. */
+ while (list_lru_count(&btp->bt_lru)) {
list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
- NULL, LONG_MAX);
+ &dispose, LONG_MAX);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
+ }
+ if (loop++ != 0)
+ delay(100);
+ }
}
static enum lru_status
@@ -1523,15 +1531,24 @@ xfs_buftarg_isolate(
struct list_head *dispose = arg;
/*
+ * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ /*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0))
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ spin_unlock(&bp->b_lock);
return LRU_ROTATE;
+ }
- bp->b_lru_flags |= _XBF_LRU_DISPOSE;
+ bp->b_state |= XFS_BSTATE_DISPOSE;
list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
return LRU_REMOVED;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5ec7d35..e656833 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -60,7 +60,6 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
-#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
typedef unsigned int xfs_buf_flags_t;
@@ -79,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }, \
- { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
+ { _XBF_COMPOUND, "COMPOUND" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
typedef struct xfs_buftarg {
dev_t bt_dev;
@@ -136,7 +139,8 @@ typedef struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
+ spinlock_t b_lock; /* internal state lock */
+ unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 18/25] xfs: convert dquot cache lru to list_lru
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Convert the XFS dquot lru to use the list_lru construct and make
the shrinker node aware.
* v7: Add NUMA aware flag
[ glommer: edited for conflicts + warning fixes ]
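For reference, the node-aware count/scan shape that this conversion
follows is sketched below. The names (example_lru, example_isolate and
friends) are illustrative placeholders; the real callbacks are
xfs_qm_shrink_count/xfs_qm_shrink_scan in the diff:

static long
example_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	/* cheap count of reclaimable objects on the node being shrunk */
	return list_lru_count_node(&example_lru, sc->nid);
}

static long
example_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	LIST_HEAD(dispose);
	unsigned long		nr_to_scan = sc->nr_to_scan;
	long			freed;

	/* isolate up to nr_to_scan objects from this node's list */
	freed = list_lru_walk_node(&example_lru, sc->nid, example_isolate,
				   &dispose, &nr_to_scan);
	/* ... drain and free everything on the dispose list ... */
	return freed;
}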
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
fs/xfs/xfs_dquot.c | 7 +-
fs/xfs/xfs_qm.c | 277 +++++++++++++++++++++++++++--------------------------
fs/xfs/xfs_qm.h | 4 +-
3 files changed, 144 insertions(+), 144 deletions(-)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a..a2c5672 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -939,13 +939,8 @@ xfs_qm_dqput_final(
trace_xfs_dqput_free(dqp);
- mutex_lock(&qi->qi_lru_lock);
- if (list_empty(&dqp->q_lru)) {
- list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
- qi->qi_lru_count++;
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(xs_qm_dquot_unused);
- }
- mutex_unlock(&qi->qi_lru_lock);
/*
* If we just added a udquot to the freelist, then we want to release
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f10506b..bd6c12a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -51,8 +51,9 @@
*/
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
+
+STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
* currently is the only interface into the radix tree code that allows
@@ -197,12 +198,9 @@ xfs_qm_dqpurge(
* We move dquots to the freelist as soon as their reference count
* hits zero, so it really should be on the freelist here.
*/
- mutex_lock(&qi->qi_lru_lock);
ASSERT(!list_empty(&dqp->q_lru));
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
+ list_lru_del(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(xs_qm_dquot_unused);
- mutex_unlock(&qi->qi_lru_lock);
xfs_qm_dqdestroy(dqp);
@@ -632,6 +630,141 @@ xfs_qm_calc_dquots_per_chunk(
return ndquots;
}
+struct xfs_qm_isolate {
+ struct list_head buffers;
+ struct list_head dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_dquot *dqp = container_of(item,
+ struct xfs_dquot, q_lru);
+ struct xfs_qm_isolate *isol = arg;
+
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_miss_busy;
+
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
+ XFS_STATS_INC(xs_qm_dqwants);
+
+ trace_xfs_dqreclaim_want(dqp);
+ list_del_init(&dqp->q_lru);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ return 0;
+ }
+
+ /*
+ * If the dquot is dirty, flush it. If it's already being flushed, just
+ * skip it so there is time for the IO to complete before we try to
+ * reclaim it again on the next LRU pass.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ xfs_dqunlock(dqp);
+ goto out_miss_busy;
+ }
+
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ trace_xfs_dqreclaim_dirty(dqp);
+
+ /* we have to drop the LRU lock to flush the dquot */
+ spin_unlock(lru_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+ __func__, dqp);
+ goto out_unlock_dirty;
+ }
+
+ xfs_buf_delwri_queue(bp, &isol->buffers);
+ xfs_buf_relse(bp);
+ goto out_unlock_dirty;
+ }
+ xfs_dqfunlock(dqp);
+
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
+
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_lru, &isol->dispose);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaims);
+ return 0;
+
+out_miss_busy:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ return 2;
+
+out_unlock_dirty:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ return 3;
+}
+
+static long
+xfs_qm_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+ struct xfs_qm_isolate isol;
+ long freed;
+ int error;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ return 0;
+
+ INIT_LIST_HEAD(&isol.buffers);
+ INIT_LIST_HEAD(&isol.dispose);
+
+ freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
+ &nr_to_scan);
+
+ error = xfs_buf_delwri_submit(&isol.buffers);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+ while (!list_empty(&isol.dispose)) {
+ struct xfs_dquot *dqp;
+
+ dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
+ }
+
+ return freed;
+}
+
+static long
+xfs_qm_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+
+ return list_lru_count_node(&qi->qi_lru, sc->nid);
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -662,9 +795,7 @@ xfs_qm_init_quotainfo(
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
- INIT_LIST_HEAD(&qinf->qi_lru_list);
- qinf->qi_lru_count = 0;
- mutex_init(&qinf->qi_lru_lock);
+ list_lru_init(&qinf->qi_lru);
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
@@ -730,8 +861,10 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
- qinf->qi_shrinker.shrink = xfs_qm_shake;
+ qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&qinf->qi_shrinker);
return 0;
}
@@ -1482,132 +1615,6 @@ xfs_qm_dqfree_one(
xfs_qm_dqdestroy(dqp);
}
-STATIC void
-xfs_qm_dqreclaim_one(
- struct xfs_dquot *dqp,
- struct list_head *buffer_list,
- struct list_head *dispose_list)
-{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- int error;
-
- if (!xfs_dqlock_nowait(dqp))
- goto out_move_tail;
-
- /*
- * This dquot has acquired a reference in the meantime remove it from
- * the freelist and try again.
- */
- if (dqp->q_nrefs) {
- xfs_dqunlock(dqp);
-
- trace_xfs_dqreclaim_want(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
-
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
- return;
- }
-
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto out_unlock_move_tail;
-
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- goto out_unlock_move_tail;
- }
-
- xfs_buf_delwri_queue(bp, buffer_list);
- xfs_buf_relse(bp);
- /*
- * Give the dquot another try on the freelist, as the
- * flushing will take some time.
- */
- goto out_unlock_move_tail;
- }
- xfs_dqfunlock(dqp);
-
- /*
- * Prevent lookups now that we are past the point of no return.
- */
- dqp->dq_flags |= XFS_DQ_FREEING;
- xfs_dqunlock(dqp);
-
- ASSERT(dqp->q_nrefs == 0);
- list_move_tail(&dqp->q_lru, dispose_list);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
-
- trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
- return;
-
- /*
- * Move the dquot to the tail of the list so that we don't spin on it.
- */
-out_unlock_move_tail:
- xfs_dqunlock(dqp);
-out_move_tail:
- list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct xfs_quotainfo *qi =
- container_of(shrink, struct xfs_quotainfo, qi_shrinker);
- int nr_to_scan = sc->nr_to_scan;
- LIST_HEAD (buffer_list);
- LIST_HEAD (dispose_list);
- struct xfs_dquot *dqp;
- int error;
-
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
- return 0;
- if (!nr_to_scan)
- goto out;
-
- mutex_lock(&qi->qi_lru_lock);
- while (!list_empty(&qi->qi_lru_list)) {
- if (nr_to_scan-- <= 0)
- break;
- dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
- q_lru);
- xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
- }
- mutex_unlock(&qi->qi_lru_lock);
-
- error = xfs_buf_delwri_submit(&buffer_list);
- if (error)
- xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
- while (!list_empty(&dispose_list)) {
- dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
- list_del_init(&dqp->q_lru);
- xfs_qm_dqfree_one(dqp);
- }
-
-out:
- return vfs_pressure_ratio(qi->qi_lru_count);
-}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e..8173b5e 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -47,9 +47,7 @@ typedef struct xfs_quotainfo {
struct mutex qi_tree_lock;
xfs_inode_t *qi_uquotaip; /* user quota inode */
xfs_inode_t *qi_gquotaip; /* group quota inode */
- struct list_head qi_lru_list;
- struct mutex qi_lru_lock;
- int qi_lru_count;
+ struct list_lru qi_lru;
int qi_dquots;
time_t qi_btimelimit; /* limit for blks timer */
time_t qi_itimelimit; /* limit for inodes timer */
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 21/25] i915: bail out earlier when shrinker cannot acquire mutex
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Glauber Costa, Dave Chinner,
Kent Overstreet
The main shrinker driver will keep trying for a while to free objects if
the returned value from the shrink scan procedure is 0. That means "no
objects now", but a retry could very well succeed.
But what we should say here is a different thing: that it is impossible
to shrink, and we had better bail out soon. We find this behavior more
appropriate for the case where the lock cannot be taken, especially
given the hammer behavior of the i915: if another thread is already
shrinking, we are unlikely to be able to shrink anything anyway when we
finally acquire the mutex.
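The difference is easiest to see in a scan callback sketch (illustrative
names only; the i915 hunk below makes exactly this substitution):

static long
example_scan(struct shrinker *shrinker, struct shrink_control *sc)
{
	long freed = 0;

	if (!mutex_trylock(&example_lock))
		return SHRINK_STOP;	/* cannot make progress; do not retry */

	/* ... free up to sc->nr_to_scan objects, counting them in freed ... */

	mutex_unlock(&example_lock);
	return freed;			/* a bare 0 would just invite a retry */
}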
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Acked-by: Daniel Vetter <daniel.vetter-/w4YWyX8dFk@public.gmane.org>
CC: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
CC: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
CC: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
drivers/gpu/drm/i915/i915_gem.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 22a0556..101504f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4497,10 +4497,10 @@ i915_gem_inactive_count(struct shrinker *shrinker, struct shrink_control *sc)
if (!mutex_trylock(&dev->struct_mutex)) {
if (!mutex_is_locked_by(&dev->struct_mutex, current))
- return 0;
+ return SHRINK_STOP;
if (dev_priv->mm.shrinker_no_lock_stealing)
- return 0;
+ return SHRINK_STOP;
unlock = false;
}
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 23/25] hugepage: convert huge zero page shrinker to new shrinker API
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Glauber Costa, Dave Chinner
It consists of:
* returning long instead of int
* separating count from scan
* returning the number of freed entities in scan
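In skeleton form the conversion is the one below; the two helpers are
hypothetical stand-ins for cache-specific code, and the concrete version
for the huge zero page follows in the diff:

static long
example_count(struct shrinker *shrink, struct shrink_control *sc)
{
	/* report how many objects could be freed; never free here */
	return count_of_freeable_objects();
}

static long
example_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	/* return the number of objects actually freed */
	return free_up_to(sc->nr_to_scan);
}

static struct shrinker example_shrinker = {
	.count_objects	= example_count,
	.scan_objects	= example_scan,
	.seeks		= DEFAULT_SEEKS,
};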
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Reviewed-by: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
CC: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
mm/huge_memory.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 243e710..8dc36f5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -211,24 +211,29 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static int shrink_huge_zero_page(struct shrinker *shrink,
- struct shrink_control *sc)
+static long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- if (!sc->nr_to_scan)
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+static long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
__free_page(zero_page);
+ return HPAGE_PMD_NR;
}
return 0;
}
static struct shrinker huge_zero_page_shrinker = {
- .shrink = shrink_huge_zero_page,
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
.seeks = DEFAULT_SEEKS,
};
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
* [PATCH v11 24/25] shrinker: Kill old ->shrink API.
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Dave Chinner, Glauber Costa
From: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
There are no more users of this API, so kill it dead, dead, dead and
quietly bury the corpse in a shallow, unmarked grave in a dark
forest deep in the hills...
[ glommer: added flowers to the grave ]
Signed-off-by: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Reviewed-by: Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Acked-by: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
include/linux/shrinker.h | 15 +++++----------
include/trace/events/vmscan.h | 4 ++--
mm/vmscan.c | 40 ++++++++--------------------------------
3 files changed, 15 insertions(+), 44 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 8f80f24..68c0970 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -7,14 +7,15 @@
*
* The 'gfpmask' refers to the allocation we are currently trying to
* fulfil.
- *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
*/
struct shrink_control {
gfp_t gfp_mask;
- /* How many slab objects shrinker() should scan and try to reclaim */
+ /*
+ * How many objects scan_objects should scan and try to reclaim.
+ * This is reset before every call, so it is safe for callees
+ * to modify.
+ */
unsigned long nr_to_scan;
/* shrink from these nodes */
@@ -27,11 +28,6 @@ struct shrink_control {
/*
* A callback you can register to apply pressure to ageable caches.
*
- * @shrink() should look through the least-recently-used 'nr_to_scan' entries
- * and attempt to free them up. It should return the number of objects which
- * remain in the cache. If it returns -1, it means it cannot do any scanning at
- * this time (eg. there is a risk of deadlock).
- *
* @count_objects should return the number of freeable items in the cache. If
* there are no objects to free or the number of freeable items cannot be
* determined, it should return 0. No deadlock checks should be done during the
@@ -50,7 +46,6 @@ struct shrink_control {
* @flags determine the shrinker abilities, like numa awareness
*/
struct shrinker {
- int (*shrink)(struct shrinker *, struct shrink_control *sc);
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 63cfccc..132a985 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -202,7 +202,7 @@ TRACE_EVENT(mm_shrink_slab_start,
TP_fast_assign(
__entry->shr = shr;
- __entry->shrink = shr->shrink;
+ __entry->shrink = shr->scan_objects;
__entry->nr_objects_to_shrink = nr_objects_to_shrink;
__entry->gfp_flags = sc->gfp_mask;
__entry->pgs_scanned = pgs_scanned;
@@ -241,7 +241,7 @@ TRACE_EVENT(mm_shrink_slab_end,
TP_fast_assign(
__entry->shr = shr;
- __entry->shrink = shr->shrink;
+ __entry->shrink = shr->scan_objects;
__entry->unused_scan = unused_scan_cnt;
__entry->new_scan = new_scan_cnt;
__entry->retval = shrinker_retval;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 22ac8de..fe73724 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,14 +194,6 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
- struct shrink_control *sc,
- unsigned long nr_to_scan)
-{
- sc->nr_to_scan = nr_to_scan;
- return (*shrinker->shrink)(shrinker, sc);
-}
-
#define SHRINK_BATCH 128
static unsigned long
@@ -218,10 +210,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- if (shrinker->count_objects)
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- else
- max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ max_pass = shrinker->count_objects(shrinker, shrinkctl);
if (max_pass == 0)
return 0;
@@ -240,7 +229,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
- shrinker->shrink, total_scan);
+ shrinker->scan_objects, total_scan);
total_scan = max_pass;
}
@@ -273,26 +262,13 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
while (total_scan >= batch_size) {
- if (shrinker->scan_objects) {
- unsigned long ret;
- shrinkctl->nr_to_scan = batch_size;
- ret = shrinker->scan_objects(shrinker, shrinkctl);
-
- if (ret == SHRINK_STOP)
- break;
- freed += ret;
- } else {
- int nr_before;
- long ret;
+ unsigned long ret;
+ shrinkctl->nr_to_scan = batch_size;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
- nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
- ret = do_shrinker_shrink(shrinker, shrinkctl,
- batch_size);
- if (ret == -1)
- break;
- if (ret < nr_before)
- freed += nr_before - ret;
- }
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
count_vm_events(SLABS_SCANNED, batch_size);
total_scan -= batch_size;
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
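With the old entry point gone, every shrinker must supply the pair. As a concrete illustration, a minimal shrinker for a hypothetical object cache could look like the sketch below; my_cache, its fields, and evict_one() are invented for the example, while the struct shrinker fields, SHRINK_STOP, and DEFAULT_SEEKS are the interface this patch leaves in place:
#include <linux/list.h>
#include <linux/shrinker.h>
#include <linux/spinlock.h>

static struct {
	spinlock_t lock;
	struct list_head lru;
	unsigned long nr_freeable;
} my_cache = {
	.lock	= __SPIN_LOCK_UNLOCKED(my_cache.lock),
	.lru	= LIST_HEAD_INIT(my_cache.lru),
};

/* Hypothetical helper: unlink and free one object from my_cache.lru. */
static unsigned long evict_one(void)
{
	return 1;
}

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* Cheap, lock-free estimate; 0 tells the VM to skip the scan. */
	return my_cache.nr_freeable;
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long freed = 0;

	if (!spin_trylock(&my_cache.lock))
		return SHRINK_STOP;	/* unsafe to scan now; back off */

	while (freed < sc->nr_to_scan && !list_empty(&my_cache.lru))
		freed += evict_one();

	spin_unlock(&my_cache.lock);
	return freed;			/* objects actually freed, not remaining */
}

static struct shrinker my_cache_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};
register_shrinker(&my_cache_shrinker) at init pairs with unregister_shrinker() at teardown, exactly as the in-tree shrinkers converted earlier in the series do.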
* [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 20:34 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-06 20:34 UTC (permalink / raw)
To: akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA, Glauber Costa, Dave Chinner
We currently use a compile-time constant to size the node array for the
list_lru structure. Due to this, we don't need to allocate any memory at
initialization time. But as a consequence, the structures that contain
embedded list_lru lists can become way too big (the superblock for
instance contains two of them).
This patch ameliorates the situation by dynamically allocating the node
arrays, sized by the firmware-provided nr_node_ids.
Signed-off-by: Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Cc: Dave Chinner <dchinner-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Cc: Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>
---
fs/super.c | 11 +++++++++--
fs/xfs/xfs_buf.c | 6 +++++-
fs/xfs/xfs_qm.c | 10 ++++++++--
include/linux/list_lru.h | 13 ++-----------
mm/list_lru.c | 14 +++++++++++++-
5 files changed, 37 insertions(+), 17 deletions(-)
diff --git a/fs/super.c b/fs/super.c
index 85a6104..1b6ef7b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
- list_lru_init(&s->s_dentry_lru);
- list_lru_init(&s->s_inode_lru);
+
+ if (list_lru_init(&s->s_dentry_lru))
+ goto err_out;
+ if (list_lru_init(&s->s_inode_lru))
+ goto err_out_dentry_lru;
+
INIT_LIST_HEAD(&s->s_mounts);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
}
out:
return s;
+
+err_out_dentry_lru:
+ list_lru_destroy(&s->s_dentry_lru);
err_out:
security_sb_free(s);
#ifdef CONFIG_SMP
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c3f8ea9..9c2b656 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1591,6 +1591,7 @@ xfs_free_buftarg(
struct xfs_mount *mp,
struct xfs_buftarg *btp)
{
+ list_lru_destroy(&btp->bt_lru);
unregister_shrinker(&btp->bt_shrinker);
if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1665,9 +1666,12 @@ xfs_alloc_buftarg(
if (!btp->bt_bdi)
goto error;
- list_lru_init(&btp->bt_lru);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
+
+ if (list_lru_init(&btp->bt_lru))
+ goto error;
+
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index bd6c12a..b840000 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -781,11 +781,18 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+ if ((error = list_lru_init(&qinf->qi_lru))) {
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
+ }
+
/*
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
if ((error = xfs_qm_init_quotainos(mp))) {
+ list_lru_destroy(&qinf->qi_lru);
kmem_free(qinf);
mp->m_quotainfo = NULL;
return error;
@@ -795,8 +802,6 @@ xfs_qm_init_quotainfo(
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
- list_lru_init(&qinf->qi_lru);
-
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
@@ -884,6 +889,7 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
+ list_lru_destroy(&qi->qi_lru);
unregister_shrinker(&qi->qi_shrinker);
if (qi->qi_uquotaip) {
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 2fe13e1..ff57503 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -27,20 +27,11 @@ struct list_lru_node {
} ____cacheline_aligned_in_smp;
struct list_lru {
- /*
- * Because we use a fixed-size array, this struct can be very big if
- * MAX_NUMNODES is big. If this becomes a problem this is fixable by
- * turning this into a pointer and dynamically allocating this to
- * nr_node_ids. This quantity is firwmare-provided, and still would
- * provide room for all nodes at the cost of a pointer lookup and an
- * extra allocation. Because that allocation will most likely come from
- * a different slab cache than the main structure holding this
- * structure, we may very well fail.
- */
- struct list_lru_node node[MAX_NUMNODES];
+ struct list_lru_node *node;
nodemask_t active_nodes;
};
+void list_lru_destroy(struct list_lru *lru);
int list_lru_init(struct list_lru *lru);
/**
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 2822817..700d322 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
+#include <linux/slab.h>
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
@@ -162,9 +163,14 @@ unsigned long list_lru_dispose_all(struct list_lru *lru,
int list_lru_init(struct list_lru *lru)
{
int i;
+ size_t size = sizeof(*lru->node) * nr_node_ids;
+
+ lru->node = kzalloc(size, GFP_KERNEL);
+ if (!lru->node)
+ return -ENOMEM;
nodes_clear(lru->active_nodes);
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
@@ -172,3 +178,9 @@ int list_lru_init(struct list_lru *lru)
return 0;
}
EXPORT_SYMBOL_GPL(list_lru_init);
+
+void list_lru_destroy(struct list_lru *lru)
+{
+ kfree(lru->node);
+}
+EXPORT_SYMBOL_GPL(list_lru_destroy);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 75+ messages in thread
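To put rough numbers on the commit message: struct list_lru_node is ____cacheline_aligned_in_smp, so every array slot costs at least a cache line. Assuming 64-byte cache lines and a distro-style CONFIG_NODES_SHIFT=10 (MAX_NUMNODES = 1024) -- both assumptions for illustration, not figures from the patch -- the arithmetic works out to:
/*
 * Back-of-envelope sizing under the assumptions stated above:
 *   embedded array:  1024 slots * 64 bytes  = 64 KiB per list_lru
 *   per superblock:  two embedded list_lrus ~ 128 KiB, unconditionally
 *   dynamic array:   nr_node_ids(2) * 64 B  = 128 bytes on a 2-node box
 */
The price of the dynamic scheme is one extra pointer dereference per access and an allocation that can fail, which is exactly why the callers above grow error handling.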
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-18 9:42 ` Li Zhong
-1 siblings, 0 replies; 75+ messages in thread
From: Li Zhong @ 2013-06-18 9:42 UTC (permalink / raw)
To: Glauber Costa
Cc: akpm, linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen, Dave Chinner
On Fri, 2013-06-07 at 00:34 +0400, Glauber Costa wrote:
> We currently use a compile-time constant to size the node array for the
> list_lru structure. Due to this, we don't need to allocate any memory at
> initialization time. But as a consequence, the structures that contain
> embedded list_lru lists can become way too big (the superblock for
> instance contains two of them).
>
> This patch ameliorates the situation by dynamically allocating the node
> arrays, sized by the firmware-provided nr_node_ids.
>
> Signed-off-by: Glauber Costa <glommer@openvz.org>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Mel Gorman <mgorman@suse.de>
> ---
> fs/super.c | 11 +++++++++--
> fs/xfs/xfs_buf.c | 6 +++++-
> fs/xfs/xfs_qm.c | 10 ++++++++--
> include/linux/list_lru.h | 13 ++-----------
> mm/list_lru.c | 14 +++++++++++++-
> 5 files changed, 37 insertions(+), 17 deletions(-)
>
> diff --git a/fs/super.c b/fs/super.c
> index 85a6104..1b6ef7b 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> INIT_HLIST_NODE(&s->s_instances);
> INIT_HLIST_BL_HEAD(&s->s_anon);
> INIT_LIST_HEAD(&s->s_inodes);
> - list_lru_init(&s->s_dentry_lru);
> - list_lru_init(&s->s_inode_lru);
> +
> + if (list_lru_init(&s->s_dentry_lru))
> + goto err_out;
> + if (list_lru_init(&s->s_inode_lru))
> + goto err_out_dentry_lru;
> +
> INIT_LIST_HEAD(&s->s_mounts);
> init_rwsem(&s->s_umount);
> lockdep_set_class(&s->s_umount, &type->s_umount_key);
> @@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> }
> out:
> return s;
> +
> +err_out_dentry_lru:
> + list_lru_destroy(&s->s_dentry_lru);
> err_out:
> security_sb_free(s);
> #ifdef CONFIG_SMP
It seems we also need to call list_lru_destroy() in destroy_super()?
like below:
-----------
diff --git a/fs/super.c b/fs/super.c
index b79e732..06ee3af 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -269,6 +269,8 @@ err_out:
*/
static inline void destroy_super(struct super_block *s)
{
+ list_lru_destroy(&s->s_inode_lru);
+ list_lru_destroy(&s->s_dentry_lru);
#ifdef CONFIG_SMP
free_percpu(s->s_files);
#endif
-----------
Thanks, Zhong
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index c3f8ea9..9c2b656 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1591,6 +1591,7 @@ xfs_free_buftarg(
> struct xfs_mount *mp,
> struct xfs_buftarg *btp)
> {
> + list_lru_destroy(&btp->bt_lru);
> unregister_shrinker(&btp->bt_shrinker);
>
> if (mp->m_flags & XFS_MOUNT_BARRIER)
> @@ -1665,9 +1666,12 @@ xfs_alloc_buftarg(
> if (!btp->bt_bdi)
> goto error;
>
> - list_lru_init(&btp->bt_lru);
> if (xfs_setsize_buftarg_early(btp, bdev))
> goto error;
> +
> + if (list_lru_init(&btp->bt_lru))
> + goto error;
> +
> btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
> btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
> btp->bt_shrinker.seeks = DEFAULT_SEEKS;
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index bd6c12a..b840000 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -781,11 +781,18 @@ xfs_qm_init_quotainfo(
>
> qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
>
> + if ((error = list_lru_init(&qinf->qi_lru))) {
> + kmem_free(qinf);
> + mp->m_quotainfo = NULL;
> + return error;
> + }
> +
> /*
> * See if quotainodes are setup, and if not, allocate them,
> * and change the superblock accordingly.
> */
> if ((error = xfs_qm_init_quotainos(mp))) {
> + list_lru_destroy(&qinf->qi_lru);
> kmem_free(qinf);
> mp->m_quotainfo = NULL;
> return error;
> @@ -795,8 +802,6 @@ xfs_qm_init_quotainfo(
> INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
> mutex_init(&qinf->qi_tree_lock);
>
> - list_lru_init(&qinf->qi_lru);
> -
> /* mutex used to serialize quotaoffs */
> mutex_init(&qinf->qi_quotaofflock);
>
> @@ -884,6 +889,7 @@ xfs_qm_destroy_quotainfo(
> qi = mp->m_quotainfo;
> ASSERT(qi != NULL);
>
> + list_lru_destroy(&qi->qi_lru);
> unregister_shrinker(&qi->qi_shrinker);
>
> if (qi->qi_uquotaip) {
> diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
> index 2fe13e1..ff57503 100644
> --- a/include/linux/list_lru.h
> +++ b/include/linux/list_lru.h
> @@ -27,20 +27,11 @@ struct list_lru_node {
> } ____cacheline_aligned_in_smp;
>
> struct list_lru {
> - /*
> - * Because we use a fixed-size array, this struct can be very big if
> - * MAX_NUMNODES is big. If this becomes a problem this is fixable by
> - * turning this into a pointer and dynamically allocating this to
> - * nr_node_ids. This quantity is firwmare-provided, and still would
> - * provide room for all nodes at the cost of a pointer lookup and an
> - * extra allocation. Because that allocation will most likely come from
> - * a different slab cache than the main structure holding this
> - * structure, we may very well fail.
> - */
> - struct list_lru_node node[MAX_NUMNODES];
> + struct list_lru_node *node;
> nodemask_t active_nodes;
> };
>
> +void list_lru_destroy(struct list_lru *lru);
> int list_lru_init(struct list_lru *lru);
>
> /**
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 2822817..700d322 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -8,6 +8,7 @@
> #include <linux/module.h>
> #include <linux/mm.h>
> #include <linux/list_lru.h>
> +#include <linux/slab.h>
>
> bool list_lru_add(struct list_lru *lru, struct list_head *item)
> {
> @@ -162,9 +163,14 @@ unsigned long list_lru_dispose_all(struct list_lru *lru,
> int list_lru_init(struct list_lru *lru)
> {
> int i;
> + size_t size = sizeof(*lru->node) * nr_node_ids;
> +
> + lru->node = kzalloc(size, GFP_KERNEL);
> + if (!lru->node)
> + return -ENOMEM;
>
> nodes_clear(lru->active_nodes);
> - for (i = 0; i < MAX_NUMNODES; i++) {
> + for (i = 0; i < nr_node_ids; i++) {
> spin_lock_init(&lru->node[i].lock);
> INIT_LIST_HEAD(&lru->node[i].list);
> lru->node[i].nr_items = 0;
> @@ -172,3 +178,9 @@ int list_lru_init(struct list_lru *lru)
> return 0;
> }
> EXPORT_SYMBOL_GPL(list_lru_init);
> +
> +void list_lru_destroy(struct list_lru *lru)
> +{
> + kfree(lru->node);
> +}
> +EXPORT_SYMBOL_GPL(list_lru_destroy);
^ permalink raw reply related [flat|nested] 75+ messages in thread
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-18 9:42 ` Li Zhong
@ 2013-06-19 7:31 ` Glauber Costa
2013-06-19 9:12 ` Li Zhong
-1 siblings, 1 reply; 75+ messages in thread
From: Glauber Costa @ 2013-06-19 7:31 UTC (permalink / raw)
To: Li Zhong
Cc: Glauber Costa, akpm, linux-fsdevel, mgorman, david, linux-mm,
cgroups, kamezawa.hiroyu, mhocko, hannes, hughd, gthelen,
Dave Chinner
On Tue, Jun 18, 2013 at 05:42:01PM +0800, Li Zhong wrote:
> On Fri, 2013-06-07 at 00:34 +0400, Glauber Costa wrote:
>
> > diff --git a/fs/super.c b/fs/super.c
> > index 85a6104..1b6ef7b 100644
> > --- a/fs/super.c
> > +++ b/fs/super.c
> > @@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > INIT_HLIST_NODE(&s->s_instances);
> > INIT_HLIST_BL_HEAD(&s->s_anon);
> > INIT_LIST_HEAD(&s->s_inodes);
> > - list_lru_init(&s->s_dentry_lru);
> > - list_lru_init(&s->s_inode_lru);
> > +
> > + if (list_lru_init(&s->s_dentry_lru))
> > + goto err_out;
> > + if (list_lru_init(&s->s_inode_lru))
> > + goto err_out_dentry_lru;
> > +
> > INIT_LIST_HEAD(&s->s_mounts);
> > init_rwsem(&s->s_umount);
> > lockdep_set_class(&s->s_umount, &type->s_umount_key);
> > @@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > }
> > out:
> > return s;
> > +
> > +err_out_dentry_lru:
> > + list_lru_destroy(&s->s_dentry_lru);
> > err_out:
> > security_sb_free(s);
> > #ifdef CONFIG_SMP
>
> It seems we also need to call list_lru_destroy() in destroy_super()?
> like below:
>
> -----------
> diff --git a/fs/super.c b/fs/super.c
> index b79e732..06ee3af 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -269,6 +269,8 @@ err_out:
> */
> static inline void destroy_super(struct super_block *s)
> {
> + list_lru_destroy(&s->s_inode_lru);
> + list_lru_destroy(&s->s_dentry_lru);
> #ifdef CONFIG_SMP
> free_percpu(s->s_files);
> #endif
Hi
Thanks for taking a look at this.
list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-19 7:31 ` Glauber Costa
@ 2013-06-19 9:12 ` Li Zhong
0 siblings, 0 replies; 75+ messages in thread
From: Li Zhong @ 2013-06-19 9:12 UTC (permalink / raw)
To: Glauber Costa
Cc: Glauber Costa, akpm, linux-fsdevel, mgorman, david, linux-mm,
cgroups, kamezawa.hiroyu, mhocko, hannes, hughd, gthelen,
Dave Chinner
On Wed, 2013-06-19 at 11:31 +0400, Glauber Costa wrote:
> On Tue, Jun 18, 2013 at 05:42:01PM +0800, Li Zhong wrote:
> > On Fri, 2013-06-07 at 00:34 +0400, Glauber Costa wrote:
> >
> > > diff --git a/fs/super.c b/fs/super.c
> > > index 85a6104..1b6ef7b 100644
> > > --- a/fs/super.c
> > > +++ b/fs/super.c
> > > @@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > INIT_HLIST_NODE(&s->s_instances);
> > > INIT_HLIST_BL_HEAD(&s->s_anon);
> > > INIT_LIST_HEAD(&s->s_inodes);
> > > - list_lru_init(&s->s_dentry_lru);
> > > - list_lru_init(&s->s_inode_lru);
> > > +
> > > + if (list_lru_init(&s->s_dentry_lru))
> > > + goto err_out;
> > > + if (list_lru_init(&s->s_inode_lru))
> > > + goto err_out_dentry_lru;
> > > +
> > > INIT_LIST_HEAD(&s->s_mounts);
> > > init_rwsem(&s->s_umount);
> > > lockdep_set_class(&s->s_umount, &type->s_umount_key);
> > > @@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > }
> > > out:
> > > return s;
> > > +
> > > +err_out_dentry_lru:
> > > + list_lru_destroy(&s->s_dentry_lru);
> > > err_out:
> > > security_sb_free(s);
> > > #ifdef CONFIG_SMP
> >
> > It seems we also need to call list_lru_destroy() in destroy_super()?
> > like below:
> >
> > -----------
> > diff --git a/fs/super.c b/fs/super.c
> > index b79e732..06ee3af 100644
> > --- a/fs/super.c
> > +++ b/fs/super.c
> > @@ -269,6 +269,8 @@ err_out:
> > */
> > static inline void destroy_super(struct super_block *s)
> > {
> > + list_lru_destroy(&s->s_inode_lru);
> > + list_lru_destroy(&s->s_dentry_lru);
> > #ifdef CONFIG_SMP
> > free_percpu(s->s_files);
> > #endif
>
> Hi
>
> Thanks for taking a look at this.
>
> list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
Sorry, I'm a little confused...
I didn't see list_lru_destroy() called in deactivate_locked_super().
Maybe I missed something?
But it seems other memory allocated in alloc_super() is freed in
destroy_super(), e.g. ->s_files; why don't we also free this one here?
Thanks, Zhong
^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-19 9:12 ` Li Zhong
@ 2013-06-19 13:29 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-19 13:29 UTC (permalink / raw)
To: Li Zhong
Cc: Glauber Costa, akpm, linux-fsdevel, mgorman, david, linux-mm,
cgroups, kamezawa.hiroyu, mhocko, hannes, hughd, gthelen,
Dave Chinner
On Wed, Jun 19, 2013 at 05:12:28PM +0800, Li Zhong wrote:
> On Wed, 2013-06-19 at 11:31 +0400, Glauber Costa wrote:
> > On Tue, Jun 18, 2013 at 05:42:01PM +0800, Li Zhong wrote:
> > > On Fri, 2013-06-07 at 00:34 +0400, Glauber Costa wrote:
> > >
> > > > diff --git a/fs/super.c b/fs/super.c
> > > > index 85a6104..1b6ef7b 100644
> > > > --- a/fs/super.c
> > > > +++ b/fs/super.c
> > > > @@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > > INIT_HLIST_NODE(&s->s_instances);
> > > > INIT_HLIST_BL_HEAD(&s->s_anon);
> > > > INIT_LIST_HEAD(&s->s_inodes);
> > > > - list_lru_init(&s->s_dentry_lru);
> > > > - list_lru_init(&s->s_inode_lru);
> > > > +
> > > > + if (list_lru_init(&s->s_dentry_lru))
> > > > + goto err_out;
> > > > + if (list_lru_init(&s->s_inode_lru))
> > > > + goto err_out_dentry_lru;
> > > > +
> > > > INIT_LIST_HEAD(&s->s_mounts);
> > > > init_rwsem(&s->s_umount);
> > > > lockdep_set_class(&s->s_umount, &type->s_umount_key);
> > > > @@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > > }
> > > > out:
> > > > return s;
> > > > +
> > > > +err_out_dentry_lru:
> > > > + list_lru_destroy(&s->s_dentry_lru);
> > > > err_out:
> > > > security_sb_free(s);
> > > > #ifdef CONFIG_SMP
> > >
> > > It seems we also need to call list_lru_destroy() in destroy_super()?
> > > like below:
> > >
> > > -----------
> > > diff --git a/fs/super.c b/fs/super.c
> > > index b79e732..06ee3af 100644
> > > --- a/fs/super.c
> > > +++ b/fs/super.c
> > > @@ -269,6 +269,8 @@ err_out:
> > > */
> > > static inline void destroy_super(struct super_block *s)
> > > {
> > > + list_lru_destroy(&s->s_inode_lru);
> > > + list_lru_destroy(&s->s_dentry_lru);
> > > #ifdef CONFIG_SMP
> > > free_percpu(s->s_files);
> > > #endif
> >
> > Hi
> >
> > Thanks for taking a look at this.
> >
> > list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
>
> Sorry, I'm a little confused...
>
> I didn't see list_lru_destroy() called in deactivate_locked_super().
> Maybe I missed something?
Err... the code in my tree reads:
unregister_shrinker(&s->s_shrink);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
put_filesystem(fs);
put_super(s);
But then I have just checked Andrew's, and it is not there - thank you.
Andrew, should I send a patch for you to fold it?
>
> But it seems other memory allocated in alloc_super() is freed in
> destroy_super(), e.g. ->s_files; why don't we also free this one here?
Because we want this close to unregister_shrinker, it is a more natural
location for this.
^ permalink raw reply [flat|nested] 75+ messages in thread
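The placement Glauber defends is also an ordering constraint: unregister_shrinker() takes shrinker_rwsem for write and so waits out any shrinker callback still running under the read side, after which nothing can be walking the node array that list_lru_destroy() then frees. A sketch of the pairing, with a hypothetical owner structure:
/* Hypothetical owner of a shrinker plus a list_lru. */
struct my_owner {
	struct shrinker shrinker;
	struct list_lru lru;
};

static void my_owner_teardown(struct my_owner *o)
{
	unregister_shrinker(&o->shrinker);	/* waits out in-flight callbacks */
	list_lru_destroy(&o->lru);		/* node array now safe to free */
}
Reversing the two calls would let a concurrent shrink dereference freed memory.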
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-19 13:29 ` Glauber Costa
@ 2013-06-19 17:14 ` Andrew Morton
2013-06-20 0:50 ` Li Zhong
-1 siblings, 1 reply; 75+ messages in thread
From: Andrew Morton @ 2013-06-19 17:14 UTC (permalink / raw)
To: Glauber Costa
Cc: Li Zhong, Glauber Costa, linux-fsdevel, mgorman, david, linux-mm,
cgroups, kamezawa.hiroyu, mhocko, hannes, hughd, gthelen,
Dave Chinner
On Wed, 19 Jun 2013 17:29:06 +0400 Glauber Costa <glommer@gmail.com> wrote:
> > > Thanks for taking a look at this.
> > >
> > > list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
> >
> > Sorry, I'm a little confused...
> >
> > I didn't see list_lru_destroy() called in deactivate_locked_super().
> > Maybe I missed something?
>
> Err... the code in my tree reads:
>
> unregister_shrinker(&s->s_shrink);
> list_lru_destroy(&s->s_dentry_lru);
> list_lru_destroy(&s->s_inode_lru);
> put_filesystem(fs);
> put_super(s);
>
> But then I have just checked Andrew's, and it is not there - thank you.
That is added by "super: targeted memcg reclaim", which is in the part
of the series which we decided to defer.
> Andrew, should I send a patch for you to fold it?
Sure. Perhaps you could check for any other things which should be
brought over from the not-merged-yet patches?
^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-19 17:14 ` Andrew Morton
@ 2013-06-20 0:50 ` Li Zhong
0 siblings, 0 replies; 75+ messages in thread
From: Li Zhong @ 2013-06-20 0:50 UTC (permalink / raw)
To: Andrew Morton
Cc: Glauber Costa, Glauber Costa, linux-fsdevel, mgorman, david,
linux-mm, cgroups, kamezawa.hiroyu, mhocko, hannes, hughd,
gthelen, Dave Chinner
On Wed, 2013-06-19 at 10:14 -0700, Andrew Morton wrote:
> On Wed, 19 Jun 2013 17:29:06 +0400 Glauber Costa <glommer@gmail.com> wrote:
>
> > > > Thanks for taking a look at this.
> > > >
> > > > list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
> > >
> > > Sorry, I'm a little confused...
> > >
> > > I didn't see list_lru_destroy() called in deactivate_locked_super().
> > > Maybe I missed something?
> >
> > Err... the code in my tree reads:
> >
> > unregister_shrinker(&s->s_shrink);
> > list_lru_destroy(&s->s_dentry_lru);
> > list_lru_destroy(&s->s_inode_lru);
> > put_filesystem(fs);
> > put_super(s);
> >
> > But then I have just checked Andrew's, and it is not there - thank you.
>
> That is added by "super: targeted memcg reclaim", which is in the part
> of the series which we decided to defer.
Oh, yes, there it is. Sorry for the noise ...
>
> > Andrew, should I send a patch for you to fold it?
>
> Sure. Perhaps you could check for any other things which should be
> brought over from the not-merged-yet patches?
>
^ permalink raw reply [flat|nested] 75+ messages in thread
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-19 13:29 ` Glauber Costa
@ 2013-06-20 1:35 ` Li Zhong
-1 siblings, 0 replies; 75+ messages in thread
From: Li Zhong @ 2013-06-20 1:35 UTC (permalink / raw)
To: Glauber Costa
Cc: Glauber Costa, akpm, linux-fsdevel, mgorman, david, linux-mm,
cgroups, kamezawa.hiroyu, mhocko, hannes, hughd, gthelen,
Dave Chinner
On Wed, 2013-06-19 at 17:29 +0400, Glauber Costa wrote:
> On Wed, Jun 19, 2013 at 05:12:28PM +0800, Li Zhong wrote:
> > On Wed, 2013-06-19 at 11:31 +0400, Glauber Costa wrote:
> > > On Tue, Jun 18, 2013 at 05:42:01PM +0800, Li Zhong wrote:
> > > > On Fri, 2013-06-07 at 00:34 +0400, Glauber Costa wrote:
> > > >
> > > > > diff --git a/fs/super.c b/fs/super.c
> > > > > index 85a6104..1b6ef7b 100644
> > > > > --- a/fs/super.c
> > > > > +++ b/fs/super.c
> > > > > @@ -199,8 +199,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > > > INIT_HLIST_NODE(&s->s_instances);
> > > > > INIT_HLIST_BL_HEAD(&s->s_anon);
> > > > > INIT_LIST_HEAD(&s->s_inodes);
> > > > > - list_lru_init(&s->s_dentry_lru);
> > > > > - list_lru_init(&s->s_inode_lru);
> > > > > +
> > > > > + if (list_lru_init(&s->s_dentry_lru))
> > > > > + goto err_out;
> > > > > + if (list_lru_init(&s->s_inode_lru))
> > > > > + goto err_out_dentry_lru;
> > > > > +
> > > > > INIT_LIST_HEAD(&s->s_mounts);
> > > > > init_rwsem(&s->s_umount);
> > > > > lockdep_set_class(&s->s_umount, &type->s_umount_key);
> > > > > @@ -240,6 +244,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
> > > > > }
> > > > > out:
> > > > > return s;
> > > > > +
> > > > > +err_out_dentry_lru:
> > > > > + list_lru_destroy(&s->s_dentry_lru);
> > > > > err_out:
> > > > > security_sb_free(s);
> > > > > #ifdef CONFIG_SMP
> > > >
> > > > It seems we also need to call list_lru_destroy() in destroy_super()?
> > > > like below:
> > > >
> > > > -----------
> > > > diff --git a/fs/super.c b/fs/super.c
> > > > index b79e732..06ee3af 100644
> > > > --- a/fs/super.c
> > > > +++ b/fs/super.c
> > > > @@ -269,6 +269,8 @@ err_out:
> > > > */
> > > > static inline void destroy_super(struct super_block *s)
> > > > {
> > > > + list_lru_destroy(&s->s_inode_lru);
> > > > + list_lru_destroy(&s->s_dentry_lru);
> > > > #ifdef CONFIG_SMP
> > > > free_percpu(s->s_files);
> > > > #endif
> > >
> > > Hi
> > >
> > > Thanks for taking a look at this.
> > >
> > > list_lru_destroy is called by deactivate_locked_super, so we should be fine already.
> >
> > Sorry, I'm a little confused...
> >
> > I didn't see list_lru_destroy() called in deactivate_locked_super().
> > Maybe I missed something?
>
> Err... the code in my tree reads:
>
> unregister_shrinker(&s->s_shrink);
> list_lru_destroy(&s->s_dentry_lru);
> list_lru_destroy(&s->s_inode_lru);
> put_filesystem(fs);
> put_super(s);
>
> But then I have just checked Andrew's, and it is not there - thank you.
>
> Andrew, should I send a patch for you to fold it?
>
>
> >
> > But it seems other memory allocated in alloc_super() is freed in
> > destroy_super(), e.g. ->s_files, so why don't we also free this one here?
> Because we want this close to unregister_shrinker, it is a more natural
> location for this.
OK, I see. However, in some rare cases destroy_super() might be called
right after alloc_super() without the superblock ever being activated,
e.g. in sget(), when no existing superblock matches (!test) and set()
fails while initializing the fs private info.
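The sget() corner case mentioned above looks roughly like this (a
simplified sketch of the fs/super.c code of the era, lookup loop elided):

	struct super_block *s = NULL;
	int err;

retry:
	spin_lock(&sb_lock);
	/* ... look for an existing matching superblock; none found ... */
	if (!s) {
		spin_unlock(&sb_lock);
		s = alloc_super(type, flags);	/* LRUs are initialized here */
		if (!s)
			return ERR_PTR(-ENOMEM);
		goto retry;
	}

	err = set(s, data);
	if (err) {
		spin_unlock(&sb_lock);
		up_write(&s->s_umount);
		destroy_super(s);	/* deactivate_locked_super() never runs */
		return ERR_PTR(err);
	}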
By the way, there is memory allocated in register_shrinker() based on
nr_node_ids, so maybe we need to free it in unregister_shrinker()? Maybe
that's already covered in the deferred part.
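The allocation in question is the per-node deferred-work array sized by
nr_node_ids; a sketch of the pairing being suggested, modeled on the
mm/vmscan.c changes in this series:

	int register_shrinker(struct shrinker *shrinker)
	{
		size_t size = sizeof(*shrinker->nr_deferred) * nr_node_ids;

		shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
		if (!shrinker->nr_deferred)
			return -ENOMEM;

		down_write(&shrinker_rwsem);
		list_add_tail(&shrinker->list, &shrinker_list);
		up_write(&shrinker_rwsem);
		return 0;
	}

	void unregister_shrinker(struct shrinker *shrinker)
	{
		down_write(&shrinker_rwsem);
		list_del(&shrinker->list);
		up_write(&shrinker_rwsem);
		kfree(shrinker->nr_deferred);	/* free the per-node array */
	}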
* Re: [PATCH v11 25/25] list_lru: dynamically adjust node arrays
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-20 2:37 ` Dave Chinner
-1 siblings, 0 replies; 75+ messages in thread
From: Dave Chinner @ 2013-06-20 2:37 UTC (permalink / raw)
To: Glauber Costa
Cc: akpm, linux-fsdevel, mgorman, linux-mm, cgroups, kamezawa.hiroyu,
mhocko, hannes, hughd, gthelen, Dave Chinner
On Fri, Jun 07, 2013 at 12:34:58AM +0400, Glauber Costa wrote:
> We currently use a compile-time constant to size the node array for the
> list_lru structure. Due to this, we don't need to allocate any memory at
> initialization time. But as a consequence, the structures that contain
> embedded list_lru lists can become way too big (the superblock for
> instance contains two of them).
>
> This patch aims at ameliorating this situation by dynamically allocating
> the node arrays with the firmware provided nr_node_ids.
>
> Signed-off-by: Glauber Costa <glommer@openvz.org>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Mel Gorman <mgorman@suse.de>
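Concretely, the dynamic allocation this changelog describes comes down
to sizing the node array at runtime (a sketch matching the mm/list_lru.c
changes in the patch):

	int list_lru_init(struct list_lru *lru)
	{
		int i;
		size_t size = sizeof(*lru->node) * nr_node_ids;

		lru->node = kzalloc(size, GFP_KERNEL);
		if (!lru->node)
			return -ENOMEM;

		nodes_clear(lru->active_nodes);
		for (i = 0; i < nr_node_ids; i++) {
			spin_lock_init(&lru->node[i].lock);
			INIT_LIST_HEAD(&lru->node[i].list);
			lru->node[i].nr_items = 0;
		}
		return 0;
	}

	void list_lru_destroy(struct list_lru *lru)
	{
		kfree(lru->node);
	}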
Just a small bug:
> index c3f8ea9..9c2b656 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1591,6 +1591,7 @@ xfs_free_buftarg(
> struct xfs_mount *mp,
> struct xfs_buftarg *btp)
> {
> + list_lru_destroy(&btp->bt_lru);
> unregister_shrinker(&btp->bt_shrinker);
Unregister the shrinker before destroying the list the shrinker
walks. Same for all the other cases....
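In code, the ordering being asked for is simply the reverse of the hunk
above (a sketch; the rest of xfs_free_buftarg() is assumed unchanged):

	void
	xfs_free_buftarg(
		struct xfs_mount	*mp,
		struct xfs_buftarg	*btp)
	{
		/* stop the shrinker first, so nothing can still walk bt_lru */
		unregister_shrinker(&btp->bt_shrinker);
		/* only then tear down the list the shrinker was walking */
		list_lru_destroy(&btp->bt_lru);
		/* ... remainder unchanged ... */
	}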
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH v11 00/25] shrinkers rework: per-numa, generic lists, etc
2013-06-06 20:34 ` Glauber Costa
@ 2013-06-06 21:15 ` Andrew Morton
-1 siblings, 0 replies; 75+ messages in thread
From: Andrew Morton @ 2013-06-06 21:15 UTC (permalink / raw)
To: Glauber Costa
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, mgorman-l3A5Bk7waGM,
david-FqsqvQoI3Ljby3iVrkZq2A, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA
On Fri, 7 Jun 2013 00:34:33 +0400 Glauber Costa <glommer-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
> Andrew,
>
> I believe I have addressed most of your comments, while attempting to address
> all of them. If there is anything I have missed after this long day, let me
> know and I will go over it promptly.
I'll trust you - I've had my fill of costacode this week ;)
Can you send over a nice introductory [patch 0/n] as an overview of the
whole series?
* Re: [PATCH v11 00/25] shrinkers rework: per-numa, generic lists, etc
2013-06-06 21:15 ` Andrew Morton
@ 2013-06-07 6:11 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-07 6:11 UTC (permalink / raw)
To: Andrew Morton
Cc: Glauber Costa, linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen
On 06/07/2013 01:15 AM, Andrew Morton wrote:
> On Fri, 7 Jun 2013 00:34:33 +0400 Glauber Costa <glommer@openvz.org> wrote:
>
>> Andrew,
>>
>> I believe I have addressed most of your comments, while attempting to address
>> all of them. If there is anything I have missed after this long day, let me
>> know and I will go over it promptly.
>
> I'll trust you - I've had my fill of costacode this week ;)
>
In all fairness, since I have just sent the first part of the series,
most of it is still davecode.
> Can you send over a nice introductory [patch 0/n] as an overview of the
> whole series?
>
Sure. I will do shortly, as soon as I get to my office.
* Re: [PATCH v11 00/25] shrinkers rework: per-numa, generic lists, etc
2013-06-07 6:11 ` Glauber Costa
@ 2013-06-07 7:08 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-07 7:08 UTC (permalink / raw)
To: Andrew Morton
Cc: Glauber Costa, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
mgorman-l3A5Bk7waGM, david-FqsqvQoI3Ljby3iVrkZq2A,
linux-mm-Bw31MaZKKs3YtjvyW6yDsg, cgroups-u79uwXL29TY76Z2rM5mHXA,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A, mhocko-Y4LbUc7mvzI,
hannes-druUgvl0LCNAfugRpC6u6w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
gthelen-hpIqsD4AKlfQT0dZR+AlfA
On 06/07/2013 10:11 AM, Glauber Costa wrote:
>> > I'll trust you - I've had my fill of costacode this week ;)
>> >
> In all fairness, since I have just sent the first part of the series,
> most of it is still davecode.
>
Although that makes me wonder whether all those "CC"s people have been
writing in changelogs actually stood for "Costa code", and I was being
credited with all the work being done without knowing it.
* Re: [PATCH v11 00/25] shrinkers rework: per-numa, generic lists, etc
2013-06-06 21:15 ` Andrew Morton
(?)
@ 2013-06-07 8:04 ` Glauber Costa
-1 siblings, 0 replies; 75+ messages in thread
From: Glauber Costa @ 2013-06-07 8:04 UTC (permalink / raw)
To: Andrew Morton
Cc: Glauber Costa, linux-fsdevel, mgorman, david, linux-mm, cgroups,
kamezawa.hiroyu, mhocko, hannes, hughd, gthelen
[-- Attachment #1: Type: text/plain, Size: 515 bytes --]
On 06/07/2013 01:15 AM, Andrew Morton wrote:
> On Fri, 7 Jun 2013 00:34:33 +0400 Glauber Costa <glommer@openvz.org> wrote:
>
>> Andrew,
>>
>> I believe I have addressed most of your comments, while attempting to address
>> all of them. If there is anything I have missed after this long day, let me
>> know and I will go over it promptly.
>
> I'll trust you - I've had my fill of costacode this week ;)
>
> Can you send over a nice introductory [patch 0/n] as an overview of the
> whole series?
>
here it is.
[-- Attachment #2: 0000-cover-letter.patch --]
[-- Type: text/x-patch, Size: 6345 bytes --]
From 748b830897f3b62271f92a01abc2a32c1d5d41cd Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@openvz.org>
Date: Fri, 7 Jun 2013 00:00:44 +0400
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Subject: [PATCH v11 00/25] shrinkers rework: per-numa, generic lists, etc
This series reworks our current object cache shrinking infrastructure in two
main ways:
* Noticing that a lot of users copy and paste their own version of LRU
lists for objects, we put some effort in providing a generic version. It is
modeled after the filesystem users: dentries, inodes, and xfs (for various
tasks), but we expect that other users could benefit in the near future with
little or no modification. Let us know if you have any issues.
* The underlying list_lru being proposed automatically and transparently keeps
the elements in per-node lists, and is able to manipulate the node lists
individually. Given this infrastructure, we are able to modify the up-to-now
hammer called shrink_slab to proceed with node-reclaim instead of always
searching memory from all over like it has been doing.
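To make the shape of the interface concrete, a minimal hypothetical user
would look like the sketch below; my_obj, my_lru and the helpers are
made-up names, while the list_lru calls are the ones this series adds:

	#include <linux/list_lru.h>

	struct my_obj {				/* hypothetical cached object */
		struct list_head	lru;	/* linkage used by list_lru */
	};

	static struct list_lru my_lru;		/* per-node lists kept internally */

	static int my_cache_init(void)
	{
		return list_lru_init(&my_lru);
	}

	static void my_obj_became_unused(struct my_obj *obj)
	{
		/* lands on the list of the node the object lives on */
		list_lru_add(&my_lru, &obj->lru);
	}

	static unsigned long my_cache_count(int nid)
	{
		/* node-aware reclaim only looks at the node under pressure */
		return list_lru_count_node(&my_lru, nid);
	}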
Per-node lru lists are also expected to lead to less contention in the lru
locks on multi-node scans, since we are now no longer fighting for a global
lock. The locks usually disappear from the profilers with this change.
Although we have no official benchmarks for this version - feel free to
evaluate it independently - earlier versions of this series were performance
tested (details at http://permalink.gmane.org/gmane.linux.kernel.mm/100537),
showing no visible performance regressions while yielding a better qualitative
behavior in NUMA machines.
With this infrastructure in place, we can use the list_lru entry point to
provide memcg isolation and per-memcg targeted reclaim. Historically, those
two pieces of work have been posted together. This version presents only
the infrastructure work, deferring the memcg work for a later time, so we can
focus on getting this part tested. You can see more about the history of
such work at http://lwn.net/Articles/552769/
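These lists plug into the reworked count/scan shrinker API from the
patches below; a sketch of a converted, node-aware shrinker (my_count and
my_scan are placeholder names, the fields are the ones added by "mm: new
shrinker API" and "shrinker: add node awareness"):

	static unsigned long
	my_count(struct shrinker *shrink, struct shrink_control *sc)
	{
		/* report how many objects sit on this node's LRU */
		return list_lru_count_node(&my_lru, sc->nid);
	}

	static unsigned long
	my_scan(struct shrinker *shrink, struct shrink_control *sc)
	{
		unsigned long freed = 0;

		/* walk sc->nid only, disposing of up to sc->nr_to_scan
		 * objects via list_lru_walk_node(); return the number
		 * actually freed, or SHRINK_STOP to abort */
		return freed;
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE,
	};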
Dave Chinner (18):
dcache: convert dentry_stat.nr_unused to per-cpu counters
dentry: move to per-sb LRU locks
dcache: remove dentries from LRU before putting on dispose list
mm: new shrinker API
shrinker: convert superblock shrinkers to new API
list: add a new LRU list type
inode: convert inode lru list to generic lru list code.
dcache: convert to use new lru list infrastructure
list_lru: per-node list infrastructure
shrinker: add node awareness
fs: convert inode and dentry shrinking to be node aware
xfs: convert buftarg LRU to generic code
xfs: rework buffer dispose list tracking
xfs: convert dquot cache lru to list_lru
fs: convert fs shrinkers to new scan/count API
drivers: convert shrinkers to new count/scan API
shrinker: convert remaining shrinkers to count/scan API
shrinker: Kill old ->shrink API.
Glauber Costa (7):
fs: bump inode and dentry counters to long
super: fix calculation of shrinkable objects for small numbers
list_lru: per-node API
vmscan: per-node deferred work
i915: bail out earlier when shrinker cannot acquire mutex
hugepage: convert huge zero page shrinker to new shrinker API
list_lru: dynamically adjust node arrays
arch/x86/kvm/mmu.c | 24 ++-
drivers/gpu/drm/i915/i915_dma.c | 4 +-
drivers/gpu/drm/i915/i915_gem.c | 71 +++++---
drivers/gpu/drm/ttm/ttm_page_alloc.c | 44 +++--
drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 51 ++++--
drivers/md/bcache/btree.c | 43 +++--
drivers/md/bcache/sysfs.c | 2 +-
drivers/md/dm-bufio.c | 61 ++++---
drivers/staging/android/ashmem.c | 44 +++--
drivers/staging/android/lowmemorykiller.c | 40 +++--
drivers/staging/zcache/zcache-main.c | 29 +--
fs/dcache.c | 270 +++++++++++++++++-----------
fs/drop_caches.c | 1 +
fs/ext4/extents_status.c | 30 ++--
fs/gfs2/glock.c | 30 ++--
fs/gfs2/main.c | 3 +-
fs/gfs2/quota.c | 16 +-
fs/gfs2/quota.h | 4 +-
fs/inode.c | 193 +++++++++-----------
fs/internal.h | 6 +-
fs/mbcache.c | 49 ++---
fs/nfs/dir.c | 16 +-
fs/nfs/internal.h | 4 +-
fs/nfs/super.c | 3 +-
fs/nfsd/nfscache.c | 31 +++-
fs/quota/dquot.c | 34 ++--
fs/super.c | 106 ++++++-----
fs/ubifs/shrinker.c | 22 ++-
fs/ubifs/super.c | 3 +-
fs/ubifs/ubifs.h | 3 +-
fs/xfs/xfs_buf.c | 253 +++++++++++++-------------
fs/xfs/xfs_buf.h | 17 +-
fs/xfs/xfs_dquot.c | 7 +-
fs/xfs/xfs_icache.c | 4 +-
fs/xfs/xfs_icache.h | 2 +-
fs/xfs/xfs_qm.c | 285 ++++++++++++++++--------------
fs/xfs/xfs_qm.h | 4 +-
fs/xfs/xfs_super.c | 12 +-
include/linux/dcache.h | 14 +-
include/linux/fs.h | 25 ++-
include/linux/list_lru.h | 148 ++++++++++++++++
include/linux/shrinker.h | 54 ++++--
include/trace/events/vmscan.h | 4 +-
include/uapi/linux/fs.h | 6 +-
kernel/sysctl.c | 6 +-
mm/Makefile | 2 +-
mm/huge_memory.c | 17 +-
mm/list_lru.c | 186 +++++++++++++++++++
mm/memory-failure.c | 2 +
mm/vmscan.c | 242 ++++++++++++++-----------
net/sunrpc/auth.c | 41 +++--
51 files changed, 1620 insertions(+), 948 deletions(-)
create mode 100644 include/linux/list_lru.h
create mode 100644 mm/list_lru.c
--
1.8.1.4
^ permalink raw reply [flat|nested] 75+ messages in thread