* [RFC 01/11] vfs: introduce one structure hot_info
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
@ 2012-09-11 14:27 ` zwu.kernel
2012-09-11 14:27 ` [RFC 02/11] vfs: introduce one rb tree - hot_inode_tree zwu.kernel
` (4 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: zwu.kernel @ 2012-09-11 14:27 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, Zhi Yong Wu
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Define one root structure, hot_info, and hook it up
in super_block. It will be used to hold the rb tree
roots, the hash list root and some other information.
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
include/linux/fs.h | 4 ++++
include/linux/hot_track.h | 26 ++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 0 deletions(-)
create mode 100644 include/linux/hot_track.h
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa11047..6229895 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -415,6 +415,7 @@ struct inodes_stat_t {
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
+#include <linux/hot_track.h>
#include <asm/byteorder.h>
@@ -1578,6 +1579,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+
+ /* Hot data tracking info */
+ struct hot_info s_hotinfo;
};
/* superblock cache pruning functions */
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
new file mode 100644
index 0000000..5716b93
--- /dev/null
+++ b/include/linux/hot_track.h
@@ -0,0 +1,26 @@
+/*
+ * include/linux/hot_track.h
+ *
+ * This file has definitions for VFS hot data tracking
+ * structures etc.
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ * Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_HOTTRACK_H
+#define _LINUX_HOTTRACK_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/kref.h>
+
+struct hot_info {
+};
+
+#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [RFC 02/11] vfs: introduce one rb tree - hot_inode_tree
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
2012-09-11 14:27 ` [RFC 01/11] vfs: introduce one structure hot_info zwu.kernel
@ 2012-09-11 14:27 ` zwu.kernel
2012-09-11 14:27 ` [RFC 03/11] vfs: introduce 2 rb tree items - inode and range zwu.kernel
` (3 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: zwu.kernel @ 2012-09-11 14:27 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, Zhi Yong Wu
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Add a hot_inode_tree struct to keep track of
frequently accessed files; it is keyed by {inode, offset}.
The tree contains hot_inode_items representing those files
and ranges.
Having these trees means that vfs can quickly determine the
temperature of some data by doing some calculations on the
hot_freq_data struct that hangs off of the tree item.
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
fs/Makefile | 3 ++-
fs/hot_rb.c | 30 ++++++++++++++++++++++++++++++
fs/hot_rb.h | 21 +++++++++++++++++++++
include/linux/hot_track.h | 9 +++++++++
4 files changed, 62 insertions(+), 1 deletions(-)
create mode 100644 fs/hot_rb.c
create mode 100644 fs/hot_rb.h
diff --git a/fs/Makefile b/fs/Makefile
index 2fb9779..d3bc906 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o \
+ hot_rb.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
new file mode 100644
index 0000000..726d1c5
--- /dev/null
+++ b/fs/hot_rb.c
@@ -0,0 +1,30 @@
+/*
+ * fs/hot_rb.c
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ * Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/blkdev.h>
+#include "hot_rb.h"
+#include "hot_hash.h"
+
+/*
+ * Initialize the inode tree. Should be called for each new inode
+ * access or other user of the hot_inode interface.
+ */
+void hot_rb_inode_tree_init(struct hot_inode_tree *tree)
+{
+ tree->map = RB_ROOT;
+ rwlock_init(&tree->lock);
+}
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
new file mode 100644
index 0000000..895c61c
--- /dev/null
+++ b/fs/hot_rb.h
@@ -0,0 +1,21 @@
+/*
+ * fs/hot_rb.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ * Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_MAP__
+#define __HOT_MAP__
+
+#include <linux/rbtree.h>
+#include <linux/hot_track.h>
+
+void hot_rb_inode_tree_init(struct hot_inode_tree *tree);
+
+#endif /* __HOT_MAP__ */
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 5716b93..fa2aeb6 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -20,7 +20,16 @@
#include <linux/rbtree.h>
#include <linux/kref.h>
+/* A tree that sits on the hot_info */
+struct hot_inode_tree {
+ struct rb_root map;
+ rwlock_t lock;
+};
+
struct hot_info {
+
+ /* red-black tree that keeps track of fs-wide hot data */
+ struct hot_inode_tree hot_inode_tree;
};
#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [RFC 03/11] vfs: introduce 2 rb tree items - inode and range
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
2012-09-11 14:27 ` [RFC 01/11] vfs: introduce one structure hot_info zwu.kernel
2012-09-11 14:27 ` [RFC 02/11] vfs: introduce one rb tree - hot_inode_tree zwu.kernel
@ 2012-09-11 14:27 ` zwu.kernel
2012-09-11 14:27 ` [RFC 04/11] vfs: add support for updating access frequency zwu.kernel
` (2 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: zwu.kernel @ 2012-09-11 14:27 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, Zhi Yong Wu
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Define two items, hot_inode_item and hot_range_item.
The former represents one tracked file and is used
to keep track of its access frequency and the tree of
ranges in this file, while the latter represents
a file range of one inode.
Each of the two structures contains a hot_freq_data
struct with its frequency of access metrics (number of
{reads, writes}, last {read,write} time, frequency of
{reads,writes}).
Also, each hot_inode_item contains one hot_range_tree
struct which is keyed by {inode, offset, length}
and used to keep track of all the ranges in this file.
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
fs/Makefile | 2 +-
fs/dcache.c | 2 +
fs/hot_rb.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
fs/hot_rb.h | 9 +++++
fs/hot_track.c | 26 ++++++++++++++++
fs/hot_track.h | 20 ++++++++++++
include/linux/hot_track.h | 62 +++++++++++++++++++++++++++++++++++++
7 files changed, 194 insertions(+), 1 deletions(-)
create mode 100644 fs/hot_track.c
create mode 100644 fs/hot_track.h
diff --git a/fs/Makefile b/fs/Makefile
index d3bc906..b4f620e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
stack.o fs_struct.o statfs.o \
- hot_rb.o
+ hot_rb.o hot_track.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/dcache.c b/fs/dcache.c
index 8086636..e64d7e7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -39,6 +39,7 @@
#include <linux/ratelimit.h>
#include "internal.h"
#include "mount.h"
+#include "hot_track.h"
/*
* Usage:
@@ -3164,6 +3165,7 @@ void __init vfs_caches_init(unsigned long mempages)
inode_init();
files_init(mempages);
mnt_init();
+ hot_track_item_cache_init();
bdev_cache_init();
chrdev_init();
}
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index 726d1c5..e2bee75 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -19,6 +19,10 @@
#include "hot_rb.h"
#include "hot_hash.h"
+/* kmem_cache pointers for slab caches */
+static struct kmem_cache *hot_inode_item_cache;
+static struct kmem_cache *hot_range_item_cache;
+
/*
* Initialize the inode tree. Should be called for each new inode
* access or other user of the hot_inode interface.
@@ -28,3 +32,73 @@ void hot_rb_inode_tree_init(struct hot_inode_tree *tree)
tree->map = RB_ROOT;
rwlock_init(&tree->lock);
}
+
+/*
+ * Initialize the hot range tree. Should be called for each new inode
+ * access or other user of the hot_range interface.
+ */
+void hot_rb_range_tree_init(struct hot_range_tree *tree)
+{
+ tree->map = RB_ROOT;
+ rwlock_init(&tree->lock);
+}
+
+/* init hot_inode_item and hot_range_item kmem cache */
+int __init hot_rb_item_cache_init(void)
+{
+ hot_inode_item_cache = kmem_cache_create("hot_inode_item",
+ sizeof(struct hot_inode_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ hot_rb_inode_item_init);
+ if (!hot_inode_item_cache)
+ goto inode_err;
+
+ hot_range_item_cache = kmem_cache_create("hot_range_item",
+ sizeof(struct hot_range_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ hot_rb_range_item_init);
+ if (!hot_range_item_cache)
+ goto range_err;
+
+ return 0;
+
+range_err:
+ kmem_cache_destroy(hot_inode_item_cache);
+inode_err:
+ return -ENOMEM;
+}
+
+/*
+ * Initialize a new hot_inode_item structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using hot_rb_free_hot_inode_item()
+ */
+void hot_rb_inode_item_init(void *_item)
+{
+ struct hot_inode_item *he = _item;
+
+ memset(he, 0, sizeof(*he));
+ kref_init(&he->refs);
+ spin_lock_init(&he->lock);
+ he->hot_freq_data.avg_delta_reads = (u64) -1;
+ he->hot_freq_data.avg_delta_writes = (u64) -1;
+ he->hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
+ hot_rb_range_tree_init(&he->hot_range_tree);
+}
+
+/*
+ * Initialize a new hot_range_item structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using hot_rb_free_hot_range_item()
+ */
+void hot_rb_range_item_init(void *_item)
+{
+ struct hot_range_item *hr = _item;
+
+ memset(hr, 0, sizeof(*hr));
+ kref_init(&hr->refs);
+ spin_lock_init(&hr->lock);
+ hr->hot_freq_data.avg_delta_reads = (u64) -1;
+ hr->hot_freq_data.avg_delta_writes = (u64) -1;
+ hr->hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
+}
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 895c61c..9a68d699 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -18,4 +18,13 @@
void hot_rb_inode_tree_init(struct hot_inode_tree *tree);
+/* values for hot_freq_data flags */
+#define FREQ_DATA_TYPE_INODE (1 << 0) /* freq data struct is for an inode */
+#define FREQ_DATA_TYPE_RANGE (1 << 1) /* freq data struct is for a range */
+
+void hot_rb_inode_item_init(void *_item);
+void hot_rb_range_item_init(void *_item);
+
+int __init hot_rb_item_cache_init(void);
+
#endif /* __HOT_MAP__ */
diff --git a/fs/hot_track.c b/fs/hot_track.c
new file mode 100644
index 0000000..3690f26
--- /dev/null
+++ b/fs/hot_track.c
@@ -0,0 +1,26 @@
+/*
+ * fs/hot_track.c
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ * Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/parser.h>
+#include "hot_track.h"
+
+/*
+ * Initialize hot_inode_item, hot_range_item
+ * and hot_hash_node kmem cache
+ */
+void __init hot_track_item_cache_init(void)
+{
+ if (hot_rb_item_cache_init())
+ return;
+}
diff --git a/fs/hot_track.h b/fs/hot_track.h
new file mode 100644
index 0000000..cf4cf35
--- /dev/null
+++ b/fs/hot_track.h
@@ -0,0 +1,20 @@
+/*
+ * fs/hot_track.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ * Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_TRACK__
+#define __HOT_TRACK__
+
+#include "hot_rb.h"
+
+void __init hot_track_item_cache_init(void);
+
+#endif /* __HOT_TRACK__ */
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index fa2aeb6..2256496 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -26,6 +26,68 @@ struct hot_inode_tree {
rwlock_t lock;
};
+/* A tree of ranges for each inode in the hot_inode_tree */
+struct hot_range_tree {
+ struct rb_root map;
+ rwlock_t lock;
+};
+
+/* A frequency data struct holds values that are used to
+ * determine temperature of files and file ranges. These structs
+ * are members of hot_inode_item and hot_range_item
+ */
+struct hot_freq_data {
+ struct timespec last_read_time;
+ struct timespec last_write_time;
+ u32 nr_reads;
+ u32 nr_writes;
+ u64 avg_delta_reads;
+ u64 avg_delta_writes;
+ u8 flags;
+ u32 last_temperature;
+};
+
+/* An item representing an inode and its access frequency */
+struct hot_inode_item {
+ /* node for hot_inode_tree rb_tree */
+ struct rb_node rb_node;
+ /* tree of ranges in this inode */
+ struct hot_range_tree hot_range_tree;
+ /* frequency data for this inode */
+ struct hot_freq_data hot_freq_data;
+ /* inode number, copied from inode */
+ unsigned long i_ino;
+ /* used to check for errors in ref counting */
+ u8 in_tree;
+ /* protects hot_freq_data, i_ino, in_tree */
+ spinlock_t lock;
+ /* prevents kfree */
+ struct kref refs;
+};
+
+/*
+ * An item representing a range inside of an inode whose frequency
+ * is being tracked
+ */
+struct hot_range_item {
+ /* node for hot_range_tree rb_tree */
+ struct rb_node rb_node;
+ /* frequency data for this range */
+ struct hot_freq_data hot_freq_data;
+ /* the hot_inode_item associated with this hot_range_item */
+ struct hot_inode_item *hot_inode;
+ /* starting offset of this range */
+ u64 start;
+ /* length of this range */
+ u64 len;
+ /* used to check for errors in ref counting */
+ u8 in_tree;
+ /* protects hot_freq_data, start, len, and in_tree */
+ spinlock_t lock;
+ /* prevents kfree */
+ struct kref refs;
+};
+
struct hot_info {
/* red-black tree that keeps track of fs-wide hot data */
--
1.7.6.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [RFC 04/11] vfs: add support for updating access frequency
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
` (2 preceding siblings ...)
2012-09-11 14:27 ` [RFC 03/11] vfs: introduce 2 rb tree items - inode and range zwu.kernel
@ 2012-09-11 14:27 ` zwu.kernel
2012-09-11 14:27 ` [RFC 05/11] vfs: add one new mount option -o hottrack zwu.kernel
2012-09-12 14:31 ` [RFC 00/11] VFS: hot data tracking Zhi Yong Wu
5 siblings, 0 replies; 8+ messages in thread
From: zwu.kernel @ 2012-09-11 14:27 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, Zhi Yong Wu
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Add some utility helpers to update access frequencies
for one file or its ranges.
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
fs/hot_rb.c | 359 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/hot_rb.h | 28 +++++
2 files changed, 387 insertions(+), 0 deletions(-)
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index e2bee75..560841a 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -102,3 +102,362 @@ void hot_rb_range_item_init(void *_item)
hr->hot_freq_data.avg_delta_writes = (u64) -1;
hr->hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
}
+
+/*
+ * Drop one reference on a hot_inode_item and free the structure
+ * if the reference count hits zero
+ */
+void hot_rb_free_hot_inode_item(struct hot_inode_item *he)
+{
+ if (!he)
+ return;
+
+ if (atomic_dec_and_test(&he->refs.refcount)) {
+ WARN_ON(he->in_tree);
+ kmem_cache_free(hot_inode_item_cache, he);
+ }
+}
+
+/*
+ * Drop one reference on a hot_range_item and free the structure
+ * if the reference count hits zero
+ */
+void hot_rb_free_hot_range_item(struct hot_range_item *hr)
+{
+ if (!hr)
+ return;
+
+ if (atomic_dec_and_test(&hr->refs.refcount)) {
+ WARN_ON(hr->in_tree);
+ kmem_cache_free(hot_range_item_cache, hr);
+ }
+}
+
+static struct rb_node *hot_rb_insert_hot_inode_item(struct rb_root *root,
+ unsigned long inode_num,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct hot_inode_item *entry;
+
+ /* walk tree to find insertion point */
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct hot_inode_item, rb_node);
+
+ if (inode_num < entry->i_ino)
+ p = &(*p)->rb_left;
+ else if (inode_num > entry->i_ino)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct hot_inode_item, rb_node);
+ entry->in_tree = 1;
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return NULL;
+}
+
+static u64 hot_rb_range_end(struct hot_range_item *hr)
+{
+ if (hr->start + hr->len < hr->start)
+ return (u64)-1;
+
+ return hr->start + hr->len - 1;
+}
+
+static struct rb_node *hot_rb_insert_hot_range_item(struct rb_root *root,
+ u64 start,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct hot_range_item *entry;
+
+ /* ensure start is on a range boundary */
+ start = start & RANGE_SIZE_MASK;
+ /* walk tree to find insertion point */
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct hot_range_item, rb_node);
+
+ if (start < entry->start)
+ p = &(*p)->rb_left;
+ else if (start >= hot_rb_range_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct hot_range_item, rb_node);
+ entry->in_tree = 1;
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return NULL;
+}
+
+/*
+ * Add a hot_inode_item to a hot_inode_tree. If the tree already contains
+ * an item with the index given, return -EEXIST
+ */
+int hot_rb_add_hot_inode_item(struct hot_inode_tree *tree,
+ struct hot_inode_item *he)
+{
+ int ret = 0;
+ struct rb_node *rb;
+
+ rb = hot_rb_insert_hot_inode_item(
+ &tree->map, he->i_ino, &he->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ kref_get(&he->refs);
+
+out:
+ return ret;
+}
+
+/*
+ * Add a hot_range_item to a hot_range_tree. If the tree already contains
+ * an item with the index given, return -EEXIST
+ *
+ * Also optionally aggressively merge ranges (currently disabled)
+ */
+int hot_rb_add_hot_range_item(struct hot_range_tree *tree,
+ struct hot_range_item *hr)
+{
+ int ret = 0;
+ struct rb_node *rb;
+
+ rb = hot_rb_insert_hot_range_item(
+ &tree->map, hr->start, &hr->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ kref_get(&hr->refs);
+
+out:
+ return ret;
+}
+
+/*
+ * Lookup a hot_inode_item in the hot_inode_tree with the given index
+ * (inode_num)
+ */
+struct hot_inode_item
+*hot_rb_lookup_hot_inode_item(struct hot_inode_tree *tree,
+ unsigned long inode_num)
+{
+ struct rb_node **p = &(tree->map.rb_node);
+ struct rb_node *parent = NULL;
+ struct hot_inode_item *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct hot_inode_item, rb_node);
+
+ if (inode_num < entry->i_ino)
+ p = &(*p)->rb_left;
+ else if (inode_num > entry->i_ino)
+ p = &(*p)->rb_right;
+ else {
+ kref_get(&entry->refs);
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Lookup a hot_range_item in a hot_range_tree with the given index
+ * (start, rounded down to a range boundary)
+ */
+struct hot_range_item
+*hot_rb_lookup_hot_range_item(struct hot_range_tree *tree,
+ u64 start)
+{
+ struct rb_node **p = &(tree->map.rb_node);
+ struct rb_node *parent = NULL;
+ struct hot_range_item *entry;
+
+ /* ensure start is on a range boundary */
+ start = start & RANGE_SIZE_MASK;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct hot_range_item, rb_node);
+
+ if (start < entry->start)
+ p = &(*p)->rb_left;
+ else if (start > hot_rb_range_end(entry))
+ p = &(*p)->rb_right;
+ else {
+ kref_get(&entry->refs);
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+
+/* Update inode frequency struct */
+static struct hot_inode_item *hot_rb_update_inode_freq(struct inode *inode,
+ int rw)
+{
+ struct hot_info *root = &(inode->i_sb->s_hotinfo);
+ struct hot_inode_tree *hitree = &(root->hot_inode_tree);
+ struct hot_inode_item *he;
+
+ read_lock(&hitree->lock);
+ he = hot_rb_lookup_hot_inode_item(hitree, inode->i_ino);
+ read_unlock(&hitree->lock);
+
+ if (!he) {
+ he = kmem_cache_alloc(hot_inode_item_cache,
+ GFP_KERNEL | GFP_NOFS);
+ if (!he)
+ goto out;
+
+ write_lock(&hitree->lock);
+ hot_rb_inode_item_init(he);
+ he->i_ino = inode->i_ino;
+ hot_rb_add_hot_inode_item(hitree, he);
+ write_unlock(&hitree->lock);
+ }
+
+ spin_lock(&he->lock);
+ hot_rb_update_freq(&he->hot_freq_data, rw);
+ spin_unlock(&he->lock);
+
+out:
+ return he;
+}
+
+/* Update range frequency struct */
+static bool hot_rb_update_range_freq(struct hot_inode_item *he,
+ u64 off, u64 len, int rw,
+ struct hot_info *root)
+{
+ struct hot_range_tree *hrtree = &(he->hot_range_tree);
+ struct hot_range_item *hr = NULL;
+ u64 start_off = off & RANGE_SIZE_MASK;
+ u64 end_off = (off + len - 1) & RANGE_SIZE_MASK;
+ u64 cur;
+ int ret = true;
+
+ if (len == 0)
+ return false;
+
+ /*
+ * Align ranges on RANGE_SIZE boundary to prevent proliferation
+ * of range structs
+ */
+ for (cur = start_off; cur <= end_off; cur += RANGE_SIZE) {
+ read_lock(&hrtree->lock);
+ hr = hot_rb_lookup_hot_range_item(hrtree, cur);
+ read_unlock(&hrtree->lock);
+
+ if (!hr) {
+ hr = kmem_cache_alloc(hot_range_item_cache,
+ GFP_KERNEL | GFP_NOFS);
+ if (!hr) {
+ ret = false;
+ goto out;
+ }
+
+ write_lock(&hrtree->lock);
+ hot_rb_range_item_init(hr);
+ hr->start = cur & RANGE_SIZE_MASK;
+ hr->len = RANGE_SIZE;
+ hr->hot_inode = he;
+ hot_rb_add_hot_range_item(hrtree, hr);
+ write_unlock(&hrtree->lock);
+ }
+
+ spin_lock(&hr->lock);
+ hot_rb_update_freq(&hr->hot_freq_data, rw);
+ spin_unlock(&hr->lock);
+ hot_rb_free_hot_range_item(hr);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This function does the actual work of updating the frequency numbers,
+ * whatever they turn out to be. FREQ_POWER determines how many atime
+ * deltas we keep track of (as a power of 2). So, setting it to anything above
+ * 16ish is probably overkill. Also, the higher the power, the more bits get
+ * right shifted out of the timestamp, reducing precision, so take note of that
+ * as well.
+ *
+ * The caller should have already locked freq_data's parent's spinlock.
+ *
+ * FREQ_POWER, defined in hot_rb.h, determines how heavily to weight
+ * the current frequency numbers against the newest access. For example, a value
+ * of 4 means that the new access information will be weighted 1/16th (ie 2^-4)
+ * as heavily as the existing frequency info. In essence, this is a kludged-
+ * together version of a weighted average, since we can't afford to keep all of
+ * the information that it would take to get a _real_ weighted average.
+ */
+void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw)
+{
+ struct timespec old_atime;
+ struct timespec current_time;
+ struct timespec delta_ts;
+ u64 new_avg;
+ u64 new_delta;
+
+ if (unlikely(rw)) {
+ old_atime = freq_data->last_write_time;
+ freq_data->nr_writes += 1;
+ new_avg = freq_data->avg_delta_writes;
+ } else {
+ old_atime = freq_data->last_read_time;
+ freq_data->nr_reads += 1;
+ new_avg = freq_data->avg_delta_reads;
+ }
+
+ current_time = current_kernel_time();
+ delta_ts = timespec_sub(current_time, old_atime);
+ new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+ new_avg = (new_avg << FREQ_POWER) - new_avg + new_delta;
+ new_avg = new_avg >> FREQ_POWER;
+
+ if (unlikely(rw)) {
+ freq_data->last_write_time = current_time;
+ freq_data->avg_delta_writes = new_avg;
+ } else {
+ freq_data->last_read_time = current_time;
+ freq_data->avg_delta_reads = new_avg;
+ }
+}
+
+/* main function to update access frequency from read/writepage(s) hooks */
+void hot_rb_update_freqs(struct inode *inode, u64 start,
+ u64 len, int rw)
+{
+ struct hot_inode_item *he;
+
+ he = hot_rb_update_inode_freq(inode, rw);
+
+ WARN_ON(!he);
+
+ if (he) {
+ hot_rb_update_range_freq(he, start, len,
+ rw, &(inode->i_sb->s_hotinfo));
+
+ hot_rb_free_hot_inode_item(he);
+ }
+}
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 9a68d699..4048027 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -21,10 +21,38 @@ void hot_rb_inode_tree_init(struct hot_inode_tree *tree);
/* values for hot_freq_data flags */
#define FREQ_DATA_TYPE_INODE (1 << 0) /* freq data struct is for an inode */
#define FREQ_DATA_TYPE_RANGE (1 << 1) /* freq data struct is for a range */
+/* size of sub-file ranges */
+#define RANGE_SIZE (1<<20)
+#define RANGE_SIZE_MASK (~((u64)(RANGE_SIZE - 1)))
+
+#define FREQ_POWER 4
+
+struct hot_info;
+struct inode;
void hot_rb_inode_item_init(void *_item);
void hot_rb_range_item_init(void *_item);
+struct hot_range_item
+*hot_rb_lookup_hot_range_item(struct hot_range_tree *tree,
+ u64 start);
+
+struct hot_inode_item
+*hot_rb_lookup_hot_inode_item(struct hot_inode_tree *tree,
+ unsigned long inode_num);
+
+int hot_rb_add_hot_inode_item(struct hot_inode_tree *tree,
+ struct hot_inode_item *he);
+int hot_rb_add_hot_range_item(struct hot_range_tree *tree,
+ struct hot_range_item *hr);
+
+void hot_rb_free_hot_inode_item(struct hot_inode_item *he);
+void hot_rb_free_hot_range_item(struct hot_range_item *hr);
+
int __init hot_rb_item_cache_init(void);
+void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw);
+void hot_rb_update_freqs(struct inode *inode, u64 start, u64 len,
+ int rw);
+
#endif /* __HOT_MAP__ */
--
1.7.6.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [RFC 05/11] vfs: add one new mount option -o hottrack
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
` (3 preceding siblings ...)
2012-09-11 14:27 ` [RFC 04/11] vfs: add support for updating access frequency zwu.kernel
@ 2012-09-11 14:27 ` zwu.kernel
2012-09-12 14:31 ` [RFC 00/11] VFS: hot data tracking Zhi Yong Wu
5 siblings, 0 replies; 8+ messages in thread
From: zwu.kernel @ 2012-09-11 14:27 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, Zhi Yong Wu
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Introduce one new mount option '-o hottrack',
and add its parsing support.
Its usage looks like:
mount -o hottrack
mount -o nouser,hottrack
mount -o nouser,hottrack,loop
mount -o hottrack,nouser
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
fs/hot_track.c | 34 ++++++++++++++++++++++++++++++++++
fs/hot_track.h | 1 +
fs/super.c | 5 +++++
include/linux/hot_track.h | 6 ++++++
4 files changed, 46 insertions(+), 0 deletions(-)
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 3690f26..36a41cb 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -16,6 +16,40 @@
#include "hot_track.h"
/*
+ * Regular mount options parser for the hottrack option.
+ * Returns false if hottrack is not specified;
+ * otherwise returns true, and hottrack is
+ * removed from options.
+ */
+bool hot_track_parse_options(char *options)
+{
+ long len;
+ char *p;
+ static char opts_hot[] = "hottrack";
+
+ if (!options)
+ return false;
+
+ p = strstr(options, opts_hot);
+ if (!p)
+ return false;
+
+ while (p) {
+ len = options + strlen(options) - (p + strlen(opts_hot));
+ if (len == 0) {
+ options[0] = '\0';
+ break;
+ }
+
+ memmove(p, p + strlen(opts_hot) + 1, len);
+ p = strstr(options, opts_hot);
+ }
+
+ printk(KERN_INFO "vfs: turning on hot data tracking\n");
+ return true;
+}
+
+/*
* Initialize hot_inode_item, hot_range_item
* and hot_hash_node kmem cache
*/
diff --git a/fs/hot_track.h b/fs/hot_track.h
index cf4cf35..dc0f5a2 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -15,6 +15,7 @@
#include "hot_rb.h"
+bool hot_track_parse_options(char *options);
void __init hot_track_item_cache_init(void);
#endif /* __HOT_TRACK__ */
diff --git a/fs/super.c b/fs/super.c
index 0902cfa..d5bc781 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -35,6 +35,7 @@
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include "internal.h"
+#include "hot_track.h"
LIST_HEAD(super_blocks);
@@ -1125,6 +1126,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
struct dentry *root;
struct super_block *sb;
char *secdata = NULL;
+ bool hottrack = false;
int error = -ENOMEM;
if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
@@ -1137,6 +1139,9 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
goto out_free_secdata;
}
+ if (data && hot_track_parse_options(data))
+ hottrack = true;
+
root = type->mount(type, flags, name, data);
if (IS_ERR(root)) {
error = PTR_ERR(root);
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 2256496..b56a467 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -20,6 +20,11 @@
#include <linux/rbtree.h>
#include <linux/kref.h>
+/*
+ * Flags for hot data tracking mount options.
+ */
+#define HOT_MOUNT_HOT_TRACK (1 << 0)
+
/* A tree that sits on the hot_info */
struct hot_inode_tree {
struct rb_root map;
@@ -89,6 +94,7 @@ struct hot_range_item {
};
struct hot_info {
+ unsigned long mount_opt;
/* red-black tree that keeps track of fs-wide hot data */
struct hot_inode_tree hot_inode_tree;
--
1.7.6.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [RFC 00/11] VFS: hot data tracking
2012-09-11 14:27 [RFC 00/11] VFS: hot data tracking zwu.kernel
` (4 preceding siblings ...)
2012-09-11 14:27 ` [RFC 05/11] vfs: add one new mount option -o hottrack zwu.kernel
@ 2012-09-12 14:31 ` Zhi Yong Wu
2012-09-14 7:35 ` Zhi Yong Wu
5 siblings, 1 reply; 8+ messages in thread
From: Zhi Yong Wu @ 2012-09-12 14:31 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, tytso, Zhi Yong Wu
Sorry, I forgot to CC Ted.
On Tue, Sep 11, 2012 at 10:27 PM, <zwu.kernel@gmail.com> wrote:
> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>
> HI, folks
> I have pushed the patchset to my kernel dev git tree:
> git@github.com:wuzhy/kernel.git
>
> Also, you can review it via
> https://github.com/wuzhy/kernel/commits/hottrack
>
> NOTE:
>
> The patchset still needs a lot of bug fixes and cleanup. It is posted
> mainly to make sure it is going in the correct direction, and in the
> hope of getting some helpful comments from others.
>
> TODO List:
>
> 1.) Need to do scalability or performance tests.
> 2.) Fix up bugs.
> 3.) Strictly split this patchset to keep them in order
> This patchset is in RFC state, i haven't strictly split it
> When it is in PATCH state, i will strictly split it and let
> them in order.
> 4.) Turn some macros into tunables
> TIME_TO_KICK, and HEAT_UPDATE_DELAY
> 5.) Refactor hot_hash_is_aging()
> If you just made the timeout value a timespec and compared
> the _timespecs_, you would be doing a lot fewer conversions.
> 6.) Cleanup some unnecessary lock protect
> 7.) Add more comments to explain how to calc temperature
>
> Ben Chociej, Matt Lupfer and Conor Scott originally wrote this code to
> be very btrfs-specific. I've taken their code and attempted to
> make it more generic and integrate it at the VFS level.
>
> INTRODUCTION:
>
> Essentially, this means maintaining some key stats
> (like number of reads/writes, last read/write time, frequency of
> reads/writes), then distilling those numbers down to a single
> "temperature" value that reflects what data is "hot," and using that
> temperature to move data to SSDs.
>
> The long-term goal of these patches is to allow some FSs,
> e.g. Btrfs to intelligently utilize SSDs in a heterogenous volume.
> Incidentally, this project has been motivated by
> the Project Ideas page on the Btrfs wiki.
>
> Of course, users are warned not to run this code outside of development
> environments. These patches are EXPERIMENTAL, and as such they might eat
> your data and/or memory. That said, the code should be relatively safe
> when the hottrack mount option is disabled.
>
> MOTIVATION:
>
> The overall goal of enabling hot data relocation to SSD has been
> motivated by the Project Ideas page on the Btrfs wiki at
> <https://btrfs.wiki.kernel.org/index.php/Project_ideas>.
> It will be divided into two steps: the VFS provides the hot data tracking
> function, while each specific FS will provide the hot data relocation function.
> So as the first step of this goal, it is hoped that the patchset
> for hot data tracking will eventually mature into VFS.
>
> This is essentially the traditional cache argument: SSD is fast and
> expensive; HDD is cheap but slow. ZFS, for example, can already take
> advantage of SSD caching. Btrfs should also be able to take advantage of
> hybrid storage without many broad, sweeping changes to existing code.
>
> SUMMARY:
>
> - Hooks in existing vfs functions to track data access frequency
>
> - New rbtrees for tracking access frequency of inodes and sub-file
> ranges (hot_rb.c)
> The relationship between super_block and rbtree is as below:
> super_block->s_hotinfo.hot_inode_tree
> In include/linux/fs.h, one struct hot_info s_hotinfo is added to
> super_block struct. Each FS instance can find hot tracking info
> s_hotinfo via its super_block. This hot_info stores a lot of hot
> tracking info such as hot_inode_tree, inode and range hash list, etc.
>
> - A hash list for indexing data by its temperature (hot_hash.c)
>
> - A debugfs interface for dumping data from the rbtrees (hot_debugfs.c)
>
> - A background kthread for updating inode heat info
>
> - Mount options for enabling temperature tracking (-o hottrack; disabled by default)
> (hot_track.c)
>
> - An ioctl to retrieve the frequency information collected for a certain
> file
>
> - Ioctls to enable/disable frequency tracking per inode.
>
> Usage syntax:
>
> root@debian-i386:~# mount -o hottrack /dev/sdb /mnt
> [ 1505.894078] device label test devid 1 transid 29 /dev/sdb
> [ 1505.952977] btrfs: disk space caching is enabled
> [ 1506.069678] vfs: turning on hot data tracking
> root@debian-i386:~# mount -t debugfs none /sys/kernel/debug
> root@debian-i386:~# ls -l /sys/kernel/debug/vfs_hotdata/
> total 0
> drwxr-xr-x 2 root root 0 Aug 8 04:40 sdb
> root@debian-i386:~# ls -l /sys/kernel/debug/vfs_hotdata/sdb
> total 0
> -rw-r--r-- 1 root root 0 Aug 8 04:40 inode_data
> -rw-r--r-- 1 root root 0 Aug 8 04:40 range_data
> root@debian-i386:~# vi /mnt/file
> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/inode_data
> inode #279, reads 0, writes 1, avg read time 18446744073709551615,
> avg write time 5251566408153596, temp 109
> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/range_data
> inode #279, range start 0 (range len 1048576) reads 0, writes 1,
> avg read time 18446744073709551615, avg write time 1128690176623144209, temp 64
> root@debian-i386:~# echo "hot data tracking test" >> /mnt/file
> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/inode_data
> inode #279, reads 0, writes 2, avg read time 18446744073709551615,
> avg write time 4923343766042451, temp 109
> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/range_data
> inode #279, range start 0 (range len 1048576) reads 0, writes 2,
> avg read time 18446744073709551615, avg write time 1058147040842596150, temp 64
> root@debian-i386:~#
>
> Zhi Yong Wu (11):
> vfs: introduce one structure hot_info
> vfs: introduce one rb tree - hot_inode_tree
> vfs: introduce 2 rb tree items - inode and range
> vfs: add support for updating access frequency
> vfs: add one new mount option -o hottrack
> vfs: add init and exit support
> vfs: introduce one hash table
> vfs: enable hot data tracking
> vfs: fork one private kthread to update temperature info
> vfs: add 3 new ioctl interfaces
> vfs: add debugfs support
>
> fs/Makefile | 3 +-
> fs/compat_ioctl.c | 8 +
> fs/dcache.c | 2 +
> fs/direct-io.c | 10 +
> fs/hot_debugfs.c | 488 ++++++++++++++++++++++++++++++++++
> fs/hot_debugfs.h | 60 +++++
> fs/hot_hash.c | 382 ++++++++++++++++++++++++++
> fs/hot_hash.h | 112 ++++++++
> fs/hot_rb.c | 648 +++++++++++++++++++++++++++++++++++++++++++++
> fs/hot_rb.h | 81 ++++++
> fs/hot_track.c | 85 ++++++
> fs/hot_track.h | 23 ++
> fs/ioctl.c | 132 +++++++++
> fs/namespace.c | 10 +
> fs/super.c | 11 +
> include/linux/fs.h | 15 +
> include/linux/hot_track.h | 169 ++++++++++++
> mm/filemap.c | 8 +
> mm/page-writeback.c | 21 ++
> mm/readahead.c | 9 +
> 20 files changed, 2276 insertions(+), 1 deletions(-)
> create mode 100644 fs/hot_debugfs.c
> create mode 100644 fs/hot_debugfs.h
> create mode 100644 fs/hot_hash.c
> create mode 100644 fs/hot_hash.h
> create mode 100644 fs/hot_rb.c
> create mode 100644 fs/hot_rb.h
> create mode 100644 fs/hot_track.c
> create mode 100644 fs/hot_track.h
> create mode 100644 include/linux/hot_track.h
>
> --
> 1.7.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Regards,
Zhi Yong Wu
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC 00/11] VFS: hot data tracking
2012-09-12 14:31 ` [RFC 00/11] VFS: hot data tracking Zhi Yong Wu
@ 2012-09-14 7:35 ` Zhi Yong Wu
0 siblings, 0 replies; 8+ messages in thread
From: Zhi Yong Wu @ 2012-09-14 7:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
aneesh.kumar, tytso, Zhi Yong Wu
hi, all maintainers.
ping? any comments are appreciated, thanks.
On Wed, Sep 12, 2012 at 10:31 PM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> Sorry, I forgot to CC Ted.
>
> On Tue, Sep 11, 2012 at 10:27 PM, <zwu.kernel@gmail.com> wrote:
>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>
>> HI, folks
>> I have pushed the patchset to my kernel dev git tree:
>> git@github.com:wuzhy/kernel.git
>>
>> Also, you can review it via
>> https://github.com/wuzhy/kernel/commits/hottrack
>>
>> NOTE:
>>
>> The patchset still needs a lot of bug fixes and cleanup. It is posted
>> mainly to make sure it is going in the correct direction, and in the
>> hope of getting some helpful comments from others.
>>
>> TODO List:
>>
>> 1.) Need to do scalability or performance tests.
>> 2.) Fix up bugs.
>> 3.) Strictly split this patchset to keep them in order
>> This patchset is in RFC state, i haven't strictly split it
>> When it is in PATCH state, i will strictly split it and let
>> them in order.
>> 4.) Turn some macros into tunables
>> TIME_TO_KICK, and HEAT_UPDATE_DELAY
>> 5.) Refactor hot_hash_is_aging()
>> If you just made the timeout value a timespec and compared
>> the _timespecs_, you would be doing a lot fewer conversions.
>> 6.) Cleanup some unnecessary lock protect
>> 7.) Add more comments to explain how to calc temperature
>>
>> Ben Chociej, Matt Lupfer and Conor Scott originally wrote this code to
>> be very btrfs-specific. I've taken their code and attempted to
>> make it more generic and integrate it at the VFS level.
>>
>> INTRODUCTION:
>>
>> Essentially, this means maintaining some key stats
>> (like number of reads/writes, last read/write time, frequency of
>> reads/writes), then distilling those numbers down to a single
>> "temperature" value that reflects what data is "hot," and using that
>> temperature to move data to SSDs.
>>
>> The long-term goal of these patches is to allow some FSs,
>> e.g. Btrfs to intelligently utilize SSDs in a heterogenous volume.
>> Incidentally, this project has been motivated by
>> the Project Ideas page on the Btrfs wiki.
>>
>> Of course, users are warned not to run this code outside of development
>> environments. These patches are EXPERIMENTAL, and as such they might eat
>> your data and/or memory. That said, the code should be relatively safe
>> when the hottrack mount option is disabled.
>>
>> MOTIVATION:
>>
>> The overall goal of enabling hot data relocation to SSD has been
>> motivated by the Project Ideas page on the Btrfs wiki at
>> <https://btrfs.wiki.kernel.org/index.php/Project_ideas>.
>> It will be divided into two steps: the VFS provides the hot data tracking
>> function, while each specific FS will provide the hot data relocation function.
>> So as the first step of this goal, it is hoped that the patchset
>> for hot data tracking will eventually mature into VFS.
>>
>> This is essentially the traditional cache argument: SSD is fast and
>> expensive; HDD is cheap but slow. ZFS, for example, can already take
>> advantage of SSD caching. Btrfs should also be able to take advantage of
>> hybrid storage without many broad, sweeping changes to existing code.
>>
>> SUMMARY:
>>
>> - Hooks in existing vfs functions to track data access frequency
>>
>> - New rbtrees for tracking access frequency of inodes and sub-file
>> ranges (hot_rb.c)
>> The relationship between super_block and rbtree is as below:
>> super_block->s_hotinfo.hot_inode_tree
>> In include/linux/fs.h, one struct hot_info s_hotinfo is added to
>> super_block struct. Each FS instance can find hot tracking info
>> s_hotinfo via its super_block. This hot_info stores a lot of hot
>> tracking info such as hot_inode_tree, inode and range hash list, etc.
>>
>> - A hash list for indexing data by its temperature (hot_hash.c)
>>
>> - A debugfs interface for dumping data from the rbtrees (hot_debugfs.c)
>>
>> - A background kthread for updating inode heat info
>>
>> - Mount options for enabling temperature tracking (-o hottrack; disabled by default)
>> (hot_track.c)
>>
>> - An ioctl to retrieve the frequency information collected for a certain
>> file
>>
>> - Ioctls to enable/disable frequency tracking per inode.
>>
>> Usage syntax:
>>
>> root@debian-i386:~# mount -o hottrack /dev/sdb /mnt
>> [ 1505.894078] device label test devid 1 transid 29 /dev/sdb
>> [ 1505.952977] btrfs: disk space caching is enabled
>> [ 1506.069678] vfs: turning on hot data tracking
>> root@debian-i386:~# mount -t debugfs none /sys/kernel/debug
>> root@debian-i386:~# ls -l /sys/kernel/debug/vfs_hotdata/
>> total 0
>> drwxr-xr-x 2 root root 0 Aug 8 04:40 sdb
>> root@debian-i386:~# ls -l /sys/kernel/debug/vfs_hotdata/sdb
>> total 0
>> -rw-r--r-- 1 root root 0 Aug 8 04:40 inode_data
>> -rw-r--r-- 1 root root 0 Aug 8 04:40 range_data
>> root@debian-i386:~# vi /mnt/file
>> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/inode_data
>> inode #279, reads 0, writes 1, avg read time 18446744073709551615,
>> avg write time 5251566408153596, temp 109
>> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/range_data
>> inode #279, range start 0 (range len 1048576) reads 0, writes 1,
>> avg read time 18446744073709551615, avg write time 1128690176623144209, temp 64
>> root@debian-i386:~# echo "hot data tracking test" >> /mnt/file
>> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/inode_data
>> inode #279, reads 0, writes 2, avg read time 18446744073709551615,
>> avg write time 4923343766042451, temp 109
>> root@debian-i386:~# cat /sys/kernel/debug/hot_track/sdb/range_data
>> inode #279, range start 0 (range len 1048576) reads 0, writes 2,
>> avg read time 18446744073709551615, avg write time 1058147040842596150, temp 64
>> root@debian-i386:~#
>>
>> Zhi Yong Wu (11):
>> vfs: introduce one structure hot_info
>> vfs: introduce one rb tree - hot_inode_tree
>> vfs: introduce 2 rb tree items - inode and range
>> vfs: add support for updating access frequency
>> vfs: add one new mount option -o hottrack
>> vfs: add init and exit support
>> vfs: introduce one hash table
>> vfs: enable hot data tracking
>> vfs: fork one private kthread to update temperature info
>> vfs: add 3 new ioctl interfaces
>> vfs: add debugfs support
>>
>> fs/Makefile | 3 +-
>> fs/compat_ioctl.c | 8 +
>> fs/dcache.c | 2 +
>> fs/direct-io.c | 10 +
>> fs/hot_debugfs.c | 488 ++++++++++++++++++++++++++++++++++
>> fs/hot_debugfs.h | 60 +++++
>> fs/hot_hash.c | 382 ++++++++++++++++++++++++++
>> fs/hot_hash.h | 112 ++++++++
>> fs/hot_rb.c | 648 +++++++++++++++++++++++++++++++++++++++++++++
>> fs/hot_rb.h | 81 ++++++
>> fs/hot_track.c | 85 ++++++
>> fs/hot_track.h | 23 ++
>> fs/ioctl.c | 132 +++++++++
>> fs/namespace.c | 10 +
>> fs/super.c | 11 +
>> include/linux/fs.h | 15 +
>> include/linux/hot_track.h | 169 ++++++++++++
>> mm/filemap.c | 8 +
>> mm/page-writeback.c | 21 ++
>> mm/readahead.c | 9 +
>> 20 files changed, 2276 insertions(+), 1 deletions(-)
>> create mode 100644 fs/hot_debugfs.c
>> create mode 100644 fs/hot_debugfs.h
>> create mode 100644 fs/hot_hash.c
>> create mode 100644 fs/hot_hash.h
>> create mode 100644 fs/hot_rb.c
>> create mode 100644 fs/hot_rb.h
>> create mode 100644 fs/hot_track.c
>> create mode 100644 fs/hot_track.h
>> create mode 100644 include/linux/hot_track.h
>>
>> --
>> 1.7.6.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Regards,
>
> Zhi Yong Wu
--
Regards,
Zhi Yong Wu
^ permalink raw reply [flat|nested] 8+ messages in thread