linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC 06/11] vfs: add init and exit support
@ 2012-09-11 14:40 zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add an initialization function to create some
key data structures when hot tracking is enabled;
clean them up when hot tracking is disabled.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_rb.c    |   45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_rb.h    |    6 ++++++
 fs/hot_track.c |   15 +++++++++++++++
 fs/hot_track.h |    2 ++
 fs/namespace.c |    4 ++++
 fs/super.c     |    6 ++++++
 6 files changed, 78 insertions(+), 0 deletions(-)

diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index 560841a..71f5e76 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -133,6 +133,33 @@ void hot_rb_free_hot_range_item(struct hot_range_item *hr)
 	}
 }
 
+/* Frees the entire hot_inode_tree. */
+void hot_rb_free_hot_inode_tree(struct hot_info *root)
+{
+	struct rb_node *node, *node2;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+
+	/* Free hot inode and range trees on fs root */
+	node = rb_first(&root->hot_inode_tree.map);
+
+	while (node) {
+		he = rb_entry(node, struct hot_inode_item, rb_node);
+
+		node2 = rb_first(&he->hot_range_tree.map);
+		while (node2) {
+			hr = rb_entry(node2, struct hot_range_item, rb_node);
+			hot_rb_remove_hot_range_item(&he->hot_range_tree, hr);
+			hot_rb_free_hot_range_item(hr);
+			node2 = rb_first(&he->hot_range_tree.map);
+		}
+
+		hot_rb_remove_hot_inode_item(&root->hot_inode_tree, he);
+		hot_rb_free_hot_inode_item(he);
+		node = rb_first(&root->hot_inode_tree.map);
+	}
+}
+
 static struct rb_node *hot_rb_insert_hot_inode_item(struct rb_root *root,
 						unsigned long inode_num,
 						struct rb_node *node)
@@ -309,6 +336,24 @@ struct hot_range_item
 	return NULL;
 }
 
+int hot_rb_remove_hot_inode_item(struct hot_inode_tree *tree,
+				struct hot_inode_item *he)
+{
+	int ret = 0;
+	rb_erase(&he->rb_node, &tree->map);
+	he->in_tree = 0;
+	return ret;
+}
+
+int hot_rb_remove_hot_range_item(struct hot_range_tree *tree,
+				struct hot_range_item *hr)
+{
+	int ret = 0;
+	rb_erase(&hr->rb_node, &tree->map);
+	hr->in_tree = 0;
+	return ret;
+}
+
 /* Update inode frequency struct */
 static struct hot_inode_item *hot_rb_update_inode_freq(struct inode *inode,
 							int rw)
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 4048027..df8cd14 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -46,8 +46,14 @@ int hot_rb_add_hot_inode_item(struct hot_inode_tree *tree,
 int hot_rb_add_hot_range_item(struct hot_range_tree *tree,
 				struct hot_range_item *hr);
 
+int hot_rb_remove_hot_inode_item(struct hot_inode_tree *tree,
+				struct hot_inode_item *he);
+int hot_rb_remove_hot_range_item(struct hot_range_tree *tree,
+				struct hot_range_item *hr);
+
 void hot_rb_free_hot_inode_item(struct hot_inode_item *he);
 void hot_rb_free_hot_range_item(struct hot_range_item *hr);
+void hot_rb_free_hot_inode_tree(struct hot_info *root);
 
 int __init hot_rb_item_cache_init(void);
 
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 36a41cb..68f85ad 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -58,3 +58,18 @@ void __init hot_track_item_cache_init(void)
 	if (hot_rb_item_cache_init())
 		return;
 }
+
+/*
+ * Initialize the data structures for hot data tracking.
+ */
+void hot_track_init(struct super_block *sb, const char *name)
+{
+	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
+	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
+}
+
+void hot_track_exit(struct super_block *sb)
+{
+	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
+	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
+}
diff --git a/fs/hot_track.h b/fs/hot_track.h
index dc0f5a2..b2f096c 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -17,5 +17,7 @@
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
+void hot_track_init(struct super_block *sb, const char *name);
+void hot_track_exit(struct super_block *sb);
 
 #endif /* __HOT_TRACK__ */
diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73..90c958a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
 #include <linux/uaccess.h>
 #include "pnode.h"
 #include "internal.h"
+#include "hot_track.h"
 
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
@@ -1215,6 +1216,9 @@ static int do_umount(struct mount *mnt, int flags)
 		return retval;
 	}
 
+	if (sb->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+		hot_track_exit(sb);
+
 	down_write(&namespace_sem);
 	br_write_lock(&vfsmount_lock);
 	event++;
diff --git a/fs/super.c b/fs/super.c
index d5bc781..eaf95fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1153,6 +1153,9 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
+	if (hottrack)
+		hot_track_init(sb, name);
+
 	error = security_sb_kern_mount(sb, flags, secdata);
 	if (error)
 		goto out_sb;
@@ -1170,6 +1173,9 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	free_secdata(secdata);
 	return root;
 out_sb:
+	if (hottrack)
+		hot_track_exit(sb);
+
 	dput(root);
 	deactivate_locked_super(sb);
 out_free_secdata:
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 07/11] vfs: introduce one hash table
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add a hash table structure which contains
many hash lists and is used to efficiently
look up the data temperature of a file or its
ranges.
  In each hash list of the hash table, the hash nodes
keep track of temperature info.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/Makefile               |    2 +-
 fs/hot_hash.c             |   66 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_hash.h             |   26 +++++++++++++++++
 fs/hot_rb.c               |   20 +++++++++++++
 fs/hot_rb.h               |    2 +
 fs/hot_track.c            |    6 ++++
 fs/hot_track.h            |    1 +
 include/linux/hot_track.h |   38 ++++++++++++++++++++++++++
 8 files changed, 160 insertions(+), 1 deletions(-)
 create mode 100644 fs/hot_hash.c
 create mode 100644 fs/hot_hash.h

diff --git a/fs/Makefile b/fs/Makefile
index b4f620e..f925a66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o \
-		hot_rb.o hot_track.o
+		hot_rb.o hot_track.o hot_hash.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/hot_hash.c b/fs/hot_hash.c
new file mode 100644
index 0000000..cae5631
--- /dev/null
+++ b/fs/hot_hash.c
@@ -0,0 +1,66 @@
+/*
+ * fs/hot_hash.c
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/fs.h>
+#include "hot_rb.h"
+#include "hot_hash.h"
+
+/* kmem_cache pointers for slab caches */
+struct kmem_cache *hot_hash_node_cache;
+
+void hot_hash_heat_node_init(void *_node)
+{
+	struct hot_hash_node *node = _node;
+
+	memset(node, 0, sizeof(*node));
+	INIT_HLIST_NODE(&node->hashnode);
+	node->hot_freq_data = NULL;
+	node->hlist = NULL;
+	spin_lock_init(&node->lock);
+	kref_init(&node->refs);
+}
+
+int __init hot_hash_node_cache_init(void)
+{
+	hot_hash_node_cache = kmem_cache_create("hot_hash_node",
+				sizeof(struct hot_hash_node),
+				0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				hot_hash_heat_node_init);
+	if (!hot_hash_node_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Initialize the rwlocks of the heat inode/range hash lists.
+ */
+void hot_hash_heat_rwlock_init(struct hot_info *root)
+{
+	int i;
+	for (i = 0; i < HEAT_HASH_SIZE; i++) {
+		rwlock_init(&root->heat_inode_hl[i].rwlock);
+		rwlock_init(&root->heat_range_hl[i].rwlock);
+	}
+}
+
diff --git a/fs/hot_hash.h b/fs/hot_hash.h
new file mode 100644
index 0000000..65abc6d
--- /dev/null
+++ b/fs/hot_hash.h
@@ -0,0 +1,26 @@
+/*
+ * fs/hot_hash.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_HASH__
+#define __HOT_HASH__
+
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/hot_track.h>
+
+void hot_hash_heat_node_init(void *_node);
+
+int __init hot_hash_node_cache_init(void);
+
+void hot_hash_heat_rwlock_init(struct hot_info *root);
+
+#endif /* __HOT_HASH__ */
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index 71f5e76..fd3b9e5 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -68,6 +68,18 @@ inode_err:
 	return -ENOMEM;
 }
 
+void hot_rb_inode_item_exit(void)
+{
+	if (hot_inode_item_cache)
+		kmem_cache_destroy(hot_inode_item_cache);
+}
+
+void hot_rb_range_item_exit(void)
+{
+	if (hot_range_item_cache)
+		kmem_cache_destroy(hot_range_item_cache);
+}
+
 /*
  * Initialize a new hot_inode_item structure. The new structure is
  * returned with a reference count of one and needs to be
@@ -80,6 +92,10 @@ void hot_rb_inode_item_init(void *_item)
 	memset(he, 0, sizeof(*he));
 	kref_init(&he->refs);
 	spin_lock_init(&he->lock);
+	he->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+					GFP_KERNEL | GFP_NOFS);
+	hot_hash_heat_node_init(he->heat_node);
+	he->heat_node->hot_freq_data = &he->hot_freq_data;
 	he->hot_freq_data.avg_delta_reads = (u64) -1;
 	he->hot_freq_data.avg_delta_writes = (u64) -1;
 	he->hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
@@ -98,6 +114,10 @@ void hot_rb_range_item_init(void *_item)
 	memset(hr, 0, sizeof(*hr));
 	kref_init(&hr->refs);
 	spin_lock_init(&hr->lock);
+	hr->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+				GFP_KERNEL | GFP_NOFS);
+	hot_hash_heat_node_init(hr->heat_node);
+	hr->heat_node->hot_freq_data = &hr->hot_freq_data;
 	hr->hot_freq_data.avg_delta_reads = (u64) -1;
 	hr->hot_freq_data.avg_delta_writes = (u64) -1;
 	hr->hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index df8cd14..193c265 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -57,6 +57,8 @@ void hot_rb_free_hot_inode_tree(struct hot_info *root);
 
 int __init hot_rb_item_cache_init(void);
 
+void hot_rb_inode_item_exit(void);
+void hot_rb_range_item_exit(void);
 void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw);
 void hot_rb_update_freqs(struct inode *inode, u64 start, u64 len,
 			int rw);
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 68f85ad..0ec8b83b 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -57,6 +57,11 @@ void __init hot_track_item_cache_init(void)
 {
 	if (hot_rb_item_cache_init())
 		return;
+
+	if (hot_hash_node_cache_init()) {
+		hot_rb_inode_item_exit();
+		hot_rb_range_item_exit();
+	}
 }
 
 /*
@@ -66,6 +71,7 @@ void hot_track_init(struct super_block *sb, const char *name)
 {
 	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
 	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
+	hot_hash_heat_rwlock_init(&sb->s_hotinfo);
 }
 
 void hot_track_exit(struct super_block *sb)
diff --git a/fs/hot_track.h b/fs/hot_track.h
index b2f096c..e137142 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -14,6 +14,7 @@
 #define __HOT_TRACK__
 
 #include "hot_rb.h"
+#include "hot_hash.h"
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index b56a467..bde61de 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -20,11 +20,17 @@
 #include <linux/rbtree.h>
 #include <linux/kref.h>
 
+#define HEAT_HASH_BITS 8
+#define HEAT_HASH_SIZE (1 << HEAT_HASH_BITS)
+
 /*
  * Flags for hot data tracking mount options.
  */
 #define HOT_MOUNT_HOT_TRACK		(1 << 0)
 
+/* kmem_cache pointers for slab caches */
+extern struct kmem_cache *hot_hash_node_cache;
+
 /* A tree that sits on the hot_info */
 struct hot_inode_tree {
 	struct rb_root map;
@@ -52,6 +58,28 @@ struct hot_freq_data {
 	u32 last_temperature;
 };
 
+/* Hash list heads for hot hash table */
+struct hot_hash_head {
+	struct hlist_head hashhead;
+	rwlock_t rwlock;
+	u32 temperature;
+};
+
+/* Nodes stored in each hash list of hash table */
+struct hot_hash_node {
+	struct hlist_node hashnode;
+	struct list_head node;
+	struct hot_freq_data *hot_freq_data;
+	struct hot_hash_head *hlist;
+	spinlock_t lock; /* protects hlist */
+
+	/*
+	 * number of references to this node
+	 * equals 1 (hashlist entry)
+	 */
+	struct kref refs;
+};
+
 /* An item representing an inode and its access frequency */
 struct hot_inode_item {
 	/* node for hot_inode_tree rb_tree */
@@ -68,6 +96,8 @@ struct hot_inode_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this inode */
+	struct hot_hash_node *heat_node;
 };
 
 /*
@@ -91,6 +121,8 @@ struct hot_range_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this range */
+	struct hot_hash_node *heat_node;
 };
 
 struct hot_info {
@@ -98,6 +130,12 @@ struct hot_info {
 
 	/* red-black tree that keeps track of fs-wide hot data */
 	struct hot_inode_tree hot_inode_tree;
+
+	/* hash map of inode temperature */
+	struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
+
+	/* hash map of range temperature */
+	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
 };
 
 #endif  /* _LINUX_HOTTRACK_H */
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 08/11] vfs: enable hot data tracking
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add miscellaneous features that implement hot data tracking
and generally make the hot data functions a bit more friendly.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/direct-io.c            |   10 ++++++++++
 include/linux/hot_track.h |   11 +++++++++++
 mm/filemap.c              |    8 ++++++++
 mm/page-writeback.c       |   21 +++++++++++++++++++++
 mm/readahead.c            |    9 +++++++++
 5 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720..74068e2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/hot_track.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -1297,6 +1298,15 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(iocb->ki_filp->f_mapping->host)
+			&& iov_length(iov, nr_segs) > 0) {
+		hot_rb_update_freqs(iocb->ki_filp->f_mapping->host,
+				(u64)offset,
+				(u64)iov_length(iov, nr_segs),
+				rw & WRITE);
+	}
+
 	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				     nr_segs, get_block, end_io,
 				     submit_io, flags);
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index bde61de..8bb9028 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -28,6 +28,14 @@
  */
 #define HOT_MOUNT_HOT_TRACK		(1 << 0)
 
+/* Hot data tracking -- guard macros */
+#define TRACKING_HOT_TRACK(root) \
+		(root->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+
+#define TRACK_THIS_INODE(inode) \
+		((TRACKING_HOT_TRACK(inode->i_sb)) && \
+		!(inode->i_flags & S_NOHOTDATATRACK))
+
 /* kmem_cache pointers for slab caches */
 extern struct kmem_cache *hot_hash_node_cache;
 
@@ -138,4 +146,7 @@ struct hot_info {
 	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
 };
 
+extern void hot_rb_update_freqs(struct inode *inode,
+				u64 start, u64 len, int rw);
+
 #endif  /* _LINUX_HOTTRACK_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 3843445..784d027 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/hot_track.h>
 #include "internal.h"
 
 /*
@@ -1224,6 +1225,13 @@ readpage:
 		 * PG_error will be set again if readpage fails.
 		 */
 		ClearPageError(page);
+
+		/* Hot data tracking */
+		if (TRACK_THIS_INODE(filp->f_mapping->host))
+			hot_rb_update_freqs(filp->f_mapping->host,
+				(u64)page->index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce2..4e83e68 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
 #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
 #include <linux/timer.h>
+#include <linux/hot_track.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -1895,13 +1896,33 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
+	pgoff_t start = 0;
+	u64 prev_count = 0, count = 0;
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
+
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(mapping->host)
+		&& wbc->range_cyclic) {
+		start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+		prev_count = (u64)wbc->nr_to_write;
+	}
+
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
+
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(mapping->host)
+		&& wbc->range_cyclic) {
+		count = prev_count - (u64)wbc->nr_to_write;
+		if (count)
+			hot_rb_update_freqs(mapping->host, (u64)start,
+					count * PAGE_CACHE_SIZE, 1);
+	}
+
 	return ret;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa..c204f2b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/hot_track.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -138,6 +139,14 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 out:
 	blk_finish_plug(&plug);
 
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(mapping->host) && nr_pages > 0) {
+		u64 start = (u64)(list_entry(pages->prev,
+				struct page, lru)->index) << PAGE_CACHE_SHIFT;
+		hot_rb_update_freqs(mapping->host, start,
+				(u64)nr_pages * PAGE_CACHE_SIZE, 0);
+	}
+
 	return ret;
 }
 
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 09/11] vfs: fork one private kthread to update temperature info
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
  2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
  2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Fork and run one kernel thread to calculate
the temperature based on some metrics kept
in custom frequency data structs, and store
the info in the hash table.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_hash.c             |  316 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_hash.h             |   86 ++++++++++++
 fs/hot_rb.c               |  132 ++++++++++++++++++-
 fs/hot_rb.h               |   15 ++
 fs/hot_track.c            |    3 +
 include/linux/hot_track.h |    5 +
 6 files changed, 551 insertions(+), 6 deletions(-)

diff --git a/fs/hot_hash.c b/fs/hot_hash.c
index cae5631..18d53b0 100644
--- a/fs/hot_hash.c
+++ b/fs/hot_hash.c
@@ -27,6 +27,8 @@
 /* kmem_cache pointers for slab caches */
 struct kmem_cache *hot_hash_node_cache;
 
+struct task_struct *hot_data_update_kthread;
+
 void hot_hash_heat_node_init(void *_node)
 {
 	struct hot_hash_node *node = _node;
@@ -64,3 +66,317 @@ void hot_hash_heat_rwlock_init(struct hot_info *root)
 	}
 }
 
+static void hot_hash_free_heat_node(struct hlist_head *head)
+{
+	struct hlist_node *pos = NULL, *pos2 = NULL;
+	struct hot_hash_node *heatnode = NULL;
+
+	hlist_for_each_safe(pos, pos2, head) {
+			heatnode = hlist_entry(pos,
+					struct hot_hash_node,
+					hashnode);
+			hlist_del(pos);
+			kfree(heatnode);
+	}
+
+}
+
+void hot_hash_free_heat_hash_list(struct hot_info *root)
+{
+	int i;
+
+	/* Free node/range heat hash lists */
+	for (i = 0; i < HEAT_HASH_SIZE; i++) {
+		hot_hash_free_heat_node(&root->heat_inode_hl[i].hashhead);
+		hot_hash_free_heat_node(&root->heat_range_hl[i].hashhead);
+	}
+}
+
+static u64 hot_hash_shift(u64 counter, u32 bits, bool shift_dir)
+{
+	if (shift_dir)
+		return counter << bits;
+	else
+		return counter >> bits;
+}
+
+/*
+ * hot_hash_calc_temperature() is responsible for distilling the six heat
+ * criteria (which are described in detail in hot_hash.h) down into a single
+ * temperature value for the data, which is an integer between 0
+ * and HEAT_MAX_VALUE.
+ *
+ * To accomplish this, the raw values from the hot_freq_data structure
+ * are shifted various ways in order to make the temperature calculation more
+ * or less sensitive to each value.
+ *
+ * Once this calibration has happened, we do some additional normalization and
+ * make sure that everything fits nicely in a u32. From there, we take a very
+ * rudimentary kind of "average" of each of the values, where the *_COEFF_POWER
+ * values act as weights for the average.
+ *
+ * Finally, we use the HEAT_HASH_BITS value, which determines the size of the
+ * heat hash list, to normalize the temperature to the proper granularity.
+ */
+int hot_hash_calc_temperature(struct hot_freq_data *freq_data)
+{
+	u64 result = 0;
+
+	struct timespec ckt = current_kernel_time();
+	u64 cur_time = timespec_to_ns(&ckt);
+
+	u32 nrr_heat = (u32)hot_hash_shift((u64)freq_data->nr_reads,
+					NRR_MULTIPLIER_POWER, true);
+	u32 nrw_heat = (u32)hot_hash_shift((u64)freq_data->nr_writes,
+					NRW_MULTIPLIER_POWER, true);
+
+	u64 ltr_heat =
+	hot_hash_shift((cur_time - timespec_to_ns(&freq_data->last_read_time)),
+			LTR_DIVIDER_POWER, false);
+	u64 ltw_heat =
+	hot_hash_shift((cur_time - timespec_to_ns(&freq_data->last_write_time)),
+			LTW_DIVIDER_POWER, false);
+
+	u64 avr_heat =
+		hot_hash_shift((((u64) -1) - freq_data->avg_delta_reads),
+			AVR_DIVIDER_POWER, false);
+	u64 avw_heat =
+		hot_hash_shift((((u64) -1) - freq_data->avg_delta_writes),
+			AVW_DIVIDER_POWER, false);
+
+	/* ltr_heat is now guaranteed to be u32 safe */
+	if (ltr_heat >= hot_hash_shift((u64) 1, 32, true))
+		ltr_heat = 0;
+	else
+		ltr_heat = hot_hash_shift((u64) 1, 32, true) - ltr_heat;
+
+	/* ltw_heat is now guaranteed to be u32 safe */
+	if (ltw_heat >= hot_hash_shift((u64) 1, 32, true))
+		ltw_heat = 0;
+	else
+		ltw_heat = hot_hash_shift((u64) 1, 32, true) - ltw_heat;
+
+	/* avr_heat is now guaranteed to be u32 safe */
+	if (avr_heat >= hot_hash_shift((u64) 1, 32, true))
+		avr_heat = (u32) -1;
+
+	/* avw_heat is now guaranteed to be u32 safe */
+	if (avw_heat >= hot_hash_shift((u64) 1, 32, true))
+		avw_heat = (u32) -1;
+
+	nrr_heat = (u32)hot_hash_shift((u64)nrr_heat,
+				(3 - NRR_COEFF_POWER), false);
+	nrw_heat = (u32)hot_hash_shift((u64)nrw_heat,
+				(3 - NRW_COEFF_POWER), false);
+	ltr_heat = hot_hash_shift(ltr_heat, (3 - LTR_COEFF_POWER), false);
+	ltw_heat = hot_hash_shift(ltw_heat, (3 - LTW_COEFF_POWER), false);
+	avr_heat = hot_hash_shift(avr_heat, (3 - AVR_COEFF_POWER), false);
+	avw_heat = hot_hash_shift(avw_heat, (3 - AVW_COEFF_POWER), false);
+
+	result = nrr_heat + nrw_heat + (u32) ltr_heat +
+			(u32) ltw_heat + (u32) avr_heat + (u32) avw_heat;
+
+	return result >> (32 - HEAT_HASH_BITS);
+}
+
+int hot_hash_is_aging(struct hot_freq_data *freq_data)
+{
+	int ret = 0;
+	struct timespec ckt = current_kernel_time();
+
+	u64 cur_time = timespec_to_ns(&ckt);
+	u64 last_read_ns =
+		(cur_time - timespec_to_ns(&freq_data->last_read_time));
+	u64 last_write_ns =
+		(cur_time - timespec_to_ns(&freq_data->last_write_time));
+	u64 kick_ns = TIME_TO_KICK * (u64)1000000000;
+
+	if ((last_read_ns > kick_ns) && (last_write_ns > kick_ns))
+		ret = 1;
+
+	return ret;
+}
+
+/*
+ * Calc a new temperature and, if necessary, move the heat_node corresponding
+ * to this inode or range to the proper hashlist with the new temperature
+ */
+void hot_hash_update_heat_hash_list(struct hot_freq_data *freq_data,
+			struct hot_info *root)
+{
+	int temperature = 0;
+	int moved = 0;
+	struct hot_hash_head *buckets, *current_bucket = NULL;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+
+	if (freq_data->flags & FREQ_DATA_TYPE_INODE) {
+		he = hot_freq_data_get_he(freq_data);
+		buckets = root->heat_inode_hl;
+
+		spin_lock(&he->lock);
+		temperature = hot_hash_calc_temperature(freq_data);
+		freq_data->last_temperature = temperature;
+		spin_unlock(&he->lock);
+
+		if (he == NULL)
+			return;
+
+		spin_lock(&he->heat_node->lock);
+		if (he->heat_node->hlist == NULL) {
+			current_bucket = buckets + temperature;
+			moved = 1;
+		} else {
+			write_lock(&he->heat_node->hlist->rwlock);
+			current_bucket = he->heat_node->hlist;
+			if (current_bucket->temperature != temperature) {
+				hlist_del(&he->heat_node->hashnode);
+				current_bucket = buckets + temperature;
+				moved = 1;
+			}
+			write_unlock(&he->heat_node->hlist->rwlock);
+		}
+
+		if (moved) {
+			write_lock(&current_bucket->rwlock);
+			hlist_add_head(&he->heat_node->hashnode,
+					&current_bucket->hashhead);
+			he->heat_node->hlist = current_bucket;
+			write_unlock(&current_bucket->rwlock);
+		}
+		spin_unlock(&he->heat_node->lock);
+	} else if (freq_data->flags & FREQ_DATA_TYPE_RANGE) {
+		hr = hot_freq_data_get_hr(freq_data);
+		buckets = root->heat_range_hl;
+
+		spin_lock(&hr->lock);
+		temperature = hot_hash_calc_temperature(freq_data);
+		freq_data->last_temperature = temperature;
+		spin_unlock(&hr->lock);
+
+		if (hr == NULL)
+			return;
+
+		spin_lock(&hr->heat_node->lock);
+		if (hr->heat_node->hlist == NULL) {
+			current_bucket = buckets + temperature;
+			moved = 1;
+		} else {
+			write_lock(&hr->heat_node->hlist->rwlock);
+			current_bucket = hr->heat_node->hlist;
+			if (current_bucket->temperature != temperature) {
+				hlist_del(&hr->heat_node->hashnode);
+				current_bucket = buckets + temperature;
+				moved = 1;
+			}
+			write_unlock(&hr->heat_node->hlist->rwlock);
+		}
+
+		if (moved) {
+			write_lock(&current_bucket->rwlock);
+			hlist_add_head(&hr->heat_node->hashnode,
+					&current_bucket->hashhead);
+			hr->heat_node->hlist = current_bucket;
+			write_unlock(&current_bucket->rwlock);
+		}
+		spin_unlock(&hr->heat_node->lock);
+	}
+}
+
+/*
+ * Update temperatures for each hot inode item and
+ * hot range item for aging purposes
+ */
+static void hot_hash_iterate_and_update_heat(struct hot_info *root)
+{
+	struct hot_inode_item *current_hot_inode;
+	struct hot_inode_tree *hot_inode_tree;
+	unsigned long inode_num;
+
+	hot_inode_tree = &root->hot_inode_tree;
+
+	/* walk the inode tree */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+	while (current_hot_inode) {
+		hot_hash_update_heat_hash_list(
+			&current_hot_inode->hot_freq_data, root);
+		hot_rb_update_range_data(current_hot_inode, root);
+		inode_num = current_hot_inode->i_ino;
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+							inode_num + 1);
+	}
+}
+
+/* Determine whether hot data tracking is enabled on any super block */
+static bool hot_hash_global_hot_track(void)
+{
+	struct super_block *sb;
+	bool ret = false;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		if (sb->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+			ret = true;
+	}
+	spin_unlock(&sb_lock);
+
+	return ret;
+}
+
+/*
+ * kthread iterates over each hot_inode_item and hot_range_item
+ * and updates their temperatures so they are moved within the heat
+ * hash table for purposes such as relocation and hot file detection
+ */
+static int hot_hash_update_temperature_kthread(void *arg)
+{
+	struct super_block *sb;
+	struct hot_info *root;
+	unsigned long delay;
+
+	do {
+		spin_lock(&sb_lock);
+		list_for_each_entry(sb, &super_blocks, s_list) {
+			if (hlist_unhashed(&sb->s_instances))
+				continue;
+			delay = HZ * HEAT_UPDATE_DELAY;
+			root = &sb->s_hotinfo;
+			if (mutex_trylock(
+				&root->hot_data_update_kthread_mutex)) {
+				hot_hash_iterate_and_update_heat(root);
+				mutex_unlock(
+					&root->hot_data_update_kthread_mutex);
+			}
+			if (unlikely(freezing(current))) {
+				__refrigerator(true);
+			} else {
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (!kthread_should_stop()) {
+					spin_unlock(&sb_lock);
+					schedule_timeout(delay);
+					spin_lock(&sb_lock);
+				}
+				__set_current_state(TASK_RUNNING);
+			}
+		}
+		spin_unlock(&sb_lock);
+	} while (!kthread_should_stop() || !hot_hash_global_hot_track());
+
+	return 0;
+}
+
+/* Fork the kthread to do temperature updates for all filesystems */
+void hot_hash_fork_update_temperature_kthread()
+{
+	if (hot_data_update_kthread)
+		return;
+
+	hot_data_update_kthread =
+		kthread_run(hot_hash_update_temperature_kthread, NULL,
+					"update_hot_temperature_kthread");
+	if (IS_ERR(hot_data_update_kthread))
+		kthread_stop(hot_data_update_kthread);
+}
diff --git a/fs/hot_hash.h b/fs/hot_hash.h
index 65abc6d..9cb89e9 100644
--- a/fs/hot_hash.h
+++ b/fs/hot_hash.h
@@ -17,10 +17,96 @@
 #include <linux/hash.h>
 #include <linux/hot_track.h>
 
+/* time to quit keeping track of tracking data (seconds)*/
+#define TIME_TO_KICK 400
+
+/* set how often to update temps (seconds) */
+#define HEAT_UPDATE_DELAY 400
+
+/*
+ * The following comments explain what exactly comprises a unit of heat.
+ *
+ * Each of six values of heat are calculated and combined in order to form an
+ * overall temperature for the data:
+ *
+ * NRR - number of reads since mount
+ * NRW - number of writes since mount
+ * LTR - time elapsed since last read (ns)
+ * LTW - time elapsed since last write (ns)
+ * AVR - average delta between recent reads (ns)
+ * AVW - average delta between recent writes (ns)
+ *
+ * These values are divided (right-shifted) according to the *_DIVIDER_POWER
+ * values defined below to bring the numbers into a reasonable range. You can
+ * modify these values to fit your needs. However, each heat unit is a u32 and
+ * thus maxes out at 2^32 - 1. Therefore, you must choose your dividers quite
+ * carefully or else they could max out or be stuck at zero quite easily.
+ *
+ * (E.g., if you chose AVR_DIVIDER_POWER = 0, nothing less than 4s of atime
+ * delta would bring the temperature above zero, ever.)
+ *
+ * Finally, each value is added to the overall temperature between 0 and 8
+ * times, depending on its *_COEFF_POWER value. Note that the coefficients are
+ * also actually implemented with shifts, so take care to treat these values
+ * as powers of 2. (I.e., 0 means we'll add it to the temp once; 1 = 2x, etc.)
+ */
+
+/* NRR/NRW heat unit = 2^X accesses */
+#define NRR_MULTIPLIER_POWER 20
+#define NRR_COEFF_POWER 0
+#define NRW_MULTIPLIER_POWER 20
+#define NRW_COEFF_POWER 0
+
+/* LTR/LTW heat unit = 2^X ns of age */
+#define LTR_DIVIDER_POWER 30
+#define LTR_COEFF_POWER 1
+#define LTW_DIVIDER_POWER 30
+#define LTW_COEFF_POWER 1
+
+/*
+ * AVR/AVW cold unit = 2^X ns of average delta
+ * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit
+ *
+ * E.g., data with an average delta between 0 and 2^X ns will have a cold value
+ * of 0, which means a heat value equal to HEAT_MAX_VALUE.
+ */
+#define AVR_DIVIDER_POWER 40
+#define AVR_COEFF_POWER 0
+#define AVW_DIVIDER_POWER 40
+#define AVW_COEFF_POWER 0
+
+/* macros to wrap container_of()'s for hot data structs */
+#define hot_freq_data_get_he(x) \
+        ((struct hot_inode_item *) container_of(x, \
+        struct hot_inode_item, hot_freq_data))
+#define hot_freq_data_get_hr(x) \
+        ((struct hot_range_item *) container_of(x, \
+        struct hot_range_item, hot_freq_data))
+
+struct hot_info;
+
 void hot_hash_heat_node_init(void *_node);
 
 int __init hot_hash_node_cache_init(void);
 
 void hot_hash_heat_rwlock_init(struct hot_info *root);
+void hot_hash_free_heat_hash_list(struct hot_info *root);
+
+/*
+ * Returns a value from 0 to HEAT_MAX_VALUE indicating the temperature of the
+ * file (and consequently its bucket number in hash list) (see hot_hash.c)
+ */
+int hot_hash_calc_temperature(struct hot_freq_data *freq_data);
+
+int hot_hash_is_aging(struct hot_freq_data *freq_data);
+
+void hot_hash_update_heat_hash_list(struct hot_freq_data *freq_data,
+                        struct hot_info *root);
+/*
+ * Start the global temperature-update kthread if it is not already
+ * running. The thread periodically goes through hot inodes and hot
+ * ranges and ages them based on frequency of access.
+ */
+void hot_hash_fork_update_temperature_kthread(void);
 
 #endif /* __HOT_HASH__ */
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index fd3b9e5..37d3771 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -399,9 +399,13 @@ static struct hot_inode_item *hot_rb_update_inode_freq(struct inode *inode,
 		write_unlock(&hitree->lock);
 	}
 
-	spin_lock(&he->lock);
-	hot_rb_update_freq(&he->hot_freq_data, rw);
-	spin_unlock(&he->lock);
+	if (!hot_data_update_kthread
+		|| hot_data_update_kthread->pid != current->pid) {
+		spin_lock(&he->lock);
+		hot_rb_update_freq(&he->hot_freq_data, rw);
+		spin_unlock(&he->lock);
+		hot_hash_update_heat_hash_list(&he->hot_freq_data, root);
+	}
 
 out:
 	return he;
@@ -448,9 +452,14 @@ static bool hot_rb_update_range_freq(struct hot_inode_item *he,
 			write_unlock(&hrtree->lock);
 		}
 
-		spin_lock(&hr->lock);
-		hot_rb_update_freq(&hr->hot_freq_data, rw);
-		spin_unlock(&hr->lock);
+		if (!hot_data_update_kthread
+			|| hot_data_update_kthread->pid != current->pid) {
+			spin_lock(&hr->lock);
+			hot_rb_update_freq(&hr->hot_freq_data, rw);
+			spin_unlock(&hr->lock);
+			hot_hash_update_heat_hash_list(&hr->hot_freq_data, root);
+		}
+
 		hot_rb_free_hot_range_item(hr);
 	}
 
@@ -509,6 +518,57 @@ void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw)
 	}
 }
 
+/*
+ * Look up, under the hot_inode_tree read lock, the item whose i_ino equals
+ * objectid or, failing that, the first item at or after objectid reached by
+ * walking forward with rb_next() from the last node visited.
+ *
+ * Returns the item with an extra reference taken (the caller must drop it),
+ * or NULL when no such item exists.
+ */
+struct hot_inode_item *hot_rb_find_next_hot_inode(struct hot_info *root,
+						u64 objectid)
+{
+	struct hot_inode_item *found = NULL;
+	struct hot_inode_item *cur;
+	struct rb_node *last = NULL;
+	struct rb_node *pos;
+
+	read_lock(&root->hot_inode_tree.lock);
+
+	/* binary search for an exact match, remembering the last node seen */
+	pos = root->hot_inode_tree.map.rb_node;
+	while (pos) {
+		cur = rb_entry(pos, struct hot_inode_item, rb_node);
+		last = pos;
+
+		if (objectid == cur->i_ino) {
+			found = cur;
+			break;
+		}
+		pos = (objectid < cur->i_ino) ? pos->rb_left : pos->rb_right;
+	}
+
+	/* no exact match: step forward until an item is not below objectid */
+	if (!found) {
+		for (pos = last; pos; pos = rb_next(pos)) {
+			cur = rb_entry(pos, struct hot_inode_item, rb_node);
+			if (objectid <= cur->i_ino) {
+				found = cur;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * increase reference count to prevent pruning while
+	 * caller is using the hot_inode_item
+	 */
+	if (found)
+		kref_get(&found->refs);
+
+	read_unlock(&root->hot_inode_tree.lock);
+	return found;
+}
+
 /* main function to update access frequency from read/writepage(s) hooks */
 void hot_rb_update_freqs(struct inode *inode, u64 start,
 			u64 len, int rw)
@@ -526,3 +586,63 @@ void hot_rb_update_freqs(struct inode *inode, u64 start,
 		hot_rb_free_hot_inode_item(he);
 	}
 }
+
+/*
+ * Take a hot range that is now cold, remove it from the indexes and clean
+ * up any associated memory: the range is removed from the inode's range
+ * rb tree and from its heat hash list, then its heat node and the range
+ * item itself are released.
+ *
+ * @hot_inode: inode item whose hot_range_tree contains @hr
+ * @hr:        range item to remove
+ * @root:      not used by this function
+ */
+static void hot_rb_remove_range_data(struct hot_inode_item *hot_inode,
+			struct hot_range_item *hr,
+			struct hot_info *root)
+{
+	/* remove range from rb tree */
+	hot_rb_remove_hot_range_item(&hot_inode->hot_range_tree, hr);
+
+	/* remove range from hash list (node lock first, then list rwlock) */
+	spin_lock(&hr->heat_node->lock);
+	write_lock(&hr->heat_node->hlist->rwlock);
+	hlist_del(&hr->heat_node->hashnode);
+	write_unlock(&hr->heat_node->hlist->rwlock);
+	spin_unlock(&hr->heat_node->lock);
+
+	/*
+	 * free up memory
+	 * NOTE(review): kfree() is used on heat_node; if heat nodes come
+	 * from hot_hash_node_cache this should be kmem_cache_free() - verify
+	 * against the allocation site.
+	 */
+	kfree(hr->heat_node);
+	hot_rb_free_hot_range_item(hr);
+}
+
+/*
+ * Update temperatures for each range item of an inode for aging purposes.
+ *
+ * Walks the inode's hot_range_tree with the tree write lock held.  Each
+ * range is first passed to hot_hash_update_heat_hash_list(); ranges that
+ * hot_hash_is_aging() reports as aging and whose heat node refcount is at
+ * most 1 are then removed via hot_rb_remove_range_data().
+ *
+ * @hot_inode: inode item whose ranges are updated
+ * @root:      hot_info handed on to the hash-list update and removal
+ */
+void hot_rb_update_range_data(struct hot_inode_item *hot_inode,
+					struct hot_info *root)
+{
+	struct hot_range_tree *inode_range_tree;
+	struct rb_node *node;
+	struct rb_node *old_node;
+	struct hot_range_item *current_range;
+	int range_is_aging;
+
+	inode_range_tree = &hot_inode->hot_range_tree;
+	write_lock(&inode_range_tree->lock);
+	node = rb_first(&inode_range_tree->map);
+	/* Walk the hot_range_tree for inode */
+	while (node) {
+		current_range = rb_entry(node, struct hot_range_item, rb_node);
+		hot_hash_update_heat_hash_list(&current_range->hot_freq_data, root);
+		/* old_node is assigned but not read again in this function */
+		old_node = node;
+		/* advance before current_range may be removed below */
+		node = rb_next(node);
+
+		spin_lock(&current_range->lock);
+		range_is_aging = hot_hash_is_aging(&current_range->hot_freq_data);
+		spin_unlock(&current_range->lock);
+
+		if (range_is_aging) {
+			/*
+			 * NOTE(review): removal happens while this tree's
+			 * write lock is held - confirm that
+			 * hot_rb_remove_hot_range_item() does not take the
+			 * same lock again.
+			 */
+			if (atomic_read(
+			&current_range->heat_node->refs.refcount) <= 1)
+				hot_rb_remove_range_data(hot_inode,
+						current_range, root);
+		}
+	}
+
+	write_unlock(&inode_range_tree->lock);
+}
+
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 193c265..298b6b4 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -59,8 +59,23 @@ int __init hot_rb_item_cache_init(void);
 
 void hot_rb_inode_item_exit(void);
 void hot_rb_range_item_exit(void);
+
+/*
+ * recalculates temperatures for inode or range
+ * and moves around in heat hash table based on temp
+ */
+void hot_rb_update_heat_hash_list(struct hot_freq_data *freq_data,
+				struct hot_info *root);
+
+struct hot_inode_item
+*hot_rb_find_next_hot_inode(struct hot_info *root,
+			u64 objectid);
 void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw);
 void hot_rb_update_freqs(struct inode *inode, u64 start, u64 len,
 			int rw);
 
+/* Update temperatures for each range item for aging purposes */
+void hot_rb_update_range_data(struct hot_inode_item *hot_inode,
+                                        struct hot_info *root);
+
 #endif /* __HOT_MAP__ */
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 0ec8b83b..be5bae4 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -72,10 +72,13 @@ void hot_track_init(struct super_block *sb, const char *name)
 	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
 	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
 	hot_hash_heat_rwlock_init(&sb->s_hotinfo);
+	hot_hash_fork_update_temperature_kthread();
+	hot_debugfs_volume_init(name, sb);
 }
 
 void hot_track_exit(struct super_block *sb)
 {
 	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
+	hot_hash_free_heat_hash_list(&sb->s_hotinfo);
 	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
 }
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 8bb9028..6b8493a 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -36,6 +36,8 @@
 		((TRACKING_HOT_TRACK(inode->i_sb)) && \
 		!(inode->i_flags & S_NOHOTDATATRACK))
 
+extern struct task_struct *hot_data_update_kthread;
+
 /* kmem_cache pointers for slab caches */
 extern struct kmem_cache *hot_hash_node_cache;
 
@@ -144,6 +146,9 @@ struct hot_info {
 
 	/* hash map of range temperature */
 	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
+
+	/* protects hot data items while being iterated and updated */
+	struct mutex hot_data_update_kthread_mutex;
 };
 
 extern void hot_rb_update_freqs(struct inode *inode,
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 10/11] vfs: add 3 new ioctl interfaces
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
                   ` (2 preceding siblings ...)
  2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  FS_IOC_GET_HEAT_INFO: return a struct containing the various
metrics collected in btrfs_freq_data structs, and also return a
calculated data temperature based on those metrics. Optionally, retrieve
the temperature from the hot data hash list instead of recalculating it.

  FS_IOC_GET_HEAT_OPTS: return an integer representing the current
state of hot data tracking and migration:

0 = do nothing
1 = track frequency of access

FS_IOC_SET_HEAT_OPTS: change the state of hot data tracking and
migration, as described above.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/compat_ioctl.c         |    8 +++
 fs/ioctl.c                |  132 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h        |   11 ++++
 include/linux/hot_track.h |   12 ++++
 4 files changed, 163 insertions(+), 0 deletions(-)

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index debdfe0..a88c7de 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1390,6 +1390,11 @@ COMPATIBLE_IOCTL(TIOCSTART)
 COMPATIBLE_IOCTL(TIOCSTOP)
 #endif
 
+/*Hot data tracking*/
+COMPATIBLE_IOCTL(FS_IOC_GET_HEAT_INFO)
+COMPATIBLE_IOCTL(FS_IOC_SET_HEAT_OPTS)
+COMPATIBLE_IOCTL(FS_IOC_GET_HEAT_OPTS)
+
 /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
    but we don't want warnings on other file systems. So declare
    them as compatible here. */
@@ -1572,6 +1577,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
+	case FS_IOC_GET_HEAT_INFO:
+	case FS_IOC_SET_HEAT_OPTS:
+	case FS_IOC_GET_HEAT_OPTS:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			break;
 		/*FALL THROUGH*/
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 29167be..9242969 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -18,6 +18,9 @@
 
 #include <asm/ioctls.h>
 
+#include "hot_hash.h"
+#include "hot_rb.h"
+
 /* So that the fiemap access checks can't overflow on 32 bit machines. */
 #define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
 
@@ -537,6 +540,126 @@ static int ioctl_fsthaw(struct file *filp)
 }
 
 /*
+ * Retrieve information about access frequency for the given file. Return it in
+ * a userspace-friendly struct for btrfsctl (or another tool) to parse.
+ *
+ * The temperature that is returned can be "live" -- that is, recalculated when
+ * the ioctl is called -- or it can be returned from the hashtable, reflecting
+ * the (possibly old) value that the system will use when considering files
+ * for migration. This behavior is determined by heat_info->live.
+ */
+/*
+ * FS_IOC_GET_HEAT_INFO handler.
+ *
+ * @file: file the ioctl was issued on; its superblock's hot_info is used
+ * @argp: user pointer to a struct heat_info; ->filename and ->live are
+ *        inputs, the remaining fields are filled in on success
+ *
+ * Returns 0 on success, -ENOMEM if the kernel copy cannot be allocated,
+ * -EFAULT if the user buffer cannot be read or written, the filp_open()
+ * error if the named file cannot be opened, or -ENODATA if no tracking
+ * data exists for that file yet.
+ */
+static int ioctl_heat_info(struct file *file, void __user *argp)
+{
+	struct inode *mnt_inode = file->f_path.dentry->d_inode;
+	struct file *file_filp;
+	struct hot_info *root = &(mnt_inode->i_sb->s_hotinfo);
+	struct heat_info *heat_info;
+	struct hot_inode_tree *hitree;
+	struct hot_inode_item *he;
+	unsigned long ino;
+	int ret = 0;
+
+	heat_info = kmalloc(sizeof(struct heat_info), GFP_KERNEL);
+	if (!heat_info)
+		return -ENOMEM;
+
+	if (copy_from_user(heat_info, argp, sizeof(struct heat_info))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* userspace is not trusted to NUL-terminate the path */
+	heat_info->filename[sizeof(heat_info->filename) - 1] = '\0';
+
+	file_filp = filp_open(heat_info->filename, O_RDONLY, 0);
+	if (IS_ERR(file_filp)) {
+		ret = PTR_ERR(file_filp);
+		goto out;
+	}
+	/* fetch the inode number while the file still pins the inode */
+	ino = file_filp->f_dentry->d_inode->i_ino;
+	filp_close(file_filp, NULL);
+
+	hitree = &root->hot_inode_tree;
+	read_lock(&hitree->lock);
+	he = hot_rb_lookup_hot_inode_item(hitree, ino);
+	read_unlock(&hitree->lock);
+	if (!he) {
+		/* we don't have any info on this file yet */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	spin_lock(&he->lock);
+	heat_info->avg_delta_reads =
+			(__u64) he->hot_freq_data.avg_delta_reads;
+	heat_info->avg_delta_writes =
+			(__u64) he->hot_freq_data.avg_delta_writes;
+	heat_info->last_read_time =
+			(__u64) timespec_to_ns(&he->hot_freq_data.last_read_time);
+	heat_info->last_write_time =
+			(__u64) timespec_to_ns(&he->hot_freq_data.last_write_time);
+	heat_info->num_reads =
+			(__u32) he->hot_freq_data.nr_reads;
+	heat_info->num_writes =
+			(__u32) he->hot_freq_data.nr_writes;
+
+	if (heat_info->live > 0) {
+		/* got a request for live temperature,
+		 * call hot_hash_calc_temperature to recalculate
+		 */
+		heat_info->temperature =
+			hot_hash_calc_temperature(&he->hot_freq_data);
+	} else {
+		/* not live temperature, get it from the hashlist */
+		read_lock(&he->heat_node->hlist->rwlock);
+		heat_info->temperature = he->heat_node->hlist->temperature;
+		read_unlock(&he->heat_node->hlist->rwlock);
+	}
+	spin_unlock(&he->lock);
+
+	/* drop the item obtained from the lookup above */
+	hot_rb_free_hot_inode_item(he);
+
+	if (copy_to_user(argp, heat_info, sizeof(struct heat_info)))
+		ret = -EFAULT;
+
+out:
+	kfree(heat_info);
+	return ret;
+}
+
+/*
+ * FS_IOC_GET_HEAT_OPTS / FS_IOC_SET_HEAT_OPTS handler.
+ *
+ * With set == 0 an int is written to argp: 1 if TRACK_THIS_INODE() holds
+ * for the inode, 0 otherwise.  With set != 0 an int is read from argp:
+ * 0 sets S_NOHOTDATATRACK on the inode, 1 clears it, anything else is
+ * rejected with -EINVAL.  Returns -EFAULT if the user copy fails.
+ */
+static int ioctl_heat_opts(struct file *file, void __user *argp, int set)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int arg;
+
+	if (!set) {
+		arg = TRACK_THIS_INODE(inode) ? 1 : 0;
+		if (copy_to_user(argp, (void *) &arg, sizeof(int)) != 0)
+			return -EFAULT;
+		return 0;
+	}
+
+	if (copy_from_user((void *) &arg, argp, sizeof(int)) != 0)
+		return -EFAULT;
+
+	if (arg == 0) {
+		/* track nothing: set S_NOHOTDATATRACK */
+		inode->i_flags |= S_NOHOTDATATRACK;
+	} else if (arg == 1) {
+		/* do tracking: clear S_NOHOTDATATRACK */
+		inode->i_flags &= ~S_NOHOTDATATRACK;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
  *
@@ -591,6 +714,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, argp);
 
+	case FS_IOC_GET_HEAT_INFO:
+		return ioctl_heat_info(filp, argp);
+
+	case FS_IOC_SET_HEAT_OPTS:
+		return ioctl_heat_opts(filp, argp, 1);
+
+	case FS_IOC_GET_HEAT_OPTS:
+		return ioctl_heat_opts(filp, argp, 0);
+
 	default:
 		if (S_ISREG(inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6229895..99698f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -256,6 +256,7 @@ struct inodes_stat_t {
 #define S_IMA		1024	/* Inode has an associated IMA struct */
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
+#define S_NOHOTDATATRACK (1 << 13)	/* hot data tracking */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -354,6 +355,16 @@ struct inodes_stat_t {
 #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
 
 /*
+ * Hot data tracking ioctls:
+ *
+ * HOT_INFO - retrieve info on frequency of access
+ */
+#define FS_IOC_GET_HEAT_INFO _IOR('f', 17, \
+				struct heat_info)
+#define FS_IOC_SET_HEAT_OPTS _IOW('f', 18, int)
+#define FS_IOC_GET_HEAT_OPTS _IOR('f', 19, int)
+
+/*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
  */
 #define	FS_SECRM_FL			0x00000001 /* Secure deletion */
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 6b8493a..152d3f6 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -68,6 +68,18 @@ struct hot_freq_data {
 	u32 last_temperature;
 };
 
+/*
+ * Argument block exchanged with userspace by FS_IOC_GET_HEAT_INFO.
+ * filename and live are inputs; the other fields are filled in by the
+ * kernel from the file's hot_freq_data.
+ */
+struct heat_info {
+	__u64 avg_delta_reads;	/* copied from hot_freq_data.avg_delta_reads */
+	__u64 avg_delta_writes;	/* copied from hot_freq_data.avg_delta_writes */
+	__u64 last_read_time;	/* last read time, converted to ns */
+	__u64 last_write_time;	/* last write time, converted to ns */
+	__u32 num_reads;	/* copied from hot_freq_data.nr_reads */
+	__u32 num_writes;	/* copied from hot_freq_data.nr_writes */
+	__u32 temperature;	/* recalculated or taken from the hash list */
+	__u8 live;		/* > 0: recalculate temperature on request */
+	char filename[PATH_MAX];	/* path of the file to query */
+};
+
 /* Hash list heads for hot hash table */
 struct hot_hash_head {
 	struct hlist_head hashhead;
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 11/11] vfs: add debugfs support
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
                   ` (3 preceding siblings ...)
  2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
volume that contains two files. The first, `inode_data', contains the
heat information for inodes that have been brought into the hot data map
structures. The second, `range_data', contains similar information for
subfile ranges.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/Makefile      |    2 +-
 fs/hot_debugfs.c |  488 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_debugfs.h |   60 +++++++
 fs/hot_track.c   |    1 +
 fs/hot_track.h   |    3 +-
 fs/namespace.c   |    6 +
 6 files changed, 557 insertions(+), 3 deletions(-)
 create mode 100644 fs/hot_debugfs.c
 create mode 100644 fs/hot_debugfs.h

diff --git a/fs/Makefile b/fs/Makefile
index f925a66..a70f288 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o \
-		hot_rb.o hot_track.o hot_hash.o
+		hot_rb.o hot_track.o hot_hash.o hot_debugfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/hot_debugfs.c b/fs/hot_debugfs.c
new file mode 100644
index 0000000..362f093
--- /dev/null
+++ b/fs/hot_debugfs.c
@@ -0,0 +1,488 @@
+/*
+ * fs/hot_debugfs.c
+ *
+ * This file contains the code to interface with the debugfs.
+ * The debugfs outputs range- and file-level access frequency
+ * statistics for each mounted volume.
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+#include "hot_debugfs.h"
+
+/* list to keep track of each mounted volumes debugfs_vol_data */
+static struct list_head hot_debugfs_vol_data_list;
+
+/* lock for debugfs_vol_data_list */
+static spinlock_t hot_debugfs_data_list_lock;
+
+/* pointer to top level debugfs dentry */
+static struct dentry *hot_debugfs_root_dentry;
+
+/*
+ * Append len bytes from msg to the volume's debugfs log, enlarging the
+ * log buffer by LOG_PAGE_SIZE when the message does not fit.
+ *
+ * Called from hot_debugfs_log() with data->log_lock held.
+ * NOTE(review): vmalloc()/vfree() are therefore reached under a spinlock;
+ * confirm whether the grow path needs to move outside the lock.
+ *
+ * Returns len on success, or 0 when there was nothing to copy or the
+ * buffer could not be grown (an out-of-memory notice is then appended to
+ * whatever room is left).
+ */
+static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	uint new_log_alloc_size;
+	char *new_log;
+	static char err_msg[] = "No more memory!\n";
+
+	if (len <= 0)
+		return 0;
+
+	if (len >= data->log_alloc_size - debugfs_log->len) {
+		/* Not enough room in the log buffer for the new message. */
+		/* Allocate a bigger buffer. */
+		new_log_alloc_size = data->log_alloc_size + LOG_PAGE_SIZE;
+		new_log = vmalloc(new_log_alloc_size);
+
+		if (new_log) {
+			memcpy(new_log, debugfs_log->str, debugfs_log->len);
+			memset(new_log + debugfs_log->len, 0,
+				new_log_alloc_size - debugfs_log->len);
+			vfree(debugfs_log->str);
+			debugfs_log->str = new_log;
+			data->log_alloc_size = new_log_alloc_size;
+		} else {
+			WARN_ON(1);
+			if (data->log_alloc_size - debugfs_log->len) {
+				strlcpy(debugfs_log->str +
+				debugfs_log->len,
+				err_msg,
+				data->log_alloc_size - debugfs_log->len);
+				debugfs_log->len +=
+				min((typeof(debugfs_log->len))
+				sizeof(err_msg),
+				((typeof(debugfs_log->len))
+				data->log_alloc_size - debugfs_log->len));
+			}
+			return 0;
+		}
+	}
+
+	/* copy the caller's message, not unconditionally the work buffer */
+	memcpy(debugfs_log->str + debugfs_log->len, msg, len);
+	debugfs_log->len += (unsigned long) len;
+
+	return len;
+}
+
+/*
+ * Format a message into data->log_work_buff and append it to the volume's
+ * debugfs log.  Messages longer than the work buffer are truncated and
+ * preceded by a notice.
+ *
+ * Returns the number of bytes written to the log for the message itself
+ * (0 if the log could not be grown), or -1 when no log buffer exists.
+ */
+static int hot_debugfs_log(struct debugfs_vol_data *data, const char *fmt, ...)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	va_list args;
+	int len;
+	static char trunc_msg[] =
+			"The next message has been truncated.\n";
+
+	if (!debugfs_log || debugfs_log->str == NULL)
+		return -1;
+
+	spin_lock(&data->log_lock);
+
+	va_start(args, fmt);
+	len = vsnprintf(data->log_work_buff,
+			sizeof(data->log_work_buff), fmt, args);
+	va_end(args);
+
+	if (len >= sizeof(data->log_work_buff)) {
+		/* leave the notice's terminating NUL out of the log text */
+		hot_debugfs_copy(data, trunc_msg, sizeof(trunc_msg) - 1);
+		/*
+		 * vsnprintf() reports the length the full message would
+		 * have had; only sizeof - 1 characters are really present
+		 * in the work buffer, so never copy more than that.
+		 */
+		len = sizeof(data->log_work_buff) - 1;
+	}
+
+	len = hot_debugfs_copy(data, data->log_work_buff, len);
+	spin_unlock(&data->log_lock);
+
+	return len;
+}
+
+/*
+ * Initialize a log corresponding to a fs volume: allocate a zeroed buffer
+ * of INIT_LOG_ALLOC_SIZE bytes and attach it to data->debugfs_log.
+ *
+ * Returns 0 on success or -ENOMEM, in which case the log is left
+ * untouched.
+ */
+static int hot_debugfs_log_init(struct debugfs_vol_data *data)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	char *buf;
+
+	/* the allocation may sleep, so do it before taking the spinlock */
+	buf = vzalloc(INIT_LOG_ALLOC_SIZE);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock(&data->log_lock);
+	debugfs_log->str = buf;
+	data->log_alloc_size = INIT_LOG_ALLOC_SIZE;
+	spin_unlock(&data->log_lock);
+
+	return 0;
+}
+
+/*
+ * Release the text buffer of a volume's log and reset the log to the
+ * empty state (NULL buffer, zero length), all under data->log_lock.
+ * The lstring itself is not freed here.
+ */
+static void hot_debugfs_log_exit(struct debugfs_vol_data *data)
+{
+	struct lstring *log = data->debugfs_log;
+
+	spin_lock(&data->log_lock);
+	vfree(log->str);
+	log->len = 0;
+	log->str = NULL;
+	spin_unlock(&data->log_lock);
+}
+
+/*
+ * debugfs open override from the fops tables: hand the inode's private
+ * pointer, when one is set, to the file so the read handlers can use it.
+ * Always succeeds.
+ */
+static int __hot_debugfs_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	if (priv)
+		file->private_data = priv;
+
+	return 0;
+}
+
+/*
+ * Append one line describing hot_range (a range of hot_inode) to the
+ * debugfs log in data.  Both item locks are held while the fields are
+ * read; root is not used.
+ */
+static void __hot_debugfs_print_range_freq_data(
+			struct hot_inode_item *hot_inode,
+			struct hot_range_item *hot_range,
+			struct debugfs_vol_data *data,
+			struct hot_info *root)
+{
+	struct hot_freq_data *freq_data = &hot_range->hot_freq_data;
+
+	/* Always lock hot_inode_item first */
+	spin_lock(&hot_inode->lock);
+	spin_lock(&hot_range->lock);
+	hot_debugfs_log(data, "inode #%lu, range start " \
+			"%llu (range len %llu) reads %u, writes %u, "
+			"avg read time %llu, avg write time %llu, temp %u\n",
+			hot_inode->i_ino,
+			hot_range->start,
+			hot_range->len,
+			freq_data->nr_reads,
+			freq_data->nr_writes,
+			freq_data->avg_delta_reads,
+			freq_data->avg_delta_writes,
+			freq_data->last_temperature);
+	spin_unlock(&hot_range->lock);
+	spin_unlock(&hot_inode->lock);
+}
+
+/*
+ * Visit every range item in the inode's hot_range_tree, in rb-tree order
+ * and under the tree's read lock, logging each one through
+ * __hot_debugfs_print_range_freq_data().
+ */
+static void __hot_debugfs_walk_range_tree(struct hot_inode_item *hot_inode,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_range_tree *ranges = &hot_inode->hot_range_tree;
+	struct rb_node *pos;
+
+	read_lock(&ranges->lock);
+	for (pos = rb_first(&ranges->map); pos; pos = rb_next(pos))
+		__hot_debugfs_print_range_freq_data(hot_inode,
+			rb_entry(pos, struct hot_range_item, rb_node),
+			data, root);
+	read_unlock(&ranges->lock);
+}
+
+/*
+ * Append one line with the inode-level frequency data of hot_inode to the
+ * debugfs log in data, reading the fields under the item's lock.
+ * root is not used.
+ */
+static void __hot_debugfs_print_inode_freq_data(
+				struct hot_inode_item *hot_inode,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_freq_data *fd;
+
+	fd = &hot_inode->hot_freq_data;
+
+	spin_lock(&hot_inode->lock);
+	hot_debugfs_log(data,
+		"inode #%lu, reads %u, writes %u, "
+		"avg read time %llu, avg write time %llu, temp %u\n",
+		hot_inode->i_ino, fd->nr_reads, fd->nr_writes,
+		fd->avg_delta_reads, fd->avg_delta_writes,
+		fd->last_temperature);
+	spin_unlock(&hot_inode->lock);
+}
+
+/*
+ * debugfs read override for the range_data file.
+ *
+ * On the first read (*ppos == 0) the per-volume log is created if needed
+ * and filled by walking every tracked inode and its ranges; the log text
+ * is then copied to userspace.  A read at or beyond the end of the log
+ * frees the log and returns 0 (EOF).
+ *
+ * Returns the number of bytes copied, 0 at EOF, or a negative errno
+ * (-ENOMEM when the log cannot be allocated).
+ */
+static ssize_t __hot_debugfs_range_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	ssize_t err = 0;
+	struct hot_info *root;
+	struct hot_inode_item *current_hot_inode;
+	struct debugfs_vol_data *data;
+	struct lstring *debugfs_log;
+	unsigned long inode_num;
+
+	data = (struct debugfs_vol_data *) file->private_data;
+	root = &(data->sb->s_hotinfo);
+
+	if (!data->debugfs_log) {
+		/* initialize debugfs log corresponding to this volume*/
+		debugfs_log = kmalloc(sizeof(struct lstring), GFP_KERNEL);
+		if (!debugfs_log)
+			return -ENOMEM;
+		debugfs_log->str = NULL;
+		debugfs_log->len = 0;
+		data->debugfs_log = debugfs_log;
+		err = hot_debugfs_log_init(data);
+		if (err) {
+			/* do not leave a log without a text buffer behind */
+			data->debugfs_log = NULL;
+			kfree(debugfs_log);
+			return err;
+		}
+	}
+
+	if ((unsigned long) *ppos > 0) {
+		/* caller is continuing a previous read, don't walk tree */
+		if ((unsigned long) *ppos >= data->debugfs_log->len)
+			goto clean_up;
+
+		goto print_to_user;
+	}
+
+	/* walk the inode tree */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+
+	while (current_hot_inode) {
+		/* walk ranges, print data to debugfs log */
+		__hot_debugfs_walk_range_tree(current_hot_inode, data, root);
+		inode_num = current_hot_inode->i_ino;
+		/* drop the reference taken by the lookup */
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+							inode_num + 1);
+	}
+
+print_to_user:
+	if (data->debugfs_log->len) {
+		err = simple_read_from_buffer(user, count, ppos,
+						data->debugfs_log->str,
+						data->debugfs_log->len);
+	}
+
+	return err;
+
+clean_up:
+	/* Reader has finished the file, clean up */
+	hot_debugfs_log_exit(data);
+	kfree(data->debugfs_log);
+	data->debugfs_log = NULL;
+
+	return 0;
+}
+
+/*
+ * debugfs read override for the inode_data file.
+ *
+ * On the first read (*ppos == 0) the per-volume log is created if needed
+ * and filled with one line per tracked inode; the log text is then copied
+ * to userspace.  A read at or beyond the end of the log frees the log and
+ * returns 0 (EOF).
+ *
+ * Returns the number of bytes copied, 0 at EOF, or a negative errno
+ * (-ENOMEM when the log cannot be allocated).
+ */
+static ssize_t __hot_debugfs_inode_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	ssize_t err = 0;
+	struct hot_info *root;
+	struct hot_inode_item *current_hot_inode;
+	struct debugfs_vol_data *data;
+	struct lstring *debugfs_log;
+	unsigned long inode_num;
+
+	data = (struct debugfs_vol_data *) file->private_data;
+	root = &(data->sb->s_hotinfo);
+
+	if (!data->debugfs_log) {
+		/* initialize debugfs log corresponding to this volume */
+		debugfs_log = kmalloc(sizeof(struct lstring), GFP_KERNEL);
+		if (!debugfs_log)
+			return -ENOMEM;
+		debugfs_log->str = NULL;
+		debugfs_log->len = 0;
+		data->debugfs_log = debugfs_log;
+		err = hot_debugfs_log_init(data);
+		if (err) {
+			/* do not leave a log without a text buffer behind */
+			data->debugfs_log = NULL;
+			kfree(debugfs_log);
+			return err;
+		}
+	}
+
+	if ((unsigned long) *ppos > 0) {
+		/* caller is continuing a previous read, don't walk tree */
+		if ((unsigned long) *ppos >= data->debugfs_log->len)
+			goto clean_up;
+
+		goto print_to_user;
+	}
+
+	/* walk the inode tree */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+
+	while (current_hot_inode) {
+		/* print inode-level data to debugfs log */
+		__hot_debugfs_print_inode_freq_data(current_hot_inode,
+							data, root);
+		inode_num = current_hot_inode->i_ino;
+		/* drop the reference taken by the lookup */
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+								inode_num+1);
+	}
+
+print_to_user:
+	if (data->debugfs_log->len) {
+		err = simple_read_from_buffer(user, count, ppos,
+					data->debugfs_log->str,
+					data->debugfs_log->len);
+	}
+
+	return err;
+
+clean_up:
+	/* reader has finished the file, clean up */
+	hot_debugfs_log_exit(data);
+	kfree(data->debugfs_log);
+	data->debugfs_log = NULL;
+
+	return 0;
+}
+
+/*
+ * fops to override for printing range data: used for the per-volume
+ * "range_data" file; open stores the inode's private pointer in the
+ * file, read emits the range log.
+ */
+static const struct file_operations hot_debugfs_range_fops = {
+	.read = __hot_debugfs_range_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * fops to override for printing inode data: used for the per-volume
+ * "inode_data" file; open stores the inode's private pointer in the
+ * file, read emits the inode log.
+ */
+static const struct file_operations hot_debugfs_inode_fops = {
+	.read = __hot_debugfs_inode_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * One-time debugfs setup: create the top-level DEBUGFS_ROOT_NAME
+ * directory and initialize the list of per-volume data together with its
+ * lock.  The list and lock are initialized even when the directory could
+ * not be created.
+ *
+ * Returns 0 on success, -EIO if debugfs_create_dir() returned NULL.
+ */
+int hot_debugfs_init(void)
+{
+	hot_debugfs_root_dentry = debugfs_create_dir(DEBUGFS_ROOT_NAME, NULL);
+
+	INIT_LIST_HEAD(&hot_debugfs_vol_data_list);
+	spin_lock_init(&hot_debugfs_data_list_lock);
+
+	return hot_debugfs_root_dentry ? 0 : -EIO;
+}
+
+/*
+ * On each volume mount, initialize the debugfs dentries and associated
+ * structures (two debugfs_vol_data, one for range data and one for inode
+ * data).
+ *
+ * @uuid: device name of the mounted volume; its first DEV_NAME_CHOP
+ *        characters are dropped to form the directory name
+ * @sb:   superblock the debugfs files report on
+ *
+ * The per-volume data is put on the global list only once both files
+ * exist, so the failure path never frees something that is still linked.
+ * On failure the partially created directory is removed again.
+ *
+ * Returns 0 on success, -EIO on any failure.
+ */
+int hot_debugfs_volume_init(const char *uuid, struct super_block *sb)
+{
+	struct dentry *debugfs_volume_entry = NULL;
+	struct dentry *debugfs_range_entry = NULL;
+	struct dentry *debugfs_inode_entry = NULL;
+	struct debugfs_vol_data *range_data = NULL;
+	struct debugfs_vol_data *inode_data = NULL;
+	size_t dev_name_length;
+	char dev[NAME_MAX];
+
+	if (!hot_debugfs_root_dentry || !uuid)
+		goto debugfs_error;
+
+	/*
+	 * The name must be longer than the chopped prefix and the rest,
+	 * including its NUL, must fit into dev[].
+	 */
+	dev_name_length = strlen(uuid);
+	if (dev_name_length <= DEV_NAME_CHOP ||
+	    dev_name_length - DEV_NAME_CHOP >= sizeof(dev))
+		goto debugfs_error;
+
+	/* create debugfs folder for this volume by mounted dev name */
+	memcpy(dev, uuid + DEV_NAME_CHOP, dev_name_length - DEV_NAME_CHOP + 1);
+	debugfs_volume_entry = debugfs_create_dir(dev, hot_debugfs_root_dentry);
+
+	if (!debugfs_volume_entry)
+		goto debugfs_error;
+
+	/* allocate zeroed debugfs_vol_data for range data and inode data */
+	range_data = kzalloc(sizeof(struct debugfs_vol_data), GFP_KERNEL);
+	inode_data = kzalloc(sizeof(struct debugfs_vol_data), GFP_KERNEL);
+	if (!range_data || !inode_data)
+		goto debugfs_error;
+
+	range_data->sb = sb;
+	range_data->de = debugfs_volume_entry;
+	spin_lock_init(&range_data->log_lock);
+
+	inode_data->sb = sb;
+	inode_data->de = debugfs_volume_entry;
+	spin_lock_init(&inode_data->log_lock);
+
+	/* create debugfs range_data file */
+	debugfs_range_entry = debugfs_create_file("range_data",
+				S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
+				debugfs_volume_entry,
+				(void *) range_data,
+				&hot_debugfs_range_fops);
+	if (!debugfs_range_entry)
+		goto debugfs_error;
+
+	/* create debugfs inode_data file */
+	debugfs_inode_entry = debugfs_create_file("inode_data",
+				S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
+				debugfs_volume_entry,
+				(void *) inode_data,
+				&hot_debugfs_inode_fops);
+
+	if (!debugfs_inode_entry)
+		goto debugfs_error;
+
+	/*
+	 * add debugfs_vol_data for inode data and range data for
+	 * volume to list
+	 */
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_add(&range_data->node, &hot_debugfs_vol_data_list);
+	list_add(&inode_data->node, &hot_debugfs_vol_data_list);
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	return 0;
+
+debugfs_error:
+	/* take down the directory and any file already created in it */
+	if (debugfs_volume_entry)
+		debugfs_remove_recursive(debugfs_volume_entry);
+	kfree(range_data);
+	kfree(inode_data);
+
+	return -EIO;
+}
+
+/*
+ * Find every debugfs_vol_data belonging to the given superblock (a volume
+ * has one for range data and one for inode data), remove the volume's
+ * debugfs directory and free the data.
+ *
+ * Matching entries are first moved to a private list under the list
+ * lock; the dentry removal, which may sleep, and the frees are done after
+ * the spinlock has been dropped.
+ */
+void hot_debugfs_volume_exit(struct super_block *sb)
+{
+	struct debugfs_vol_data *data;
+	struct debugfs_vol_data *next;
+	struct dentry *removed = NULL;
+	LIST_HEAD(doomed);
+
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_for_each_entry_safe(data, next, &hot_debugfs_vol_data_list, node) {
+		if (data->sb == sb)
+			list_move(&data->node, &doomed);
+	}
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	/* must clean up memory associated with superblock */
+	list_for_each_entry_safe(data, next, &doomed, node) {
+		list_del(&data->node);
+
+		/* both entries share one directory: remove it only once */
+		if (data->de && data->de != removed) {
+			debugfs_remove_recursive(data->de);
+			removed = data->de;
+		}
+
+		/* a reader may have left a log behind */
+		if (data->debugfs_log) {
+			hot_debugfs_log_exit(data);
+			kfree(data->debugfs_log);
+		}
+		kfree(data);
+	}
+}
diff --git a/fs/hot_debugfs.h b/fs/hot_debugfs.h
new file mode 100644
index 0000000..977ad4c
--- /dev/null
+++ b/fs/hot_debugfs.h
@@ -0,0 +1,60 @@
+/*
+ * fs/hot_debugfs.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_DEBUGFS__
+#define __HOT_DEBUGFS__
+
+#include <linux/fs.h>
+#include "hot_rb.h"
+#include "hot_hash.h"
+
+/* byte sizes (ten pages each) used when vmalloc'ing the debugfs log */
+#define INIT_LOG_ALLOC_SIZE (PAGE_SIZE * 10)
+#define LOG_PAGE_SIZE (PAGE_SIZE * 10)
+
+/*
+ * number of leading chars to chop off the device name when making the
+ * debugfs folder, e.g. /dev/sda -> sda
+ *
+ * TODO: use something better for this
+ */
+#define DEV_NAME_CHOP 5
+
+/*
+ * Name of the directory that holds the VFS hot tracking data in debugfs
+ * e.g. /sys/kernel/debug/hot_track
+ */
+#define DEBUGFS_ROOT_NAME "hot_track"
+
+/* log buffer whose contents are output to userspace via debugfs files */
+struct lstring {
+	char *str; /* log text */
+	unsigned long len; /* NOTE(review): presumably length of str - confirm */
+};
+
+/* private data handed to each debugfs file; a volume gets two of these */
+struct debugfs_vol_data {
+	struct list_head node; /* protected by data_list_lock */
+	struct lstring *debugfs_log; /* set to NULL at volume init */
+	struct super_block *sb; /* volume this belongs to; matched at exit */
+	struct dentry *de; /* the volume's debugfs directory */
+	spinlock_t log_lock; /* protects debugfs_log */
+	char log_work_buff[1024]; /* NOTE(review): scratch space? confirm */
+	uint log_alloc_size; /* set to 0 at volume init */
+};
+
+int hot_debugfs_init(void);
+void hot_debugfs_exit(void);
+int hot_debugfs_volume_init(const char *, struct super_block *);
+void hot_debugfs_volume_exit(struct super_block *);
+
+#endif /* __HOT_DEBUGFS__ */
diff --git a/fs/hot_track.c b/fs/hot_track.c
index be5bae4..ae113db 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -81,4 +81,5 @@ void hot_track_exit(struct super_block *sb)
 	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
 	hot_hash_free_heat_hash_list(&sb->s_hotinfo);
 	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
+	hot_debugfs_volume_exit(sb);
 }
diff --git a/fs/hot_track.h b/fs/hot_track.h
index e137142..3cb5a01 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -13,8 +13,7 @@
 #ifndef __HOT_TRACK__
 #define __HOT_TRACK__
 
-#include "hot_rb.h"
-#include "hot_hash.h"
+#include "hot_debugfs.h"
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
diff --git a/fs/namespace.c b/fs/namespace.c
index 90c958a..6843489 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2629,6 +2629,12 @@ void __init mnt_init(void)
 	fs_kobj = kobject_create_and_add("fs", NULL);
 	if (!fs_kobj)
 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+
+	err = hot_debugfs_init();
+	if (err)
+		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
+			__func__, err);
+
 	init_rootfs();
 	init_mount_tree();
 }
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-09-11 14:51 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).