All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 06/11] vfs: add init and exit support
@ 2012-09-11 14:40 zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add an initialization function that creates the
key data structures when hot tracking is enabled,
and clean them up when hot tracking is disabled.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_rb.c    |   45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_rb.h    |    6 ++++++
 fs/hot_track.c |   15 +++++++++++++++
 fs/hot_track.h |    2 ++
 fs/namespace.c |    4 ++++
 fs/super.c     |    6 ++++++
 6 files changed, 78 insertions(+), 0 deletions(-)

diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index 560841a..71f5e76 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -133,6 +133,33 @@ void hot_rb_free_hot_range_item(struct hot_range_item *hr)
 	}
 }
 
+/* Unlink all inode items (and their range items) from root and free them. */
+void hot_rb_free_hot_inode_tree(struct hot_info *root)
+{
+	struct rb_node *node, *node2;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+
+	/* Always restart from rb_first(): the previous first node was removed */
+	node = rb_first(&root->hot_inode_tree.map);
+
+	while (node) {
+		he = rb_entry(node, struct hot_inode_item, rb_node);
+
+		node2 = rb_first(&he->hot_range_tree.map);
+		while (node2) {
+			hr = rb_entry(node2, struct hot_range_item, rb_node);
+			hot_rb_remove_hot_range_item(&he->hot_range_tree, hr);
+			hot_rb_free_hot_range_item(hr);
+			node2 = rb_first(&he->hot_range_tree.map);
+		}
+
+		hot_rb_remove_hot_inode_item(&root->hot_inode_tree, he);
+		hot_rb_free_hot_inode_item(he);
+		node = rb_first(&root->hot_inode_tree.map);
+	}
+}
+
 static struct rb_node *hot_rb_insert_hot_inode_item(struct rb_root *root,
 						unsigned long inode_num,
 						struct rb_node *node)
@@ -309,6 +336,24 @@ struct hot_range_item
 	return NULL;
 }
 
+int hot_rb_remove_hot_inode_item(struct hot_inode_tree *tree,
+				struct hot_inode_item *he)
+{
+	int ret = 0;
+	rb_erase(&he->rb_node, &tree->map);
+	he->in_tree = 0;
+	return ret;
+}
+
+int hot_rb_remove_hot_range_item(struct hot_range_tree *tree,
+				struct hot_range_item *hr)
+{
+	int ret = 0;
+	rb_erase(&hr->rb_node, &tree->map);
+	hr->in_tree = 0;
+	return ret;
+}
+
 /* Update inode frequency struct */
 static struct hot_inode_item *hot_rb_update_inode_freq(struct inode *inode,
 							int rw)
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 4048027..df8cd14 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -46,8 +46,14 @@ int hot_rb_add_hot_inode_item(struct hot_inode_tree *tree,
 int hot_rb_add_hot_range_item(struct hot_range_tree *tree,
 				struct hot_range_item *hr);
 
+int hot_rb_remove_hot_inode_item(struct hot_inode_tree *tree,
+				struct hot_inode_item *he);
+int hot_rb_remove_hot_range_item(struct hot_range_tree *tree,
+				struct hot_range_item *hr);
+
 void hot_rb_free_hot_inode_item(struct hot_inode_item *he);
 void hot_rb_free_hot_range_item(struct hot_range_item *hr);
+void hot_rb_free_hot_inode_tree(struct hot_info *root);
 
 int __init hot_rb_item_cache_init(void);
 
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 36a41cb..68f85ad 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -58,3 +58,18 @@ void __init hot_track_item_cache_init(void)
 	if (hot_rb_item_cache_init())
 		return;
 }
+
+/*
+ * Initialize the data structures for hot data tracking.
+ */
+void hot_track_init(struct super_block *sb, const char *name)
+{
+	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
+	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
+}
+
+void hot_track_exit(struct super_block *sb)
+{
+	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
+	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
+}
diff --git a/fs/hot_track.h b/fs/hot_track.h
index dc0f5a2..b2f096c 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -17,5 +17,7 @@
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
+void hot_track_init(struct super_block *sb, const char *name);
+void hot_track_exit(struct super_block *sb);
 
 #endif /* __HOT_TRACK__ */
diff --git a/fs/namespace.c b/fs/namespace.c
index 4d31f73..90c958a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -22,6 +22,7 @@
 #include <linux/uaccess.h>
 #include "pnode.h"
 #include "internal.h"
+#include "hot_track.h"
 
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
@@ -1215,6 +1216,9 @@ static int do_umount(struct mount *mnt, int flags)
 		return retval;
 	}
 
+	if (sb->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+		hot_track_exit(sb);
+
 	down_write(&namespace_sem);
 	br_write_lock(&vfsmount_lock);
 	event++;
diff --git a/fs/super.c b/fs/super.c
index d5bc781..eaf95fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1153,6 +1153,9 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
+	if (hottrack)
+		hot_track_init(sb, name);
+
 	error = security_sb_kern_mount(sb, flags, secdata);
 	if (error)
 		goto out_sb;
@@ -1170,6 +1173,9 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	free_secdata(secdata);
 	return root;
 out_sb:
+	if (hottrack)
+		hot_track_exit(sb);
+
 	dput(root);
 	deactivate_locked_super(sb);
 out_free_secdata:
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 07/11] vfs: introduce one hash table
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add a hash table structure which contains
many hash lists and is used to efficiently
look up the data temperature of a file or its
ranges.
  In each hash list of the hash table, the hash
nodes keep track of temperature info.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/Makefile               |    2 +-
 fs/hot_hash.c             |   66 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_hash.h             |   26 +++++++++++++++++
 fs/hot_rb.c               |   20 +++++++++++++
 fs/hot_rb.h               |    2 +
 fs/hot_track.c            |    6 ++++
 fs/hot_track.h            |    1 +
 include/linux/hot_track.h |   38 ++++++++++++++++++++++++++
 8 files changed, 160 insertions(+), 1 deletions(-)
 create mode 100644 fs/hot_hash.c
 create mode 100644 fs/hot_hash.h

diff --git a/fs/Makefile b/fs/Makefile
index b4f620e..f925a66 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o \
-		hot_rb.o hot_track.o
+		hot_rb.o hot_track.o hot_hash.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/hot_hash.c b/fs/hot_hash.c
new file mode 100644
index 0000000..cae5631
--- /dev/null
+++ b/fs/hot_hash.c
@@ -0,0 +1,66 @@
+/*
+ * fs/hot_hash.c
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/fs.h>
+#include "hot_rb.h"
+#include "hot_hash.h"
+
+/* kmem_cache pointers for slab caches */
+struct kmem_cache *hot_hash_node_cache;
+
+void hot_hash_heat_node_init(void *_node)
+{
+	struct hot_hash_node *node = _node;
+
+	memset(node, 0, sizeof(*node));
+	INIT_HLIST_NODE(&node->hashnode);
+	node->hot_freq_data = NULL;
+	node->hlist = NULL;
+	spin_lock_init(&node->lock);
+	kref_init(&node->refs);
+}
+
+int __init hot_hash_node_cache_init(void)
+{
+	hot_hash_node_cache = kmem_cache_create("hot_hash_node",
+				sizeof(struct hot_hash_node),
+				0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				hot_hash_heat_node_init);
+	if (!hot_hash_node_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Initialize the rwlock in heat inode/range hash lists.
+ */
+void hot_hash_heat_rwlock_init(struct hot_info *root)
+{
+	int i;
+	for (i = 0; i < HEAT_HASH_SIZE; i++) {
+		rwlock_init(&root->heat_inode_hl[i].rwlock);
+		rwlock_init(&root->heat_range_hl[i].rwlock);
+	}
+}
+
diff --git a/fs/hot_hash.h b/fs/hot_hash.h
new file mode 100644
index 0000000..65abc6d
--- /dev/null
+++ b/fs/hot_hash.h
@@ -0,0 +1,26 @@
+/*
+ * fs/hot_hash.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_HASH__
+#define __HOT_HASH__
+
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/hot_track.h>
+
+void hot_hash_heat_node_init(void *_node);
+
+int __init hot_hash_node_cache_init(void);
+
+void hot_hash_heat_rwlock_init(struct hot_info *root);
+
+#endif /* __HOT_HASH__ */
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index 71f5e76..fd3b9e5 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -68,6 +68,18 @@ inode_err:
 	return -ENOMEM;
 }
 
+void hot_rb_inode_item_exit(void)
+{
+	if (hot_inode_item_cache)
+		kmem_cache_destroy(hot_inode_item_cache);
+}
+
+void hot_rb_range_item_exit(void)
+{
+	if (hot_range_item_cache)
+		kmem_cache_destroy(hot_range_item_cache);
+}
+
 /*
  * Initialize a new hot_inode_item structure. The new structure is
  * returned with a reference count of one and needs to be
@@ -80,6 +92,10 @@ void hot_rb_inode_item_init(void *_item)
 	memset(he, 0, sizeof(*he));
 	kref_init(&he->refs);
 	spin_lock_init(&he->lock);
+	he->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+					GFP_KERNEL | GFP_NOFS);
+	hot_hash_heat_node_init(he->heat_node);
+	he->heat_node->hot_freq_data = &he->hot_freq_data;
 	he->hot_freq_data.avg_delta_reads = (u64) -1;
 	he->hot_freq_data.avg_delta_writes = (u64) -1;
 	he->hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
@@ -98,6 +114,10 @@ void hot_rb_range_item_init(void *_item)
 	memset(hr, 0, sizeof(*hr));
 	kref_init(&hr->refs);
 	spin_lock_init(&hr->lock);
+	hr->heat_node = kmem_cache_alloc(hot_hash_node_cache,
+				GFP_KERNEL | GFP_NOFS);
+	hot_hash_heat_node_init(hr->heat_node);
+	hr->heat_node->hot_freq_data = &hr->hot_freq_data;
 	hr->hot_freq_data.avg_delta_reads = (u64) -1;
 	hr->hot_freq_data.avg_delta_writes = (u64) -1;
 	hr->hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index df8cd14..193c265 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -57,6 +57,8 @@ void hot_rb_free_hot_inode_tree(struct hot_info *root);
 
 int __init hot_rb_item_cache_init(void);
 
+void hot_rb_inode_item_exit(void);
+void hot_rb_range_item_exit(void);
 void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw);
 void hot_rb_update_freqs(struct inode *inode, u64 start, u64 len,
 			int rw);
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 68f85ad..0ec8b83b 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -57,6 +57,11 @@ void __init hot_track_item_cache_init(void)
 {
 	if (hot_rb_item_cache_init())
 		return;
+
+	if (hot_hash_node_cache_init()) {
+		hot_rb_inode_item_exit();
+		hot_rb_range_item_exit();
+	}
 }
 
 /*
@@ -66,6 +71,7 @@ void hot_track_init(struct super_block *sb, const char *name)
 {
 	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
 	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
+	hot_hash_heat_rwlock_init(&sb->s_hotinfo);
 }
 
 void hot_track_exit(struct super_block *sb)
diff --git a/fs/hot_track.h b/fs/hot_track.h
index b2f096c..e137142 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -14,6 +14,7 @@
 #define __HOT_TRACK__
 
 #include "hot_rb.h"
+#include "hot_hash.h"
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index b56a467..bde61de 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -20,11 +20,17 @@
 #include <linux/rbtree.h>
 #include <linux/kref.h>
 
+#define HEAT_HASH_BITS 8
+#define HEAT_HASH_SIZE (1 << HEAT_HASH_BITS)
+
 /*
  * Flags for hot data tracking mount options.
  */
 #define HOT_MOUNT_HOT_TRACK		(1 << 0)
 
+/* kmem_cache pointers for slab caches */
+extern struct kmem_cache *hot_hash_node_cache;
+
 /* A tree that sits on the hot_info */
 struct hot_inode_tree {
 	struct rb_root map;
@@ -52,6 +58,28 @@ struct hot_freq_data {
 	u32 last_temperature;
 };
 
+/* Hash list heads for hot hash table */
+struct hot_hash_head {
+	struct hlist_head hashhead;
+	rwlock_t rwlock;
+	u32 temperature;
+};
+
+/* Nodes stored in each hash list of hash table */
+struct hot_hash_node {
+	struct hlist_node hashnode;
+	struct list_head node;
+	struct hot_freq_data *hot_freq_data;
+	struct hot_hash_head *hlist;
+	spinlock_t lock; /* protects hlist */
+
+	/*
+	 * number of references to this node
+	 * equals 1 (hashlist entry)
+	 */
+	struct kref refs;
+};
+
 /* An item representing an inode and its access frequency */
 struct hot_inode_item {
 	/* node for hot_inode_tree rb_tree */
@@ -68,6 +96,8 @@ struct hot_inode_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this inode */
+	struct hot_hash_node *heat_node;
 };
 
 /*
@@ -91,6 +121,8 @@ struct hot_range_item {
 	spinlock_t lock;
 	/* prevents kfree */
 	struct kref refs;
+	/* hashlist node for this range */
+	struct hot_hash_node *heat_node;
 };
 
 struct hot_info {
@@ -98,6 +130,12 @@ struct hot_info {
 
 	/* red-black tree that keeps track of fs-wide hot data */
 	struct hot_inode_tree hot_inode_tree;
+
+	/* hash map of inode temperature */
+	struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
+
+	/* hash map of range temperature */
+	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
 };
 
 #endif  /* _LINUX_HOTTRACK_H */
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 08/11] vfs: enable hot data tracking
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Miscellaneous features that implement hot data tracking
and generally make the hot data functions a bit more friendly.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/direct-io.c            |   10 ++++++++++
 include/linux/hot_track.h |   11 +++++++++++
 mm/filemap.c              |    8 ++++++++
 mm/page-writeback.c       |   21 +++++++++++++++++++++
 mm/readahead.c            |    9 +++++++++
 5 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720..74068e2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/hot_track.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -1297,6 +1298,15 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(iocb->ki_filp->f_mapping->host)
+			&& iov_length(iov, nr_segs) > 0) {
+		hot_rb_update_freqs(iocb->ki_filp->f_mapping->host,
+				(u64)offset,
+				(u64)iov_length(iov, nr_segs),
+				rw & WRITE);
+	}
+
 	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				     nr_segs, get_block, end_io,
 				     submit_io, flags);
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index bde61de..8bb9028 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -28,6 +28,14 @@
  */
 #define HOT_MOUNT_HOT_TRACK		(1 << 0)
 
+/* Hot tracking guards; TRACK_THIS_INODE() evaluates its argument twice. */
+#define TRACKING_HOT_TRACK(root) \
+		((root)->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+
+#define TRACK_THIS_INODE(inode) \
+		((TRACKING_HOT_TRACK((inode)->i_sb)) && \
+		!((inode)->i_flags & S_NOHOTDATATRACK))
+
 /* kmem_cache pointers for slab caches */
 extern struct kmem_cache *hot_hash_node_cache;
 
@@ -138,4 +146,7 @@ struct hot_info {
 	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
 };
 
+extern void hot_rb_update_freqs(struct inode *inode,
+				u64 start, u64 len, int rw);
+
 #endif  /* _LINUX_HOTTRACK_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 3843445..784d027 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/hot_track.h>
 #include "internal.h"
 
 /*
@@ -1224,6 +1225,13 @@ readpage:
 		 * PG_error will be set again if readpage fails.
 		 */
 		ClearPageError(page);
+
+		/* Hot data tracking */
+		if (TRACK_THIS_INODE(filp->f_mapping->host))
+			hot_rb_update_freqs(filp->f_mapping->host,
+				(u64)page->index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce2..4e83e68 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
 #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
 #include <linux/timer.h>
+#include <linux/hot_track.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -1895,13 +1896,33 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
+	u64 start = 0;
+	u64 prev_count = 0, count = 0;
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
+
+	/* Hot data tracking: note the byte offset cyclic writeback starts at */
+	if (TRACK_THIS_INODE(mapping->host)
+		&& wbc->range_cyclic) {
+		start = (u64)mapping->writeback_index << PAGE_CACHE_SHIFT;
+		prev_count = (u64)wbc->nr_to_write;
+	}
+
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
+
+	/* Hot data tracking: charge the consumed nr_to_write budget as writes */
+	if (TRACK_THIS_INODE(mapping->host)
+		&& wbc->range_cyclic) {
+		count = prev_count - (u64)wbc->nr_to_write;
+		if (count)
+			hot_rb_update_freqs(mapping->host, start,
+					count * PAGE_CACHE_SIZE, 1);
+	}
+
 	return ret;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa..c204f2b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/hot_track.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -138,6 +139,14 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 out:
 	blk_finish_plug(&plug);
 
+	/* Hot data tracking */
+	if (TRACK_THIS_INODE(mapping->host) && nr_pages > 0) {
+		u64 start = (u64)(list_entry(pages->prev,
+				struct page, lru)->index) << PAGE_CACHE_SHIFT;
+		hot_rb_update_freqs(mapping->host, start,
+				(u64)nr_pages * PAGE_CACHE_SIZE, 0);
+	}
+
 	return ret;
 }
 
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 09/11] vfs: fork one private kthread to update temperature info
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
  2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
  2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
  2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Fork and run one kernel thread to calculate
the temperature based on some metrics kept
in custom frequency data structs, and store
the info in the hash table.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_hash.c             |  316 +++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_hash.h             |   86 ++++++++++++
 fs/hot_rb.c               |  132 ++++++++++++++++++-
 fs/hot_rb.h               |   15 ++
 fs/hot_track.c            |    3 +
 include/linux/hot_track.h |    5 +
 6 files changed, 551 insertions(+), 6 deletions(-)

diff --git a/fs/hot_hash.c b/fs/hot_hash.c
index cae5631..18d53b0 100644
--- a/fs/hot_hash.c
+++ b/fs/hot_hash.c
@@ -27,6 +27,8 @@
 /* kmem_cache pointers for slab caches */
 struct kmem_cache *hot_hash_node_cache;
 
+struct task_struct *hot_data_update_kthread;
+
 void hot_hash_heat_node_init(void *_node)
 {
 	struct hot_hash_node *node = _node;
@@ -64,3 +66,317 @@ void hot_hash_heat_rwlock_init(struct hot_info *root)
 	}
 }
 
+/* Unlink each node on @head and give it back to hot_hash_node_cache. */
+static void hot_hash_free_heat_node(struct hlist_head *head)
+{
+	struct hlist_node *pos, *next;
+	struct hot_hash_node *heatnode;
+
+	hlist_for_each_safe(pos, next, head) {
+		heatnode = hlist_entry(pos, struct hot_hash_node, hashnode);
+		hlist_del(pos);
+
+		/* allocated by kmem_cache_alloc(), so not kfree() */
+		kmem_cache_free(hot_hash_node_cache, heatnode);
+	}
+}
+
+void hot_hash_free_heat_hash_list(struct hot_info *root)
+{
+	int i;
+
+	/* Free node/range heat hash lists */
+	for (i = 0; i < HEAT_HASH_SIZE; i++) {
+		hot_hash_free_heat_node(&root->heat_inode_hl[i].hashhead);
+		hot_hash_free_heat_node(&root->heat_range_hl[i].hashhead);
+	}
+}
+
+static u64 hot_hash_shift(u64 counter, u32 bits, bool shift_dir)
+{
+	if (shift_dir)
+		return counter << bits;
+	else
+		return counter >> bits;
+}
+
+/*
+ * hot_hash_calc_temperature() is responsible for distilling the six heat
+ * criteria (which are described in detail in hot_hash.h) down into a single
+ * temperature value for the data, which is an integer between 0
+ * and HEAT_MAX_VALUE.
+ *
+ * To accomplish this, the raw values from the hot_freq_data structure
+ * are shifted various ways in order to make the temperature calculation more
+ * or less sensitive to each value.
+ *
+ * Once this calibration has happened, we do some additional normalization and
+ * make sure that everything fits nicely in a u32. From there, we take a very
+ * rudimentary kind of "average" of each of the values, where the *_COEFF_POWER
+ * values act as weights for the average.
+ *
+ * Finally, we use the HEAT_HASH_BITS value, which determines the size of the
+ * heat hash list, to normalize the temperature to the proper granularity.
+ */
+int hot_hash_calc_temperature(struct hot_freq_data *freq_data)
+{
+	u64 result = 0;
+
+	struct timespec ckt = current_kernel_time();
+	u64 cur_time = timespec_to_ns(&ckt);
+
+	u32 nrr_heat = (u32)hot_hash_shift((u64)freq_data->nr_reads,
+					NRR_MULTIPLIER_POWER, true);
+	u32 nrw_heat = (u32)hot_hash_shift((u64)freq_data->nr_writes,
+					NRW_MULTIPLIER_POWER, true);
+
+	u64 ltr_heat =
+	hot_hash_shift((cur_time - timespec_to_ns(&freq_data->last_read_time)),
+			LTR_DIVIDER_POWER, false);
+	u64 ltw_heat =
+	hot_hash_shift((cur_time - timespec_to_ns(&freq_data->last_write_time)),
+			LTW_DIVIDER_POWER, false);
+
+	u64 avr_heat =
+		hot_hash_shift((((u64) -1) - freq_data->avg_delta_reads),
+			AVR_DIVIDER_POWER, false);
+	u64 avw_heat =
+		hot_hash_shift((((u64) -1) - freq_data->avg_delta_writes),
+			AVW_DIVIDER_POWER, false);
+
+	/* ltr_heat is now guaranteed to be u32 safe */
+	if (ltr_heat >= hot_hash_shift((u64) 1, 32, true))
+		ltr_heat = 0;
+	else
+		ltr_heat = hot_hash_shift((u64) 1, 32, true) - ltr_heat;
+
+	/* ltw_heat is now guaranteed to be u32 safe */
+	if (ltw_heat >= hot_hash_shift((u64) 1, 32, true))
+		ltw_heat = 0;
+	else
+		ltw_heat = hot_hash_shift((u64) 1, 32, true) - ltw_heat;
+
+	/* avr_heat is now guaranteed to be u32 safe */
+	if (avr_heat >= hot_hash_shift((u64) 1, 32, true))
+		avr_heat = (u32) -1;
+
+	/* avw_heat is now guaranteed to be u32 safe */
+	if (avw_heat >= hot_hash_shift((u64) 1, 32, true))
+		avw_heat = (u32) -1;
+
+	nrr_heat = (u32)hot_hash_shift((u64)nrr_heat,
+				(3 - NRR_COEFF_POWER), false);
+	nrw_heat = (u32)hot_hash_shift((u64)nrw_heat,
+				(3 - NRW_COEFF_POWER), false);
+	ltr_heat = hot_hash_shift(ltr_heat, (3 - LTR_COEFF_POWER), false);
+	ltw_heat = hot_hash_shift(ltw_heat, (3 - LTW_COEFF_POWER), false);
+	avr_heat = hot_hash_shift(avr_heat, (3 - AVR_COEFF_POWER), false);
+	avw_heat = hot_hash_shift(avw_heat, (3 - AVW_COEFF_POWER), false);
+
+	result = nrr_heat + nrw_heat + (u32) ltr_heat +
+			(u32) ltw_heat + (u32) avr_heat + (u32) avw_heat;
+
+	return result >> (32 - HEAT_HASH_BITS);
+}
+
+int hot_hash_is_aging(struct hot_freq_data *freq_data)
+{
+	int ret = 0;
+	struct timespec ckt = current_kernel_time();
+
+	u64 cur_time = timespec_to_ns(&ckt);
+	u64 last_read_ns =
+		(cur_time - timespec_to_ns(&freq_data->last_read_time));
+	u64 last_write_ns =
+		(cur_time - timespec_to_ns(&freq_data->last_write_time));
+	u64 kick_ns = TIME_TO_KICK * (u64)1000000000;
+
+	if ((last_read_ns > kick_ns) && (last_write_ns > kick_ns))
+		ret = 1;
+
+	return ret;
+}
+
+/*
+ * Recompute the temperature of @freq_data and (re)hash its owner's heat_node
+ * into the matching bucket of @root; a NULL owner is rejected before any use.
+ */
+void hot_hash_update_heat_hash_list(struct hot_freq_data *freq_data,
+			struct hot_info *root)
+{
+	int temperature = 0;
+	int moved = 0;
+	struct hot_hash_head *buckets, *current_bucket = NULL;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+
+	if (freq_data->flags & FREQ_DATA_TYPE_INODE) {
+		he = hot_freq_data_get_he(freq_data);
+		if (he == NULL)
+			return;
+
+		buckets = root->heat_inode_hl;
+
+		spin_lock(&he->lock);
+		temperature = hot_hash_calc_temperature(freq_data);
+		freq_data->last_temperature = temperature;
+		spin_unlock(&he->lock);
+
+		spin_lock(&he->heat_node->lock);
+		if (he->heat_node->hlist == NULL) {
+			current_bucket = buckets + temperature;
+			moved = 1;
+		} else {
+			write_lock(&he->heat_node->hlist->rwlock);
+			current_bucket = he->heat_node->hlist;
+			if (current_bucket->temperature != temperature) {
+				hlist_del(&he->heat_node->hashnode);
+				current_bucket = buckets + temperature;
+				moved = 1;
+			}
+			write_unlock(&he->heat_node->hlist->rwlock);
+		}
+
+		if (moved) {
+			write_lock(&current_bucket->rwlock);
+			hlist_add_head(&he->heat_node->hashnode,
+					&current_bucket->hashhead);
+			he->heat_node->hlist = current_bucket;
+			write_unlock(&current_bucket->rwlock);
+		}
+		spin_unlock(&he->heat_node->lock);
+	} else if (freq_data->flags & FREQ_DATA_TYPE_RANGE) {
+		hr = hot_freq_data_get_hr(freq_data);
+		if (hr == NULL)
+			return;
+
+		buckets = root->heat_range_hl;
+
+		spin_lock(&hr->lock);
+		temperature = hot_hash_calc_temperature(freq_data);
+		freq_data->last_temperature = temperature;
+		spin_unlock(&hr->lock);
+
+		spin_lock(&hr->heat_node->lock);
+		if (hr->heat_node->hlist == NULL) {
+			current_bucket = buckets + temperature;
+			moved = 1;
+		} else {
+			write_lock(&hr->heat_node->hlist->rwlock);
+			current_bucket = hr->heat_node->hlist;
+			if (current_bucket->temperature != temperature) {
+				hlist_del(&hr->heat_node->hashnode);
+				current_bucket = buckets + temperature;
+				moved = 1;
+			}
+			write_unlock(&hr->heat_node->hlist->rwlock);
+		}
+
+		if (moved) {
+			write_lock(&current_bucket->rwlock);
+			hlist_add_head(&hr->heat_node->hashnode,
+					&current_bucket->hashhead);
+			hr->heat_node->hlist = current_bucket;
+			write_unlock(&current_bucket->rwlock);
+		}
+		spin_unlock(&hr->heat_node->lock);
+	}
+}
+
+/*
+ * Visit each hot inode item of @root (lookups resume at i_ino + 1), updating
+ * its heat hash entry and its range data for aging purposes
+ */
+static void hot_hash_iterate_and_update_heat(struct hot_info *root)
+{
+	struct hot_inode_item *current_hot_inode;
+	struct hot_inode_tree *hot_inode_tree;
+	unsigned long inode_num;
+
+	hot_inode_tree = &root->hot_inode_tree;
+
+	/* walk the inode tree; the free call presumably drops the lookup's ref */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+	while (current_hot_inode) {
+		hot_hash_update_heat_hash_list(
+			&current_hot_inode->hot_freq_data, root);
+		hot_rb_update_range_data(current_hot_inode, root);
+		inode_num = current_hot_inode->i_ino;
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+							inode_num + 1);
+	}
+}
+
+/* Return true if any hashed superblock has HOT_MOUNT_HOT_TRACK set */
+static bool hot_hash_global_hot_track(void)
+{
+	struct super_block *sb;
+	bool ret = false;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (hlist_unhashed(&sb->s_instances))
+			continue;
+		if (sb->s_hotinfo.mount_opt & HOT_MOUNT_HOT_TRACK)
+			ret = true;
+	}
+	spin_unlock(&sb_lock);
+
+	return ret;
+}
+
+/*
+ * Thread body: refresh the temperatures of every hashed superblock's hot
+ * items, sleeping HEAT_UPDATE_DELAY seconds after each. Returns once
+ * kthread_should_stop() is set so that a kthread_stop() caller cannot hang.
+ */
+static int hot_hash_update_temperature_kthread(void *arg)
+{
+	struct super_block *sb;
+	struct hot_info *root;
+	unsigned long delay;
+
+	do {
+		spin_lock(&sb_lock);
+		list_for_each_entry(sb, &super_blocks, s_list) {
+			if (hlist_unhashed(&sb->s_instances))
+				continue;
+			delay = HZ * HEAT_UPDATE_DELAY;
+			root = &sb->s_hotinfo;
+			if (mutex_trylock(
+				&root->hot_data_update_kthread_mutex)) {
+				hot_hash_iterate_and_update_heat(root);
+				mutex_unlock(
+					&root->hot_data_update_kthread_mutex);
+			}
+			if (unlikely(freezing(current))) {
+				__refrigerator(true);
+			} else {
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (!kthread_should_stop()) {
+					spin_unlock(&sb_lock);
+					schedule_timeout(delay);
+					spin_lock(&sb_lock);
+				}
+				__set_current_state(TASK_RUNNING);
+			}
+		}
+		spin_unlock(&sb_lock);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/* Start the update kthread once; a failed kthread_run() leaves it NULL */
+void hot_hash_fork_update_temperature_kthread(void)
+{
+	if (hot_data_update_kthread)
+		return;
+
+	hot_data_update_kthread =
+		kthread_run(hot_hash_update_temperature_kthread, NULL,
+					"update_hot_temperature_kthread");
+	if (IS_ERR(hot_data_update_kthread))
+		hot_data_update_kthread = NULL;
+}
diff --git a/fs/hot_hash.h b/fs/hot_hash.h
index 65abc6d..9cb89e9 100644
--- a/fs/hot_hash.h
+++ b/fs/hot_hash.h
@@ -17,10 +17,96 @@
 #include <linux/hash.h>
 #include <linux/hot_track.h>
 
+/* time to quit keeping track of tracking data (seconds)*/
+#define TIME_TO_KICK 400
+
+/* set how often to update temps (seconds) */
+#define HEAT_UPDATE_DELAY 400
+
+/*
+ * The following comments explain what exactly comprises a unit of heat.
+ *
+ * Six heat values are calculated and then combined to form an overall
+ * temperature for the data:
+ *
+ * NRR - number of reads since mount
+ * NRW - number of writes since mount
+ * LTR - time elapsed since last read (ns)
+ * LTW - time elapsed since last write (ns)
+ * AVR - average delta between recent reads (ns)
+ * AVW - average delta between recent writes (ns)
+ *
+ * These values are divided (right-shifted) according to the *_DIVIDER_POWER
+ * values defined below to bring the numbers into a reasonable range. You can
+ * modify these values to fit your needs. However, each heat unit is a u32 and
+ * thus maxes out at 2^32 - 1. Therefore, you must choose your dividers quite
+ * carefully or else they could max out or be stuck at zero quite easily.
+ *
+ * (E.g., if you chose AVR_DIVIDER_POWER = 0, nothing less than 4s of atime
+ * delta would bring the temperature above zero, ever.)
+ *
+ * Finally, each value is added to the overall temperature between 0 and 8
+ * times, depending on its *_COEFF_POWER value. Note that the coefficients are
+ * also actually implemented with shifts, so take care to treat these values
+ * as powers of 2. (I.e., 0 means we'll add it to the temp once; 1 = 2x, etc.)
+ */
+
+/* NRR/NRW heat unit = 2^X accesses */
+#define NRR_MULTIPLIER_POWER 20
+#define NRR_COEFF_POWER 0
+#define NRW_MULTIPLIER_POWER 20
+#define NRW_COEFF_POWER 0
+
+/* LTR/LTW heat unit = 2^X ns of age */
+#define LTR_DIVIDER_POWER 30
+#define LTR_COEFF_POWER 1
+#define LTW_DIVIDER_POWER 30
+#define LTW_COEFF_POWER 1
+
+/*
+ * AVR/AVW cold unit = 2^X ns of average delta
+ * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit
+ *
+ * E.g., data with an average delta between 0 and 2^X ns will have a cold value
+ * of 0, which means a heat value equal to HEAT_MAX_VALUE.
+ */
+#define AVR_DIVIDER_POWER 40
+#define AVR_COEFF_POWER 0
+#define AVW_DIVIDER_POWER 40
+#define AVW_COEFF_POWER 0
+
+/* container_of() wrappers: map an embedded hot_freq_data to its owner */
+#define hot_freq_data_get_he(x) \
+	container_of(x, struct hot_inode_item, hot_freq_data)
+#define hot_freq_data_get_hr(x) \
+	container_of(x, struct hot_range_item, hot_freq_data)
+
+struct hot_info;
+
 void hot_hash_heat_node_init(void *_node);
 
 int __init hot_hash_node_cache_init(void);
 
 void hot_hash_heat_rwlock_init(struct hot_info *root);
+void hot_hash_free_heat_hash_list(struct hot_info *root);
+
+/*
+ * Returns a value from 0 to HEAT_MAX_VALUE indicating the temperature of the
+ * file (and consequently its bucket number in hash list) (see hot_hash.c)
+ */
+int hot_hash_calc_temperature(struct hot_freq_data *freq_data);
+
+int hot_hash_is_aging(struct hot_freq_data *freq_data);
+
+void hot_hash_update_heat_hash_list(struct hot_freq_data *freq_data,
+                        struct hot_info *root);
+/*
+ * Start, if it is not already running, the single kthread that
+ * periodically goes through hot inodes and hot ranges of all filesystems
+ * and ages them based on frequency of access.
+ */
+void hot_hash_fork_update_temperature_kthread(void);
 
 #endif /* __HOT_HASH__ */
diff --git a/fs/hot_rb.c b/fs/hot_rb.c
index fd3b9e5..37d3771 100644
--- a/fs/hot_rb.c
+++ b/fs/hot_rb.c
@@ -399,9 +399,13 @@ static struct hot_inode_item *hot_rb_update_inode_freq(struct inode *inode,
 		write_unlock(&hitree->lock);
 	}
 
-	spin_lock(&he->lock);
-	hot_rb_update_freq(&he->hot_freq_data, rw);
-	spin_unlock(&he->lock);
+	if (!hot_data_update_kthread
+		|| hot_data_update_kthread->pid != current->pid) {
+		spin_lock(&he->lock);
+		hot_rb_update_freq(&he->hot_freq_data, rw);
+		spin_unlock(&he->lock);
+		hot_hash_update_heat_hash_list(&he->hot_freq_data, root);
+	}
 
 out:
 	return he;
@@ -448,9 +452,14 @@ static bool hot_rb_update_range_freq(struct hot_inode_item *he,
 			write_unlock(&hrtree->lock);
 		}
 
-		spin_lock(&hr->lock);
-		hot_rb_update_freq(&hr->hot_freq_data, rw);
-		spin_unlock(&hr->lock);
+		if (!hot_data_update_kthread
+			|| hot_data_update_kthread->pid != current->pid) {
+			spin_lock(&hr->lock);
+			hot_rb_update_freq(&hr->hot_freq_data, rw);
+			spin_unlock(&hr->lock);
+			hot_hash_update_heat_hash_list(&hr->hot_freq_data, root);
+		}
+
 		hot_rb_free_hot_range_item(hr);
 	}
 
@@ -509,6 +518,57 @@ void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw)
 	}
 }
 
+/*
+ * Look up, under the hot_inode_tree read lock, the item whose i_ino equals
+ * @objectid or, failing that, the first item at or after the last node
+ * visited by the search whose i_ino is >= @objectid.
+ *
+ * A reference is taken on the returned item so that it cannot be pruned
+ * while the caller uses it.  Returns NULL if no such item exists.
+ */
+struct hot_inode_item *hot_rb_find_next_hot_inode(struct hot_info *root,
+						u64 objectid)
+{
+	struct hot_inode_item *found = NULL;
+	struct hot_inode_item *item;
+	struct rb_node *cursor;
+	struct rb_node *last = NULL;
+
+	read_lock(&root->hot_inode_tree.lock);
+
+	/* descend looking for an exact match, remembering the last node */
+	cursor = root->hot_inode_tree.map.rb_node;
+	while (cursor) {
+		last = cursor;
+		item = rb_entry(cursor, struct hot_inode_item, rb_node);
+
+		if (objectid == item->i_ino) {
+			found = item;
+			break;
+		}
+
+		if (objectid < item->i_ino)
+			cursor = cursor->rb_left;
+		else
+			cursor = cursor->rb_right;
+	}
+
+	/* no exact match: step forward from where the descent ended */
+	while (!found && last) {
+		item = rb_entry(last, struct hot_inode_item, rb_node);
+		if (objectid <= item->i_ino)
+			found = item;
+		else
+			last = rb_next(last);
+	}
+
+	/* pin the result before the tree lock is released */
+	if (found)
+		kref_get(&found->refs);
+
+	read_unlock(&root->hot_inode_tree.lock);
+	return found;
+}
+
 /* main function to update access frequency from read/writepage(s) hooks */
 void hot_rb_update_freqs(struct inode *inode, u64 start,
 			u64 len, int rw)
@@ -526,3 +586,63 @@ void hot_rb_update_freqs(struct inode *inode, u64 start,
 		hot_rb_free_hot_inode_item(he);
 	}
 }
+
+/*
+ * Take a hot range that is now cold, remove it from the indexes and clean
+ * up any memory associated with it: the range is removed from the inode's
+ * range rb tree and from its heat hash list, then its heat_node and the
+ * range item itself are released.
+ *
+ * @hot_inode: inode item whose hot_range_tree contains @hr
+ * @hr:        range item to remove
+ * @root:      not used by this function
+ */
+static void hot_rb_remove_range_data(struct hot_inode_item *hot_inode,
+			struct hot_range_item *hr,
+			struct hot_info *root)
+{
+	/* remove range from rb tree */
+	hot_rb_remove_hot_range_item(&hot_inode->hot_range_tree, hr);
+
+	/*
+	 * remove range from hash list; the node lock is taken first, then
+	 * the write lock of the hash list the node sits on
+	 */
+	spin_lock(&hr->heat_node->lock);
+	write_lock(&hr->heat_node->hlist->rwlock);
+	hlist_del(&hr->heat_node->hashnode);
+	write_unlock(&hr->heat_node->hlist->rwlock);
+	spin_unlock(&hr->heat_node->lock);
+
+	/*
+	 * free up memory
+	 * NOTE(review): if heat_node comes from the hot_hash_node_cache slab
+	 * cache it should be returned with kmem_cache_free() -- confirm.
+	 */
+	kfree(hr->heat_node);
+	hot_rb_free_hot_range_item(hr);
+}
+
+/*
+ * Update temperatures for each range item of @hot_inode for aging purposes.
+ *
+ * Walks the inode's hot_range_tree under the tree's write lock.  Every
+ * range is passed to hot_hash_update_heat_hash_list(); a range that
+ * hot_hash_is_aging() reports as aging and whose heat_node reference count
+ * is at most 1 is then removed with hot_rb_remove_range_data().
+ */
+void hot_rb_update_range_data(struct hot_inode_item *hot_inode,
+					struct hot_info *root)
+{
+	struct hot_range_tree *inode_range_tree;
+	struct rb_node *node;
+	struct hot_range_item *current_range;
+	int range_is_aging;
+
+	inode_range_tree = &hot_inode->hot_range_tree;
+	write_lock(&inode_range_tree->lock);
+	node = rb_first(&inode_range_tree->map);
+	/* Walk the hot_range_tree for inode */
+	while (node) {
+		current_range = rb_entry(node, struct hot_range_item, rb_node);
+		hot_hash_update_heat_hash_list(&current_range->hot_freq_data, root);
+		/* advance first: current_range may be removed below */
+		node = rb_next(node);
+
+		spin_lock(&current_range->lock);
+		range_is_aging = hot_hash_is_aging(&current_range->hot_freq_data);
+		spin_unlock(&current_range->lock);
+
+		if (range_is_aging &&
+		    atomic_read(&current_range->heat_node->refs.refcount) <= 1)
+			hot_rb_remove_range_data(hot_inode,
+					current_range, root);
+	}
+
+	write_unlock(&inode_range_tree->lock);
+}
+
diff --git a/fs/hot_rb.h b/fs/hot_rb.h
index 193c265..298b6b4 100644
--- a/fs/hot_rb.h
+++ b/fs/hot_rb.h
@@ -59,8 +59,23 @@ int __init hot_rb_item_cache_init(void);
 
 void hot_rb_inode_item_exit(void);
 void hot_rb_range_item_exit(void);
+
+/*
+ * recalculates temperatures for inode or range
+ * and moves around in heat hash table based on temp
+ */
+void hot_rb_update_heat_hash_list(struct hot_freq_data *freq_data,
+				struct hot_info *root);
+
+struct hot_inode_item
+*hot_rb_find_next_hot_inode(struct hot_info *root,
+			u64 objectid);
 void hot_rb_update_freq(struct hot_freq_data *freq_data, int rw);
 void hot_rb_update_freqs(struct inode *inode, u64 start, u64 len,
 			int rw);
 
+/* Update temperatures for each range item for aging purposes */
+void hot_rb_update_range_data(struct hot_inode_item *hot_inode,
+                                        struct hot_info *root);
+
 #endif /* __HOT_MAP__ */
diff --git a/fs/hot_track.c b/fs/hot_track.c
index 0ec8b83b..be5bae4 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -72,10 +72,13 @@ void hot_track_init(struct super_block *sb, const char *name)
 	sb->s_hotinfo.mount_opt |= HOT_MOUNT_HOT_TRACK;
 	hot_rb_inode_tree_init(&sb->s_hotinfo.hot_inode_tree);
 	hot_hash_heat_rwlock_init(&sb->s_hotinfo);
+	hot_hash_fork_update_temperature_kthread();
+	hot_debugfs_volume_init(name, sb);
 }
 
 void hot_track_exit(struct super_block *sb)
 {
 	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
+	hot_hash_free_heat_hash_list(&sb->s_hotinfo);
 	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
 }
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 8bb9028..6b8493a 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -36,6 +36,8 @@
 		((TRACKING_HOT_TRACK(inode->i_sb)) && \
 		!(inode->i_flags & S_NOHOTDATATRACK))
 
+extern struct task_struct *hot_data_update_kthread;
+
 /* kmem_cache pointers for slab caches */
 extern struct kmem_cache *hot_hash_node_cache;
 
@@ -144,6 +146,9 @@ struct hot_info {
 
 	/* hash map of range temperature */
 	struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
+
+	/* protects hot data items while being iterated and updated */
+	struct mutex hot_data_update_kthread_mutex;
 };
 
 extern void hot_rb_update_freqs(struct inode *inode,
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 10/11] vfs: add 3 new ioctl interfaces
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
                   ` (2 preceding siblings ...)
  2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  FS_IOC_GET_HEAT_INFO: return a struct containing the various
metrics collected in hot_freq_data structs, and also return a
calculated data temperature based on those metrics. Optionally, retrieve
the temperature from the hot data hash list instead of recalculating it.

  FS_IOC_GET_HEAT_OPTS: return an integer representing the current
state of hot data tracking and migration:

0 = do nothing
1 = track frequency of access

FS_IOC_SET_HEAT_OPTS: change the state of hot data tracking and
migration, as described above.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/compat_ioctl.c         |    8 +++
 fs/ioctl.c                |  132 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h        |   11 ++++
 include/linux/hot_track.h |   12 ++++
 4 files changed, 163 insertions(+), 0 deletions(-)

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index debdfe0..a88c7de 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1390,6 +1390,11 @@ COMPATIBLE_IOCTL(TIOCSTART)
 COMPATIBLE_IOCTL(TIOCSTOP)
 #endif
 
+/*Hot data tracking*/
+COMPATIBLE_IOCTL(FS_IOC_GET_HEAT_INFO)
+COMPATIBLE_IOCTL(FS_IOC_SET_HEAT_OPTS)
+COMPATIBLE_IOCTL(FS_IOC_GET_HEAT_OPTS)
+
 /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
    but we don't want warnings on other file systems. So declare
    them as compatible here. */
@@ -1572,6 +1577,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
+	case FS_IOC_GET_HEAT_INFO:
+	case FS_IOC_SET_HEAT_OPTS:
+	case FS_IOC_GET_HEAT_OPTS:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			break;
 		/*FALL THROUGH*/
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 29167be..9242969 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -18,6 +18,9 @@
 
 #include <asm/ioctls.h>
 
+#include "hot_hash.h"
+#include "hot_rb.h"
+
 /* So that the fiemap access checks can't overflow on 32 bit machines. */
 #define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
 
@@ -537,6 +540,126 @@ static int ioctl_fsthaw(struct file *filp)
 }
 
 /*
+ * Retrieve information about access frequency for the given file. Return it in
+ * a userspace-friendly struct for btrfsctl (or another tool) to parse.
+ *
+ * The temperature that is returned can be "live" -- that is, recalculated when
+ * the ioctl is called -- or it can be returned from the hashtable, reflecting
+ * the (possibly old) value that the system will use when considering files
+ * for migration. This behavior is determined by heat_info->live.
+ */
+static int ioctl_heat_info(struct file *file, void __user *argp)
+{
+	struct inode *mnt_inode = file->f_path.dentry->d_inode;
+	struct hot_info *root = &(mnt_inode->i_sb->s_hotinfo);
+	struct hot_inode_tree *hitree = &root->hot_inode_tree;
+	struct heat_info *heat_info;
+	struct hot_inode_item *he;
+	struct file *file_filp;
+	unsigned long ino;
+	int ret = 0;
+
+	heat_info = kmalloc(sizeof(*heat_info), GFP_KERNEL);
+	if (!heat_info)
+		return -ENOMEM;
+
+	if (copy_from_user(heat_info, argp, sizeof(*heat_info))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* userspace need not have NUL-terminated the path; enforce it */
+	heat_info->filename[sizeof(heat_info->filename) - 1] = '\0';
+
+	file_filp = filp_open(heat_info->filename, O_RDONLY, 0);
+	if (IS_ERR(file_filp)) {
+		ret = PTR_ERR(file_filp);
+		goto out;
+	}
+	/* read the inode number while the open file still pins the inode */
+	ino = file_filp->f_dentry->d_inode->i_ino;
+	filp_close(file_filp, NULL);
+
+	/*
+	 * NOTE(review): the lookup below uses the tree of the superblock the
+	 * ioctl was issued on; nothing checks that the named file lives on
+	 * that same superblock -- confirm whether that needs enforcing.
+	 */
+	read_lock(&hitree->lock);
+	he = hot_rb_lookup_hot_inode_item(hitree, ino);
+	read_unlock(&hitree->lock);
+	if (!he) {
+		/* we don't have any info on this file yet */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	spin_lock(&he->lock);
+	heat_info->avg_delta_reads =
+			(__u64) he->hot_freq_data.avg_delta_reads;
+	heat_info->avg_delta_writes =
+			(__u64) he->hot_freq_data.avg_delta_writes;
+	heat_info->last_read_time =
+			(__u64) timespec_to_ns(&he->hot_freq_data.last_read_time);
+	heat_info->last_write_time =
+			(__u64) timespec_to_ns(&he->hot_freq_data.last_write_time);
+	heat_info->num_reads =
+			(__u32) he->hot_freq_data.nr_reads;
+	heat_info->num_writes =
+			(__u32) he->hot_freq_data.nr_writes;
+
+	if (heat_info->live > 0) {
+		/* got a request for live temperature,
+		 * call hot_hash_calc_temperature to recalculate
+		 */
+		heat_info->temperature =
+			hot_hash_calc_temperature(&he->hot_freq_data);
+	} else {
+		/* not live temperature, get it from the hashlist */
+		read_lock(&he->heat_node->hlist->rwlock);
+		heat_info->temperature = he->heat_node->hlist->temperature;
+		read_unlock(&he->heat_node->hlist->rwlock);
+	}
+	spin_unlock(&he->lock);
+
+	/* done with the item returned by the lookup */
+	hot_rb_free_hot_inode_item(he);
+
+	if (copy_to_user(argp, heat_info, sizeof(*heat_info)))
+		ret = -EFAULT;
+
+out:
+	kfree(heat_info);
+	return ret;
+}
+
+/*
+ * Get (@set == 0) or set (@set != 0) the per-inode hot tracking option.
+ *
+ * Get: writes 1 to @argp if TRACK_THIS_INODE() holds for the inode, else 0.
+ * Set: reads an int from @argp; 0 sets S_NOHOTDATATRACK on the inode,
+ * 1 clears it, any other value yields -EINVAL.
+ *
+ * Returns 0 on success, -EFAULT if @argp cannot be accessed.
+ *
+ * NOTE(review): i_flags is modified here without any visible permission
+ * check or locking -- confirm that callers are trusted.
+ */
+static int ioctl_heat_opts(struct file *file, void __user *argp, int set)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int arg;
+
+	if (!set) {
+		arg = TRACK_THIS_INODE(inode) ? 1 : 0;
+
+		if (copy_to_user(argp, (void *) &arg, sizeof(int)) != 0)
+			return -EFAULT;
+		return 0;
+	}
+
+	if (copy_from_user((void *) &arg, argp, sizeof(int)) != 0)
+		return -EFAULT;
+
+	if (arg == 0) {
+		/* track nothing: set S_NOHOTDATATRACK */
+		inode->i_flags |= S_NOHOTDATATRACK;
+	} else if (arg == 1) {
+		/* do tracking: clear S_NOHOTDATATRACK */
+		inode->i_flags &= ~S_NOHOTDATATRACK;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
  *
@@ -591,6 +714,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, argp);
 
+	case FS_IOC_GET_HEAT_INFO:
+		return ioctl_heat_info(filp, argp);
+
+	case FS_IOC_SET_HEAT_OPTS:
+		return ioctl_heat_opts(filp, argp, 1);
+
+	case FS_IOC_GET_HEAT_OPTS:
+		return ioctl_heat_opts(filp, argp, 0);
+
 	default:
 		if (S_ISREG(inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6229895..99698f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -256,6 +256,7 @@ struct inodes_stat_t {
 #define S_IMA		1024	/* Inode has an associated IMA struct */
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
+#define S_NOHOTDATATRACK (1 << 13)	/* hot data tracking */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -354,6 +355,16 @@ struct inodes_stat_t {
 #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
 
 /*
+ * Hot data tracking ioctls:
+ *
+ * HOT_INFO - retrieve info on frequency of access
+ */
+#define FS_IOC_GET_HEAT_INFO _IOR('f', 17, \
+				struct heat_info)
+#define FS_IOC_SET_HEAT_OPTS _IOW('f', 18, int)
+#define FS_IOC_GET_HEAT_OPTS _IOR('f', 19, int)
+
+/*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
  */
 #define	FS_SECRM_FL			0x00000001 /* Secure deletion */
diff --git a/include/linux/hot_track.h b/include/linux/hot_track.h
index 6b8493a..152d3f6 100644
--- a/include/linux/hot_track.h
+++ b/include/linux/hot_track.h
@@ -68,6 +68,18 @@ struct hot_freq_data {
 	u32 last_temperature;
 };
 
+/*
+ * Argument block of FS_IOC_GET_HEAT_INFO.  Userspace fills in @live and
+ * @filename; the kernel fills in the remaining fields from the file's
+ * hot_freq_data and copies the whole struct back.
+ */
+struct heat_info {
+	/* copied from hot_freq_data.avg_delta_reads / avg_delta_writes */
+	__u64 avg_delta_reads;
+	__u64 avg_delta_writes;
+	/* last read / write timestamps converted with timespec_to_ns() */
+	__u64 last_read_time;
+	__u64 last_write_time;
+	/* copied from hot_freq_data.nr_reads / nr_writes */
+	__u32 num_reads;
+	__u32 num_writes;
+	/* recalculated when live > 0, else taken from the heat hash list */
+	__u32 temperature;
+	/* input: > 0 requests a freshly calculated temperature */
+	__u8 live;
+	/* input: path of the file to report on */
+	char filename[PATH_MAX];
+};
+
 /* Hash list heads for hot hash table */
 struct hot_hash_head {
 	struct hlist_head hashhead;
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [RFC 11/11] vfs: add debugfs support
  2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
                   ` (3 preceding siblings ...)
  2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
@ 2012-09-11 14:40 ` zwu.kernel
  4 siblings, 0 replies; 6+ messages in thread
From: zwu.kernel @ 2012-09-11 14:40 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, dave, viro, hch, chris.mason, cmm, linuxram,
	aneesh.kumar, Zhi Yong Wu

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
volume that contains two files. The first, `inode_data', contains the
heat information for inodes that have been brought into the hot data map
structures. The second, `range_data', contains similar information for
subfile ranges.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/Makefile      |    2 +-
 fs/hot_debugfs.c |  488 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/hot_debugfs.h |   60 +++++++
 fs/hot_track.c   |    1 +
 fs/hot_track.h   |    3 +-
 fs/namespace.c   |    6 +
 6 files changed, 557 insertions(+), 3 deletions(-)
 create mode 100644 fs/hot_debugfs.c
 create mode 100644 fs/hot_debugfs.h

diff --git a/fs/Makefile b/fs/Makefile
index f925a66..a70f288 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o \
-		hot_rb.o hot_track.o hot_hash.o
+		hot_rb.o hot_track.o hot_hash.o hot_debugfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/hot_debugfs.c b/fs/hot_debugfs.c
new file mode 100644
index 0000000..362f093
--- /dev/null
+++ b/fs/hot_debugfs.c
@@ -0,0 +1,488 @@
+/*
+ * fs/hot_debugfs.c
+ *
+ * This file contains the code to interface with the debugfs.
+ * The debugfs outputs range- and file-level access frequency
+ * statistics for each mounted volume.
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+#include "hot_debugfs.h"
+
+/* list to keep track of each mounted volumes debugfs_vol_data */
+static struct list_head hot_debugfs_vol_data_list;
+
+/* lock for debugfs_vol_data_list */
+static spinlock_t hot_debugfs_data_list_lock;
+
+/* pointer to top level debugfs dentry */
+static struct dentry *hot_debugfs_root_dentry;
+
+/*
+ * Append @len bytes of @msg to the volume's debugfs log, growing the log
+ * buffer by LOG_PAGE_SIZE when the message does not fit.
+ *
+ * If the buffer cannot be grown, an out-of-memory notice is written into
+ * whatever room is left and the message is dropped.
+ *
+ * Returns the number of bytes of @msg appended (@len), or 0 if the message
+ * was dropped.
+ *
+ * NOTE(review): hot_debugfs_log() calls this with data->log_lock (a
+ * spinlock) held, yet vmalloc() may sleep -- confirm and rework if so.
+ */
+static int hot_debugfs_copy(struct debugfs_vol_data *data, char *msg, int len)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	uint new_log_alloc_size;
+	unsigned long room;
+	char *new_log;
+	static char err_msg[] = "No more memory!\n";
+
+	room = data->log_alloc_size - debugfs_log->len;
+	if (len >= room) {
+		/* Not enough room in the log buffer for the new message. */
+		/* Allocate a bigger buffer. */
+		new_log_alloc_size = data->log_alloc_size + LOG_PAGE_SIZE;
+		new_log = vmalloc(new_log_alloc_size);
+
+		if (!new_log) {
+			WARN_ON(1);
+			if (room) {
+				strlcpy(debugfs_log->str + debugfs_log->len,
+					err_msg, room);
+				/* account for the text only, not the NUL */
+				debugfs_log->len += min_t(unsigned long,
+						sizeof(err_msg) - 1, room - 1);
+			}
+			return 0;
+		}
+
+		memcpy(new_log, debugfs_log->str, debugfs_log->len);
+		memset(new_log + debugfs_log->len, 0,
+			new_log_alloc_size - debugfs_log->len);
+		vfree(debugfs_log->str);
+		debugfs_log->str = new_log;
+		data->log_alloc_size = new_log_alloc_size;
+	}
+
+	/* copy from the caller's message, not from the work buffer */
+	memcpy(debugfs_log->str + debugfs_log->len, msg, len);
+	debugfs_log->len += (unsigned long) len;
+
+	return len;
+}
+
+/*
+ * Format a message and append it to the volume's debugfs log.
+ *
+ * The message is formatted into data->log_work_buff under data->log_lock.
+ * If it does not fit, a truncation notice is appended first and only the
+ * part that fits in the work buffer is appended after it.
+ *
+ * Returns the value of hot_debugfs_copy() for the message (the number of
+ * bytes appended, 0 if it was dropped), or -1 if the log has no buffer.
+ */
+static int hot_debugfs_log(struct debugfs_vol_data *data, const char *fmt, ...)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	va_list args;
+	int len;
+	static char trunc_msg[] =
+			"The next message has been truncated.\n";
+
+	if (debugfs_log->str == NULL)
+		return -1;
+
+	spin_lock(&data->log_lock);
+
+	va_start(args, fmt);
+	len = vsnprintf(data->log_work_buff,
+			sizeof(data->log_work_buff), fmt, args);
+	va_end(args);
+
+	if (len >= sizeof(data->log_work_buff)) {
+		/* log the notice without its terminating NUL */
+		hot_debugfs_copy(data, trunc_msg, sizeof(trunc_msg) - 1);
+		/*
+		 * vsnprintf() returned the untruncated length; only what is
+		 * actually in the work buffer may be copied out of it.
+		 */
+		len = sizeof(data->log_work_buff) - 1;
+	}
+
+	len = hot_debugfs_copy(data, data->log_work_buff, len);
+	spin_unlock(&data->log_lock);
+
+	return len;
+}
+
+/*
+ * Initialize the log corresponding to a fs volume: allocate a zeroed
+ * buffer of INIT_LOG_ALLOC_SIZE bytes and install it in data->debugfs_log.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated, in
+ * which case the log is left untouched.
+ */
+static int hot_debugfs_log_init(struct debugfs_vol_data *data)
+{
+	struct lstring *debugfs_log = data->debugfs_log;
+	char *buf;
+
+	/* the allocation may sleep, so do it before taking the spinlock */
+	buf = vzalloc(INIT_LOG_ALLOC_SIZE);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock(&data->log_lock);
+	debugfs_log->str = buf;
+	data->log_alloc_size = INIT_LOG_ALLOC_SIZE;
+	spin_unlock(&data->log_lock);
+
+	return 0;
+}
+
+/*
+ * Release the text buffer of a volume's log and reset the log to empty.
+ * The lstring itself is not freed here.
+ */
+static void hot_debugfs_log_exit(struct debugfs_vol_data *data)
+{
+	struct lstring *log = data->debugfs_log;
+
+	spin_lock(&data->log_lock);
+	log->len = 0;
+	vfree(log->str);
+	log->str = NULL;
+	spin_unlock(&data->log_lock);
+}
+
+/*
+ * open() handler of the debugfs files: hand the inode's private pointer,
+ * when there is one, on to the file.  Always returns 0.
+ */
+static int __hot_debugfs_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	if (priv)
+		file->private_data = priv;
+
+	return 0;
+}
+
+/*
+ * Append one line describing @hot_range (a range of @hot_inode) to the
+ * debugfs log of @data.  @root is not used.
+ *
+ * NOTE(review): hot_debugfs_log() is called with two spinlocks held and
+ * may need to grow the log buffer -- confirm that path cannot sleep.
+ */
+static void __hot_debugfs_print_range_freq_data(
+			struct hot_inode_item *hot_inode,
+			struct hot_range_item *hot_range,
+			struct debugfs_vol_data *data,
+			struct hot_info *root)
+{
+	struct hot_freq_data *freq_data = &hot_range->hot_freq_data;
+
+	/* Always lock hot_inode_item first */
+	spin_lock(&hot_inode->lock);
+	spin_lock(&hot_range->lock);
+	hot_debugfs_log(data, "inode #%lu, range start "
+			"%llu (range len %llu) reads %u, writes %u, "
+			"avg read time %llu, avg write time %llu, temp %u\n",
+			hot_inode->i_ino,
+			hot_range->start,
+			hot_range->len,
+			freq_data->nr_reads,
+			freq_data->nr_writes,
+			freq_data->avg_delta_reads,
+			freq_data->avg_delta_writes,
+			freq_data->last_temperature);
+	spin_unlock(&hot_range->lock);
+	spin_unlock(&hot_inode->lock);
+}
+
+/*
+ * Visit every range item in the inode's hot_range_tree, in rb-tree order
+ * and under the tree's read lock, and log each one.
+ */
+static void __hot_debugfs_walk_range_tree(struct hot_inode_item *hot_inode,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_range_tree *ranges = &hot_inode->hot_range_tree;
+	struct rb_node *pos;
+
+	read_lock(&ranges->lock);
+	for (pos = rb_first(&ranges->map); pos; pos = rb_next(pos))
+		__hot_debugfs_print_range_freq_data(hot_inode,
+			rb_entry(pos, struct hot_range_item, rb_node),
+			data, root);
+	read_unlock(&ranges->lock);
+}
+
+/*
+ * Append one line with the inode-level frequency data of @hot_inode to the
+ * debugfs log of @data, holding the inode item's lock.  @root is not used.
+ */
+static void __hot_debugfs_print_inode_freq_data(
+				struct hot_inode_item *hot_inode,
+				struct debugfs_vol_data *data,
+				struct hot_info *root)
+{
+	struct hot_freq_data *freq = &hot_inode->hot_freq_data;
+
+	spin_lock(&hot_inode->lock);
+	hot_debugfs_log(data,
+		"inode #%lu, reads %u, writes %u, "
+		"avg read time %llu, avg write time %llu, temp %u\n",
+		hot_inode->i_ino,
+		freq->nr_reads, freq->nr_writes,
+		freq->avg_delta_reads, freq->avg_delta_writes,
+		freq->last_temperature);
+	spin_unlock(&hot_inode->lock);
+}
+
+/*
+ * read() handler of the per-volume "range_data" debugfs file.
+ *
+ * A read at offset 0 walks every hot inode, logs all of its ranges into
+ * the volume's log and returns data from that log.  Reads at a later
+ * offset only return more of the log; once the offset reaches the end of
+ * the log the log is freed and 0 (EOF) is returned.
+ *
+ * Returns the number of bytes copied to @user, 0 at EOF, or a negative
+ * errno (-ENOMEM if the log cannot be allocated).
+ */
+static ssize_t __hot_debugfs_range_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	struct debugfs_vol_data *data = file->private_data;
+	struct hot_info *root = &(data->sb->s_hotinfo);
+	struct hot_inode_item *current_hot_inode;
+	struct lstring *debugfs_log;
+	unsigned long inode_num;
+	int err;
+
+	if (!data->debugfs_log) {
+		/* initialize debugfs log corresponding to this volume */
+		debugfs_log = kmalloc(sizeof(*debugfs_log), GFP_KERNEL);
+		if (!debugfs_log)
+			return -ENOMEM;
+		debugfs_log->str = NULL;
+		debugfs_log->len = 0;
+		data->debugfs_log = debugfs_log;
+
+		err = hot_debugfs_log_init(data);
+		if (err) {
+			/* do not leave a log without a buffer behind */
+			data->debugfs_log = NULL;
+			kfree(debugfs_log);
+			return err;
+		}
+	}
+
+	if ((unsigned long) *ppos > 0) {
+		/* caller is continuing a previous read, don't walk tree */
+		if ((unsigned long) *ppos >= data->debugfs_log->len)
+			goto clean_up;
+
+		goto print_to_user;
+	}
+
+	/* walk the inode tree */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+
+	while (current_hot_inode) {
+		/* walk ranges, print data to debugfs log */
+		__hot_debugfs_walk_range_tree(current_hot_inode, data, root);
+		inode_num = current_hot_inode->i_ino;
+		/* drop the reference taken by the lookup */
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+							inode_num + 1);
+	}
+
+print_to_user:
+	if (!data->debugfs_log->len)
+		return 0;
+
+	return simple_read_from_buffer(user, count, ppos,
+					data->debugfs_log->str,
+					data->debugfs_log->len);
+
+clean_up:
+	/* Reader has finished the file, clean up */
+	hot_debugfs_log_exit(data);
+	kfree(data->debugfs_log);
+	data->debugfs_log = NULL;
+
+	return 0;
+}
+
+/*
+ * read() handler of the per-volume "inode_data" debugfs file.
+ *
+ * A read at offset 0 walks every hot inode, logs its inode-level frequency
+ * data into the volume's log and returns data from that log.  Reads at a
+ * later offset only return more of the log; once the offset reaches the
+ * end of the log the log is freed and 0 (EOF) is returned.
+ *
+ * Returns the number of bytes copied to @user, 0 at EOF, or a negative
+ * errno (-ENOMEM if the log cannot be allocated).
+ */
+static ssize_t __hot_debugfs_inode_read(struct file *file, char __user *user,
+					size_t count, loff_t *ppos)
+{
+	struct debugfs_vol_data *data = file->private_data;
+	struct hot_info *root = &(data->sb->s_hotinfo);
+	struct hot_inode_item *current_hot_inode;
+	struct lstring *debugfs_log;
+	unsigned long inode_num;
+	int err;
+
+	if (!data->debugfs_log) {
+		/* initialize debugfs log corresponding to this volume */
+		debugfs_log = kmalloc(sizeof(*debugfs_log), GFP_KERNEL);
+		if (!debugfs_log)
+			return -ENOMEM;
+		debugfs_log->str = NULL;
+		debugfs_log->len = 0;
+		data->debugfs_log = debugfs_log;
+
+		err = hot_debugfs_log_init(data);
+		if (err) {
+			/* do not leave a log without a buffer behind */
+			data->debugfs_log = NULL;
+			kfree(debugfs_log);
+			return err;
+		}
+	}
+
+	if ((unsigned long) *ppos > 0) {
+		/* caller is continuing a previous read, don't walk tree */
+		if ((unsigned long) *ppos >= data->debugfs_log->len)
+			goto clean_up;
+
+		goto print_to_user;
+	}
+
+	/* walk the inode tree */
+	current_hot_inode = hot_rb_find_next_hot_inode(root, 0);
+
+	while (current_hot_inode) {
+		/* print this inode's data to the debugfs log */
+		__hot_debugfs_print_inode_freq_data(current_hot_inode,
+							data, root);
+		inode_num = current_hot_inode->i_ino;
+		/* drop the reference taken by the lookup */
+		hot_rb_free_hot_inode_item(current_hot_inode);
+		current_hot_inode = hot_rb_find_next_hot_inode(root,
+								inode_num + 1);
+	}
+
+print_to_user:
+	if (!data->debugfs_log->len)
+		return 0;
+
+	return simple_read_from_buffer(user, count, ppos,
+					data->debugfs_log->str,
+					data->debugfs_log->len);
+
+clean_up:
+	/* reader has finished the file, clean up */
+	hot_debugfs_log_exit(data);
+	kfree(data->debugfs_log);
+	data->debugfs_log = NULL;
+
+	return 0;
+}
+
+/*
+ * File operations of the per-volume "range_data" debugfs file: open hands
+ * the inode's private data to the file, read produces the range log.
+ */
+static const struct file_operations hot_debugfs_range_fops = {
+	.read = __hot_debugfs_range_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * File operations of the per-volume "inode_data" debugfs file: open hands
+ * the inode's private data to the file, read produces the inode log.
+ */
+static const struct file_operations hot_debugfs_inode_fops = {
+	.read = __hot_debugfs_inode_read,
+	.open = __hot_debugfs_open,
+};
+
+/*
+ * One-time debugfs setup: prepare the list of per-volume data and its lock,
+ * and create the top-level DEBUGFS_ROOT_NAME directory.
+ *
+ * Returns 0 on success, -EIO if the directory could not be created.
+ */
+int hot_debugfs_init(void)
+{
+	/* the list and its lock are set up whatever happens below */
+	INIT_LIST_HEAD(&hot_debugfs_vol_data_list);
+	spin_lock_init(&hot_debugfs_data_list_lock);
+
+	hot_debugfs_root_dentry = debugfs_create_dir(DEBUGFS_ROOT_NAME, NULL);
+
+	return hot_debugfs_root_dentry ? 0 : -EIO;
+}
+
+/*
+ * On each volume mount, initialize the debugfs dentries and associated
+ * structures (one debugfs_vol_data each for "range_data" and "inode_data").
+ *
+ * @uuid: device name of the volume; its first DEV_NAME_CHOP characters are
+ *        skipped to form the directory name (e.g. /dev/sda -> sda)
+ * @sb:   superblock of the volume
+ *
+ * The two debugfs_vol_data entries are put on the global list only once
+ * everything has been created, so a failure never leaves freed entries on
+ * the list or a half-populated directory behind.
+ *
+ * Returns 0 on success, -EINVAL if @uuid is too short to be chopped,
+ * -ENOMEM on allocation failure and -EIO if debugfs is unavailable or an
+ * entry cannot be created.
+ */
+int hot_debugfs_volume_init(const char *uuid, struct super_block *sb)
+{
+	struct dentry *debugfs_volume_entry;
+	struct debugfs_vol_data *range_data;
+	struct debugfs_vol_data *inode_data;
+	int ret;
+
+	if (!hot_debugfs_root_dentry)
+		return -EIO;
+
+	/* chopping the prefix must leave a non-empty name */
+	if (!uuid || strlen(uuid) <= DEV_NAME_CHOP)
+		return -EINVAL;
+
+	/* kzalloc: every field not set below starts out as zero/NULL */
+	range_data = kzalloc(sizeof(*range_data), GFP_KERNEL);
+	inode_data = kzalloc(sizeof(*inode_data), GFP_KERNEL);
+	if (!range_data || !inode_data) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	/*
+	 * create debugfs folder for this volume by mounted dev name; the
+	 * tail of @uuid is already NUL-terminated, so no copy is needed
+	 */
+	debugfs_volume_entry = debugfs_create_dir(uuid + DEV_NAME_CHOP,
+						hot_debugfs_root_dentry);
+	if (!debugfs_volume_entry) {
+		ret = -EIO;
+		goto err_free;
+	}
+
+	range_data->sb = sb;
+	range_data->de = debugfs_volume_entry;
+	spin_lock_init(&range_data->log_lock);
+
+	inode_data->sb = sb;
+	inode_data->de = debugfs_volume_entry;
+	spin_lock_init(&inode_data->log_lock);
+
+	/* create debugfs range_data and inode_data files */
+	if (!debugfs_create_file("range_data",
+				S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
+				debugfs_volume_entry,
+				(void *) range_data,
+				&hot_debugfs_range_fops) ||
+	    !debugfs_create_file("inode_data",
+				S_IFREG | S_IRUSR | S_IWUSR | S_IRUGO,
+				debugfs_volume_entry,
+				(void *) inode_data,
+				&hot_debugfs_inode_fops)) {
+		ret = -EIO;
+		goto err_remove;
+	}
+
+	/*
+	 * add debugfs_vol_data for inode data and range data for
+	 * volume to list
+	 */
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_add(&range_data->node, &hot_debugfs_vol_data_list);
+	list_add(&inode_data->node, &hot_debugfs_vol_data_list);
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	return 0;
+
+err_remove:
+	/* takes the directory and any file already created in it */
+	debugfs_remove_recursive(debugfs_volume_entry);
+err_free:
+	kfree(inode_data);
+	kfree(range_data);
+
+	return ret;
+}
+
+/*
+ * Find the debugfs data of the volume being unmounted (match by
+ * superblock), remove its debugfs directory and free the data.
+ *
+ * Every list entry belonging to @sb is released (there is one for
+ * "range_data" and one for "inode_data").  The entries are first moved to
+ * a private list under the list lock; the debugfs removal, which may
+ * sleep, and the freeing are then done without the spinlock held.
+ */
+void hot_debugfs_volume_exit(struct super_block *sb)
+{
+	struct debugfs_vol_data *data;
+	struct debugfs_vol_data *next;
+	struct dentry *removed = NULL;
+	LIST_HEAD(doomed);
+
+	spin_lock(&hot_debugfs_data_list_lock);
+	list_for_each_entry_safe(data, next, &hot_debugfs_vol_data_list, node) {
+		if (data->sb == sb)
+			list_move(&data->node, &doomed);
+	}
+	spin_unlock(&hot_debugfs_data_list_lock);
+
+	/* must clean up memory associated with superblock */
+	list_for_each_entry_safe(data, next, &doomed, node) {
+		list_del(&data->node);
+
+		/* the entries of a volume share one directory: remove once */
+		if (data->de != removed) {
+			removed = data->de;
+			debugfs_remove_recursive(removed);
+		}
+
+		/* a log is still attached if a reader never reached EOF */
+		if (data->debugfs_log) {
+			vfree(data->debugfs_log->str);
+			kfree(data->debugfs_log);
+		}
+		kfree(data);
+	}
+}
diff --git a/fs/hot_debugfs.h b/fs/hot_debugfs.h
new file mode 100644
index 0000000..977ad4c
--- /dev/null
+++ b/fs/hot_debugfs.h
@@ -0,0 +1,60 @@
+/*
+ * fs/hot_debugfs.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *            Ben Chociej <bchociej@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_DEBUGFS__
+#define __HOT_DEBUGFS__
+
+#include <linux/fs.h>
+#include "hot_rb.h"
+#include "hot_hash.h"
+
+/* size of log to vmalloc */
+#define INIT_LOG_ALLOC_SIZE (PAGE_SIZE * 10)
+#define LOG_PAGE_SIZE (PAGE_SIZE * 10)
+
+/*
+ * number of leading chars to chop off the device name when naming the
+ * debugfs folder, e.g. /dev/sda -> sda
+ *
+ * TODO: use something better for this
+ */
+#define DEV_NAME_CHOP 5
+
+/*
+ * Name for VFS data in debugfs directory
+ * e.g. /sys/kernel/debug/hot_track
+ */
+#define DEBUGFS_ROOT_NAME "hot_track"
+
+/* log text handed to userspace through the debugfs files */
+struct lstring {
+	char *str;		/* log buffer */
+	unsigned long len;	/* presumably length of str -- TODO confirm */
+};
+
+/* per-volume private data handed to each debugfs file at creation */
+struct debugfs_vol_data {
+	struct list_head node; /* protected by hot_debugfs_data_list_lock */
+	struct lstring *debugfs_log; /* set to NULL at volume init */
+	struct super_block *sb; /* owning volume; matched at volume exit */
+	struct dentry *de; /* the volume's debugfs directory */
+	spinlock_t log_lock; /* protects debugfs_log */
+	char log_work_buff[1024]; /* presumably format scratch -- confirm */
+	uint log_alloc_size; /* 0 at init; presumably log bytes allocated */
+};
+
+int hot_debugfs_init(void);
+void hot_debugfs_exit(void);
+int hot_debugfs_volume_init(const char *, struct super_block *);
+void hot_debugfs_volume_exit(struct super_block *);
+
+#endif /* __HOT_DEBUGFS__ */
diff --git a/fs/hot_track.c b/fs/hot_track.c
index be5bae4..ae113db 100644
--- a/fs/hot_track.c
+++ b/fs/hot_track.c
@@ -81,4 +81,5 @@ void hot_track_exit(struct super_block *sb)
 	sb->s_hotinfo.mount_opt &= ~HOT_MOUNT_HOT_TRACK;
 	hot_hash_free_heat_hash_list(&sb->s_hotinfo);
 	hot_rb_free_hot_inode_tree(&sb->s_hotinfo);
+	hot_debugfs_volume_exit(sb);
 }
diff --git a/fs/hot_track.h b/fs/hot_track.h
index e137142..3cb5a01 100644
--- a/fs/hot_track.h
+++ b/fs/hot_track.h
@@ -13,8 +13,7 @@
 #ifndef __HOT_TRACK__
 #define __HOT_TRACK__
 
-#include "hot_rb.h"
-#include "hot_hash.h"
+#include "hot_debugfs.h"
 
 bool hot_track_parse_options(char *options);
 void __init hot_track_item_cache_init(void);
diff --git a/fs/namespace.c b/fs/namespace.c
index 90c958a..6843489 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2629,6 +2629,12 @@ void __init mnt_init(void)
 	fs_kobj = kobject_create_and_add("fs", NULL);
 	if (!fs_kobj)
 		printk(KERN_WARNING "%s: kobj create error\n", __func__);
+
+	err = hot_debugfs_init();
+	if (err)
+		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
+			__func__, err);
+
 	init_rootfs();
 	init_mount_tree();
 }
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-09-11 14:51 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-11 14:40 [RFC 06/11] vfs: add init and exit support zwu.kernel
2012-09-11 14:40 ` [RFC 07/11] vfs: introduce one hash table zwu.kernel
2012-09-11 14:40 ` [RFC 08/11] vfs: enable hot data tracking zwu.kernel
2012-09-11 14:40 ` [RFC 09/11] vfs: fork one private kthread to update temperature info zwu.kernel
2012-09-11 14:40 ` [RFC 10/11] vfs: add 3 new ioctl interfaces zwu.kernel
2012-09-11 14:40 ` [RFC 11/11] vfs: add debugfs support zwu.kernel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.