All of lore.kernel.org
 help / color / mirror / Atom feed
From: zwu.kernel@gmail.com
To: viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org, sekharan@us.ibm.com,
	Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Subject: [PATCH v4 03/10] VFS hot tracking: Add a workqueue to move items between hot maps
Date: Mon,  5 Aug 2013 22:49:53 +0800	[thread overview]
Message-ID: <1375714200-23944-4-git-send-email-zwu.kernel@gmail.com> (raw)
In-Reply-To: <1375714200-23944-1-git-send-email-zwu.kernel@gmail.com>

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

Add a workqueue per superblock and a delayed_work
to run periodic work to update map info on each superblock.

Two arrays of map list are defined, one is for hot inode
items, and the other is for hot extent items.

The hot items in the RB-tree will be at first distilled
into one temperature in the range [0, 255]. It will be
be linked to its corresponding array of map list which use
the temperature as its index.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/hot_tracking.c            | 264 +++++++++++++++++++++++++++++++++++++++++++
 fs/hot_tracking.h            |  24 ++++
 include/linux/hot_tracking.h |   9 +-
 3 files changed, 296 insertions(+), 1 deletion(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index e2a6e84..857d423 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -34,6 +34,7 @@ static void hot_range_item_init(struct hot_range_item *hr,
 			struct hot_inode_item *he, loff_t start)
 {
 	kref_init(&hr->refs);
+	INIT_LIST_HEAD(&hr->track_list);
 	hr->freq.avg_delta_reads = (u64) -1;
 	hr->freq.avg_delta_writes = (u64) -1;
 	hr->start = start;
@@ -56,6 +57,9 @@ static void hot_range_item_free(struct kref *kref)
 	struct hot_info *root = hr->hot_inode->hot_root;
 
 	rb_erase(&hr->rb_node, &hr->hot_inode->hot_range_tree);
+	spin_lock(&root->m_lock);
+	list_del_init(&hr->track_list);
+	spin_unlock(&root->m_lock);
 	call_rcu(&hr->rcu, hot_range_item_free_cb);
 }
 
@@ -81,6 +85,8 @@ struct hot_range_item
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct hot_range_item *hr, *hr_new = NULL;
+	u32 temp;
+	u8 temp_cur;
 
 	start = hot_bit_shift(start, RANGE_BITS, true);
 
@@ -114,6 +120,12 @@ redo:
 	if (hr_new) {
 		rb_link_node(&hr_new->rb_node, parent, p);
 		rb_insert_color(&hr_new->rb_node, &he->hot_range_tree);
+		temp = hot_temp_calc(&hr_new->freq);
+		temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false);
+		spin_lock(&he->hot_root->m_lock);
+		list_add_tail(&hr_new->track_list,
+			&he->hot_root->hot_map[TYPE_RANGE][temp_cur]);
+		spin_unlock(&he->hot_root->m_lock);
 		hot_range_item_get(hr_new); /* For the caller */
 		spin_unlock(&he->i_lock);
 		return hr_new;
@@ -152,10 +164,68 @@ static void hot_range_tree_free(struct hot_inode_item *he)
 	spin_unlock(&he->i_lock);
 }
 
+static void hot_range_map_update(struct hot_info *root,
+			struct hot_range_item *hr)
+{
+	u32 temp = hot_temp_calc(&hr->freq);
+	u8 temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false);
+	u8 temp_prev = (u8)hot_bit_shift((u64)hr->freq.last_temp,
+				(32 - MAP_BITS), false);
+
+	hr->freq.last_temp = temp;
+
+	spin_lock(&root->m_lock);
+	if (!list_empty(&hr->track_list)
+		&& (temp_cur != temp_prev)) {
+		list_del_init(&hr->track_list);
+		list_add_tail(&hr->track_list,
+			&root->hot_map[TYPE_RANGE][temp_cur]);
+	}
+	spin_unlock(&root->m_lock);
+}
+
+/*
+ * Update temperatures for each range item for aging purposes.
+ * If one hot range item is old, it will be aged out.
+ */
+static void hot_range_tree_update(struct hot_inode_item *he,
+				struct hot_info *root)
+{
+	struct rb_node *node;
+	struct hot_range_item *hr;
+
+	rcu_read_lock();
+	node = rb_first(&he->hot_range_tree);
+	while (node) {
+		hr = rb_entry(node, struct hot_range_item, rb_node);
+		node = rb_next(node);
+		hot_range_map_update(root, hr);
+	}
+	rcu_read_unlock();
+}
+
+static int hot_range_temp_cmp(void *priv, struct list_head *a,
+				struct list_head *b)
+{
+	struct hot_range_item *ap = container_of(a,
+			struct hot_range_item, track_list);
+	struct hot_range_item *bp = container_of(b,
+			struct hot_range_item, track_list);
+
+	int diff = ap->freq.last_temp - bp->freq.last_temp;
+	if (diff > 0)
+		return -1;
+	else if (diff < 0)
+		return 1;
+	else
+		return 0;
+}
+
 static void hot_inode_item_init(struct hot_inode_item *he,
 			struct hot_info *root, u64 ino)
 {
 	kref_init(&he->refs);
+	INIT_LIST_HEAD(&he->track_list);
 	he->freq.avg_delta_reads = (u64) -1;
 	he->freq.avg_delta_writes = (u64) -1;
 	he->i_ino = ino;
@@ -177,6 +247,7 @@ static void hot_inode_item_free(struct kref *kref)
 				struct hot_inode_item, refs);
 
 	rb_erase(&he->rb_node, &he->hot_root->hot_inode_tree);
+	list_del_init(&he->track_list);
 	hot_range_tree_free(he);
 	call_rcu(&he->rcu, hot_inode_item_free_cb);
 }
@@ -203,6 +274,8 @@ struct hot_inode_item
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct hot_inode_item *he, *he_new = NULL;
+	u32 temp;
+	u8 temp_cur;
 
 	/* walk tree to find insertion point */
 redo:
@@ -234,6 +307,10 @@ redo:
 	if (he_new) {
 		rb_link_node(&he_new->rb_node, parent, p);
 		rb_insert_color(&he_new->rb_node, &root->hot_inode_tree);
+		temp = hot_temp_calc(&he_new->freq);
+		temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false);
+		list_add_tail(&he_new->track_list,
+			&root->hot_map[TYPE_INODE][temp_cur]);
 		hot_inode_item_get(he_new); /* For the caller */
 		spin_unlock(&root->t_lock);
 		return he_new;
@@ -273,6 +350,48 @@ void hot_inode_item_unlink(struct inode *inode)
 EXPORT_SYMBOL_GPL(hot_inode_item_unlink);
 
 /*
+ * Calculate a new temperature and, if necessary,
+ * move the list_head corresponding to this inode or range
+ * to the proper list with the new temperature.
+ */
+static void hot_inode_map_update(struct hot_info *root,
+			struct hot_inode_item *he)
+{
+	u32 temp = hot_temp_calc(&he->freq);
+	u8 temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false);
+	u8 temp_prev = (u8)hot_bit_shift((u64)he->freq.last_temp,
+				(32 - MAP_BITS), false);
+
+	he->freq.last_temp = temp;
+
+	spin_lock(&root->t_lock);
+	if (!list_empty(&he->track_list)
+		&& (temp_cur != temp_prev)) {
+		list_del_init(&he->track_list);
+		list_add_tail(&he->track_list,
+			&root->hot_map[TYPE_INODE][temp_cur]);
+	}
+	spin_unlock(&root->t_lock);
+}
+
+static int hot_inode_temp_cmp(void *priv, struct list_head *a,
+				struct list_head *b)
+{
+	struct hot_inode_item *ap = container_of(a,
+			struct hot_inode_item, track_list);
+	struct hot_inode_item *bp = container_of(b,
+			struct hot_inode_item, track_list);
+
+	int diff = ap->freq.last_temp - bp->freq.last_temp;
+	if (diff > 0)
+		return -1;
+	else if (diff < 0)
+		return 1;
+	else
+		return 0;
+}
+
+/*
  * This function does the actual work of updating
  * the frequency numbers.
  *
@@ -318,6 +437,128 @@ static void hot_freq_update(struct hot_info *root,
 }
 
 /*
+ * hot_temp_calc() is responsible for distilling the six heat
+ * criteria down into a single temperature value for the data,
+ * which is an integer between 0 and HEAT_MAX_VALUE.
+ *
+ * With the six values, we first do some very rudimentary
+ * "normalizations" to each metric such that they affect the
+ * final temperature calculation exactly the right way. It's
+ * important to note that we still weren't really sure that
+ * these six adjustments were exactly right.
+ * They could definitely use more tweaking and adjustment,
+ * especially in terms of the memory footprint they consume.
+ *
+ * Next, we take the adjusted values and shift them down to
+ * a manageable size, whereafter they are weighted using the
+ * the *_COEFF_POWER values and combined to a single temperature
+ * value.
+ */
+u32 hot_temp_calc(struct hot_freq *freq)
+{
+	u32 result = 0;
+
+	struct timespec ckt = current_kernel_time();
+	u64 cur_time = timespec_to_ns(&ckt);
+	u32 nrr_heat, nrw_heat;
+	u64 ltr_heat, ltw_heat, avr_heat, avw_heat;
+
+	nrr_heat = (u32)hot_bit_shift((u64)freq->nr_reads,
+					NRR_MULTIPLIER_POWER, true);
+	nrw_heat = (u32)hot_bit_shift((u64)freq->nr_writes,
+					NRW_MULTIPLIER_POWER, true);
+
+	ltr_heat =
+	hot_bit_shift((cur_time - timespec_to_ns(&freq->last_read_time)),
+			LTR_DIVIDER_POWER, false);
+	ltw_heat =
+	hot_bit_shift((cur_time - timespec_to_ns(&freq->last_write_time)),
+			LTW_DIVIDER_POWER, false);
+
+	avr_heat =
+	hot_bit_shift((((u64) -1) - freq->avg_delta_reads),
+			AVR_DIVIDER_POWER, false);
+	avw_heat =
+	hot_bit_shift((((u64) -1) - freq->avg_delta_writes),
+			AVW_DIVIDER_POWER, false);
+
+	/* ltr_heat is now guaranteed to be u32 safe */
+	if (ltr_heat >= hot_bit_shift((u64) 1, 32, true))
+		ltr_heat = 0;
+	else
+		ltr_heat = hot_bit_shift((u64) 1, 32, true) - ltr_heat;
+
+	/* ltw_heat is now guaranteed to be u32 safe */
+	if (ltw_heat >= hot_bit_shift((u64) 1, 32, true))
+		ltw_heat = 0;
+	else
+		ltw_heat = hot_bit_shift((u64) 1, 32, true) - ltw_heat;
+
+	/* avr_heat is now guaranteed to be u32 safe */
+	if (avr_heat >= hot_bit_shift((u64) 1, 32, true))
+		avr_heat = (u32) -1;
+
+	/* avw_heat is now guaranteed to be u32 safe */
+	if (avw_heat >= hot_bit_shift((u64) 1, 32, true))
+		avw_heat = (u32) -1;
+
+	nrr_heat = (u32)hot_bit_shift((u64)nrr_heat,
+		(3 - NRR_COEFF_POWER), false);
+	nrw_heat = (u32)hot_bit_shift((u64)nrw_heat,
+		(3 - NRW_COEFF_POWER), false);
+	ltr_heat = hot_bit_shift(ltr_heat, (3 - LTR_COEFF_POWER), false);
+	ltw_heat = hot_bit_shift(ltw_heat, (3 - LTW_COEFF_POWER), false);
+	avr_heat = hot_bit_shift(avr_heat, (3 - AVR_COEFF_POWER), false);
+	avw_heat = hot_bit_shift(avw_heat, (3 - AVW_COEFF_POWER), false);
+
+	result = nrr_heat + nrw_heat + (u32) ltr_heat +
+		(u32) ltw_heat + (u32) avr_heat + (u32) avw_heat;
+
+	return result;
+}
+
+/*
+ * Every sync period we update temperatures for
+ * each hot inode item and hot range item for aging
+ * purposes.
+ */
+static void hot_update_worker(struct work_struct *work)
+{
+	struct hot_info *root = container_of(to_delayed_work(work),
+					struct hot_info, update_work);
+	struct hot_inode_item *he;
+	struct rb_node *node;
+	int i;
+
+	rcu_read_lock();
+	node = rb_first(&root->hot_inode_tree);
+	while (node) {
+		he = rb_entry(node, struct hot_inode_item, rb_node);
+		node = rb_next(node);
+		hot_inode_map_update(root, he);
+		hot_range_tree_update(he, root);
+	}
+	rcu_read_unlock();
+
+	/* Sort temperature map info based on last temperature */
+	for (i = 0; i < MAP_SIZE; i++) {
+		spin_lock(&root->t_lock);
+		list_sort(NULL, &root->hot_map[TYPE_INODE][i],
+			hot_inode_temp_cmp);
+		spin_unlock(&root->t_lock);
+
+		spin_lock(&root->m_lock);
+		list_sort(NULL, &root->hot_map[TYPE_RANGE][i],
+			hot_range_temp_cmp);
+		spin_unlock(&root->m_lock);
+	}
+
+	/* Instert next delayed work */
+	queue_delayed_work(root->update_wq, &root->update_work,
+		msecs_to_jiffies(HOT_UPDATE_INTERVAL * MSEC_PER_SEC));
+}
+
+/*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
 void __init hot_cache_init(void)
@@ -401,6 +642,26 @@ static struct hot_info *hot_tree_init(struct super_block *sb)
 
 	root->hot_inode_tree = RB_ROOT;
 	spin_lock_init(&root->t_lock);
+	spin_lock_init(&root->m_lock);
+
+	for (i = 0; i < MAP_SIZE; i++) {
+		for (j = 0; j < MAX_TYPES; j++)
+			INIT_LIST_HEAD(&root->hot_map[j][i]);
+	}
+
+	root->update_wq = alloc_workqueue(
+			"hot_update_wq", WQ_NON_REENTRANT, 0);
+	if (!root->update_wq) {
+		printk(KERN_ERR "%s: Failed to create "
+				"hot update workqueue\n", __func__);
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Initialize hot tracking wq and arm one delayed work */
+	INIT_DELAYED_WORK(&root->update_work, hot_update_worker);
+	queue_delayed_work(root->update_wq, &root->update_work,
+		msecs_to_jiffies(HOT_UPDATE_INTERVAL * MSEC_PER_SEC));
 
 	return root;
 }
@@ -412,6 +673,9 @@ static void hot_tree_exit(struct hot_info *root)
 {
 	struct rb_node *node;
 
+	cancel_delayed_work_sync(&root->update_work);
+	destroy_workqueue(root->update_wq);
+
 	spin_lock(&root->t_lock);
 	node = rb_first(&root->hot_inode_tree);
 	while (node) {
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index bb4cb16..0be7621 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -12,10 +12,34 @@
 #ifndef __HOT_TRACKING__
 #define __HOT_TRACKING__
 
+#include <linux/workqueue.h>
 #include <linux/hot_tracking.h>
 
+#define HOT_UPDATE_INTERVAL 150
+
 /* size of sub-file ranges */
 #define RANGE_BITS 20
 #define FREQ_POWER 4
 
+/* NRR/NRW heat unit = 2^X accesses */
+#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */
+#define NRR_COEFF_POWER 0
+#define NRW_MULTIPLIER_POWER 20 /* NRW - number of writes since mount */
+#define NRW_COEFF_POWER 0
+
+/* LTR/LTW heat unit = 2^X ns of age */
+#define LTR_DIVIDER_POWER 30 /* LTR - time elapsed since last read(ns) */
+#define LTR_COEFF_POWER 1
+#define LTW_DIVIDER_POWER 30 /* LTW - time elapsed since last write(ns) */
+#define LTW_COEFF_POWER 1
+
+/*
+ * AVR/AVW cold unit = 2^X ns of average delta
+ * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit
+ */
+#define AVR_DIVIDER_POWER 40 /* AVR - average delta between recent reads(ns) */
+#define AVR_COEFF_POWER 0
+#define AVW_DIVIDER_POWER 40 /* AVW - average delta between recent writes(ns) */
+#define AVW_COEFF_POWER 0
+
 #endif /* __HOT_TRACKING__ */
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index e2a9d50..9095859 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -55,6 +55,7 @@ struct hot_inode_item {
 	struct kref refs;
 	struct rb_node rb_node;         /* rbtree index */
 	struct rcu_head rcu;
+	struct list_head track_list;    /* link to *_map[] */
 	struct rb_root hot_range_tree;	/* tree of ranges */
 	spinlock_t i_lock;		/* protect above tree */
 	struct hot_info *hot_root;	/* associated hot_info */
@@ -70,6 +71,7 @@ struct hot_range_item {
 	struct kref refs;
 	struct rb_node rb_node;                 /* rbtree index */
 	struct rcu_head rcu;
+	struct list_head track_list;            /* link to *_map[] */
 	struct hot_inode_item *hot_inode;	/* associated hot_inode_item */
 	loff_t start;				/* offset in bytes */
 	size_t len;				/* length in bytes */
@@ -77,7 +79,11 @@ struct hot_range_item {
 
 struct hot_info {
 	struct rb_root hot_inode_tree;
-	spinlock_t t_lock;				/* protect above tree */
+	struct list_head hot_map[MAX_TYPES][MAP_SIZE];	/* map of inode temp */
+	spinlock_t t_lock;		/* protect tree and map for inode item */
+	spinlock_t m_lock;		/* protect map for range item */
+	struct workqueue_struct *update_wq;
+	struct delayed_work update_work;
 };
 
 extern void __init hot_cache_init(void);
@@ -96,6 +102,7 @@ extern struct hot_inode_item
 *hot_inode_item_lookup(struct hot_info *root,
 			u64 ino, int alloc);
 extern void hot_inode_item_unlink(struct inode *inode);
+extern u32 hot_temp_calc(struct hot_freq *freq);
 extern inline bool hot_track_enabled(struct inode *inode, size_t len);
 
 static inline u64 hot_bit_shift(u64 counter, u32 bits, bool dir)
-- 
1.7.11.7


  parent reply	other threads:[~2013-08-05 14:49 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-05 14:49 [PATCH v4 00/10] VFS hot tracking zwu.kernel
2013-08-05 14:49 ` [PATCH v4 01/10] VFS hot tracking: Define basic data structures and functions zwu.kernel
2013-08-05 14:49 ` [PATCH v4 02/10] VFS hot tracking: Track IO and record heat information zwu.kernel
2013-08-05 14:49 ` zwu.kernel [this message]
2013-08-05 14:49 ` [PATCH v4 04/10] VFS hot tracking: Add shrinker functionality to curtail memory usage zwu.kernel
2013-08-05 14:49 ` [PATCH v4 05/10] VFS hot tracking: Add an ioctl to get hot tracking information zwu.kernel
2013-08-05 14:49 ` [PATCH v4 06/10] VFS hot tracking: Add a /proc interface to make the interval tunable zwu.kernel
2013-08-05 14:49 ` [PATCH v4 07/10] VFS hot tracking: Add two /proc interfaces to control memory usage zwu.kernel
2013-08-05 14:49 ` [PATCH v4 08/10] VFS hot tracking: Add documentation zwu.kernel
2013-08-05 14:49 ` [PATCH v4 09/10] VFS hot tracking, btrfs: Add hot tracking support zwu.kernel
2013-08-05 14:50 ` [PATCH v4 10/10] VFS hot tracking, xfs: " zwu.kernel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1375714200-23944-4-git-send-email-zwu.kernel@gmail.com \
    --to=zwu.kernel@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=sekharan@us.ibm.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wuzhy@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.