From mboxrd@z Thu Jan 1 00:00:00 1970 From: zwu.kernel@gmail.com Subject: [PATCH v4 03/10] VFS hot tracking: Add a workqueue to move items between hot maps Date: Mon, 5 Aug 2013 22:49:53 +0800 Message-ID: <1375714200-23944-4-git-send-email-zwu.kernel@gmail.com> References: <1375714200-23944-1-git-send-email-zwu.kernel@gmail.com> Cc: linux-fsdevel@vger.kernel.org, sekharan@us.ibm.com, Zhi Yong Wu To: viro@zeniv.linux.org.uk Return-path: Received: from e8.ny.us.ibm.com ([32.97.182.138]:46071 "EHLO e8.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751296Ab3HEOt0 (ORCPT ); Mon, 5 Aug 2013 10:49:26 -0400 Received: from /spool/local by e8.ny.us.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 5 Aug 2013 15:49:26 +0100 Received: from d01relay07.pok.ibm.com (d01relay07.pok.ibm.com [9.56.227.147]) by d01dlp03.pok.ibm.com (Postfix) with ESMTP id D8965C9003E for ; Mon, 5 Aug 2013 10:49:20 -0400 (EDT) Received: from d01av01.pok.ibm.com (d01av01.pok.ibm.com [9.56.224.215]) by d01relay07.pok.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r75EnE2Y20250854 for ; Mon, 5 Aug 2013 10:49:16 -0400 Received: from d01av01.pok.ibm.com (loopback [127.0.0.1]) by d01av01.pok.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id r75EnEVc021391 for ; Mon, 5 Aug 2013 10:49:14 -0400 In-Reply-To: <1375714200-23944-1-git-send-email-zwu.kernel@gmail.com> Sender: linux-fsdevel-owner@vger.kernel.org List-ID: From: Zhi Yong Wu Add a workqueue per superblock and a delayed_work to run periodic work to update map info on each superblock. Two arrays of map list are defined, one is for hot inode items, and the other is for hot extent items. The hot items in the RB-tree will be at first distilled into one temperature in the range [0, 255]. It will be linked to its corresponding array of map list which uses the temperature as its index. 
Signed-off-by: Chandra Seetharaman Signed-off-by: Zhi Yong Wu --- fs/hot_tracking.c | 264 +++++++++++++++++++++++++++++++++++++++++++ fs/hot_tracking.h | 24 ++++ include/linux/hot_tracking.h | 9 +- 3 files changed, 296 insertions(+), 1 deletion(-) diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c index e2a6e84..857d423 100644 --- a/fs/hot_tracking.c +++ b/fs/hot_tracking.c @@ -34,6 +34,7 @@ static void hot_range_item_init(struct hot_range_item *hr, struct hot_inode_item *he, loff_t start) { kref_init(&hr->refs); + INIT_LIST_HEAD(&hr->track_list); hr->freq.avg_delta_reads = (u64) -1; hr->freq.avg_delta_writes = (u64) -1; hr->start = start; @@ -56,6 +57,9 @@ static void hot_range_item_free(struct kref *kref) struct hot_info *root = hr->hot_inode->hot_root; rb_erase(&hr->rb_node, &hr->hot_inode->hot_range_tree); + spin_lock(&root->m_lock); + list_del_init(&hr->track_list); + spin_unlock(&root->m_lock); call_rcu(&hr->rcu, hot_range_item_free_cb); } @@ -81,6 +85,8 @@ struct hot_range_item struct rb_node **p; struct rb_node *parent = NULL; struct hot_range_item *hr, *hr_new = NULL; + u32 temp; + u8 temp_cur; start = hot_bit_shift(start, RANGE_BITS, true); @@ -114,6 +120,12 @@ redo: if (hr_new) { rb_link_node(&hr_new->rb_node, parent, p); rb_insert_color(&hr_new->rb_node, &he->hot_range_tree); + temp = hot_temp_calc(&hr_new->freq); + temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false); + spin_lock(&he->hot_root->m_lock); + list_add_tail(&hr_new->track_list, + &he->hot_root->hot_map[TYPE_RANGE][temp_cur]); + spin_unlock(&he->hot_root->m_lock); hot_range_item_get(hr_new); /* For the caller */ spin_unlock(&he->i_lock); return hr_new; @@ -152,10 +164,68 @@ static void hot_range_tree_free(struct hot_inode_item *he) spin_unlock(&he->i_lock); } +static void hot_range_map_update(struct hot_info *root, + struct hot_range_item *hr) +{ + u32 temp = hot_temp_calc(&hr->freq); + u8 temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false); + u8 temp_prev = 
(u8)hot_bit_shift((u64)hr->freq.last_temp, + (32 - MAP_BITS), false); + + hr->freq.last_temp = temp; + + spin_lock(&root->m_lock); + if (!list_empty(&hr->track_list) + && (temp_cur != temp_prev)) { + list_del_init(&hr->track_list); + list_add_tail(&hr->track_list, + &root->hot_map[TYPE_RANGE][temp_cur]); + } + spin_unlock(&root->m_lock); +} + +/* + * Update temperatures for each range item for aging purposes. + * If one hot range item is old, it will be aged out. + */ +static void hot_range_tree_update(struct hot_inode_item *he, + struct hot_info *root) +{ + struct rb_node *node; + struct hot_range_item *hr; + + rcu_read_lock(); + node = rb_first(&he->hot_range_tree); + while (node) { + hr = rb_entry(node, struct hot_range_item, rb_node); + node = rb_next(node); + hot_range_map_update(root, hr); + } + rcu_read_unlock(); +} + +static int hot_range_temp_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct hot_range_item *ap = container_of(a, + struct hot_range_item, track_list); + struct hot_range_item *bp = container_of(b, + struct hot_range_item, track_list); + + int diff = ap->freq.last_temp - bp->freq.last_temp; + if (diff > 0) + return -1; + else if (diff < 0) + return 1; + else + return 0; +} + static void hot_inode_item_init(struct hot_inode_item *he, struct hot_info *root, u64 ino) { kref_init(&he->refs); + INIT_LIST_HEAD(&he->track_list); he->freq.avg_delta_reads = (u64) -1; he->freq.avg_delta_writes = (u64) -1; he->i_ino = ino; @@ -177,6 +247,7 @@ static void hot_inode_item_free(struct kref *kref) struct hot_inode_item, refs); rb_erase(&he->rb_node, &he->hot_root->hot_inode_tree); + list_del_init(&he->track_list); hot_range_tree_free(he); call_rcu(&he->rcu, hot_inode_item_free_cb); } @@ -203,6 +274,8 @@ struct hot_inode_item struct rb_node **p; struct rb_node *parent = NULL; struct hot_inode_item *he, *he_new = NULL; + u32 temp; + u8 temp_cur; /* walk tree to find insertion point */ redo: @@ -234,6 +307,10 @@ redo: if (he_new) { 
rb_link_node(&he_new->rb_node, parent, p); rb_insert_color(&he_new->rb_node, &root->hot_inode_tree); + temp = hot_temp_calc(&he_new->freq); + temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false); + list_add_tail(&he_new->track_list, + &root->hot_map[TYPE_INODE][temp_cur]); hot_inode_item_get(he_new); /* For the caller */ spin_unlock(&root->t_lock); return he_new; @@ -273,6 +350,48 @@ void hot_inode_item_unlink(struct inode *inode) EXPORT_SYMBOL_GPL(hot_inode_item_unlink); /* + * Calculate a new temperature and, if necessary, + * move the list_head corresponding to this inode or range + * to the proper list with the new temperature. + */ +static void hot_inode_map_update(struct hot_info *root, + struct hot_inode_item *he) +{ + u32 temp = hot_temp_calc(&he->freq); + u8 temp_cur = (u8)hot_bit_shift((u64)temp, (32 - MAP_BITS), false); + u8 temp_prev = (u8)hot_bit_shift((u64)he->freq.last_temp, + (32 - MAP_BITS), false); + + he->freq.last_temp = temp; + + spin_lock(&root->t_lock); + if (!list_empty(&he->track_list) + && (temp_cur != temp_prev)) { + list_del_init(&he->track_list); + list_add_tail(&he->track_list, + &root->hot_map[TYPE_INODE][temp_cur]); + } + spin_unlock(&root->t_lock); +} + +static int hot_inode_temp_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct hot_inode_item *ap = container_of(a, + struct hot_inode_item, track_list); + struct hot_inode_item *bp = container_of(b, + struct hot_inode_item, track_list); + + int diff = ap->freq.last_temp - bp->freq.last_temp; + if (diff > 0) + return -1; + else if (diff < 0) + return 1; + else + return 0; +} + +/* * This function does the actual work of updating * the frequency numbers. * @@ -318,6 +437,128 @@ static void hot_freq_update(struct hot_info *root, } /* + * hot_temp_calc() is responsible for distilling the six heat + * criteria down into a single temperature value for the data, + * which is an integer between 0 and HEAT_MAX_VALUE. 
+ * + * With the six values, we first do some very rudimentary + * "normalizations" to each metric such that they affect the + * final temperature calculation exactly the right way. It's + * important to note that we still weren't really sure that + * these six adjustments were exactly right. + * They could definitely use more tweaking and adjustment, + * especially in terms of the memory footprint they consume. + * + * Next, we take the adjusted values and shift them down to + * a manageable size, whereafter they are weighted using + * the *_COEFF_POWER values and combined to a single temperature + * value. + */ +u32 hot_temp_calc(struct hot_freq *freq) +{ + u32 result = 0; + + struct timespec ckt = current_kernel_time(); + u64 cur_time = timespec_to_ns(&ckt); + u32 nrr_heat, nrw_heat; + u64 ltr_heat, ltw_heat, avr_heat, avw_heat; + + nrr_heat = (u32)hot_bit_shift((u64)freq->nr_reads, + NRR_MULTIPLIER_POWER, true); + nrw_heat = (u32)hot_bit_shift((u64)freq->nr_writes, + NRW_MULTIPLIER_POWER, true); + + ltr_heat = + hot_bit_shift((cur_time - timespec_to_ns(&freq->last_read_time)), + LTR_DIVIDER_POWER, false); + ltw_heat = + hot_bit_shift((cur_time - timespec_to_ns(&freq->last_write_time)), + LTW_DIVIDER_POWER, false); + + avr_heat = + hot_bit_shift((((u64) -1) - freq->avg_delta_reads), + AVR_DIVIDER_POWER, false); + avw_heat = + hot_bit_shift((((u64) -1) - freq->avg_delta_writes), + AVW_DIVIDER_POWER, false); + + /* ltr_heat is now guaranteed to be u32 safe */ + if (ltr_heat >= hot_bit_shift((u64) 1, 32, true)) + ltr_heat = 0; + else + ltr_heat = hot_bit_shift((u64) 1, 32, true) - ltr_heat; + + /* ltw_heat is now guaranteed to be u32 safe */ + if (ltw_heat >= hot_bit_shift((u64) 1, 32, true)) + ltw_heat = 0; + else + ltw_heat = hot_bit_shift((u64) 1, 32, true) - ltw_heat; + + /* avr_heat is now guaranteed to be u32 safe */ + if (avr_heat >= hot_bit_shift((u64) 1, 32, true)) + avr_heat = (u32) -1; + + /* avw_heat is now guaranteed to be u32 safe */ + if 
(avw_heat >= hot_bit_shift((u64) 1, 32, true)) + avw_heat = (u32) -1; + + nrr_heat = (u32)hot_bit_shift((u64)nrr_heat, + (3 - NRR_COEFF_POWER), false); + nrw_heat = (u32)hot_bit_shift((u64)nrw_heat, + (3 - NRW_COEFF_POWER), false); + ltr_heat = hot_bit_shift(ltr_heat, (3 - LTR_COEFF_POWER), false); + ltw_heat = hot_bit_shift(ltw_heat, (3 - LTW_COEFF_POWER), false); + avr_heat = hot_bit_shift(avr_heat, (3 - AVR_COEFF_POWER), false); + avw_heat = hot_bit_shift(avw_heat, (3 - AVW_COEFF_POWER), false); + + result = nrr_heat + nrw_heat + (u32) ltr_heat + + (u32) ltw_heat + (u32) avr_heat + (u32) avw_heat; + + return result; +} + +/* + * Every sync period we update temperatures for + * each hot inode item and hot range item for aging + * purposes. + */ +static void hot_update_worker(struct work_struct *work) +{ + struct hot_info *root = container_of(to_delayed_work(work), + struct hot_info, update_work); + struct hot_inode_item *he; + struct rb_node *node; + int i; + + rcu_read_lock(); + node = rb_first(&root->hot_inode_tree); + while (node) { + he = rb_entry(node, struct hot_inode_item, rb_node); + node = rb_next(node); + hot_inode_map_update(root, he); + hot_range_tree_update(he, root); + } + rcu_read_unlock(); + + /* Sort temperature map info based on last temperature */ + for (i = 0; i < MAP_SIZE; i++) { + spin_lock(&root->t_lock); + list_sort(NULL, &root->hot_map[TYPE_INODE][i], + hot_inode_temp_cmp); + spin_unlock(&root->t_lock); + + spin_lock(&root->m_lock); + list_sort(NULL, &root->hot_map[TYPE_RANGE][i], + hot_range_temp_cmp); + spin_unlock(&root->m_lock); + } + + /* Insert next delayed work */ + queue_delayed_work(root->update_wq, &root->update_work, + msecs_to_jiffies(HOT_UPDATE_INTERVAL * MSEC_PER_SEC)); +} + +/* + * Initialize kmem cache for hot_inode_item and hot_range_item. 
*/ void __init hot_cache_init(void) @@ -401,6 +642,26 @@ static struct hot_info *hot_tree_init(struct super_block *sb) root->hot_inode_tree = RB_ROOT; spin_lock_init(&root->t_lock); + spin_lock_init(&root->m_lock); + + for (i = 0; i < MAP_SIZE; i++) { + for (j = 0; j < MAX_TYPES; j++) + INIT_LIST_HEAD(&root->hot_map[j][i]); + } + + root->update_wq = alloc_workqueue( + "hot_update_wq", WQ_NON_REENTRANT, 0); + if (!root->update_wq) { + printk(KERN_ERR "%s: Failed to create " + "hot update workqueue\n", __func__); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + /* Initialize hot tracking wq and arm one delayed work */ + INIT_DELAYED_WORK(&root->update_work, hot_update_worker); + queue_delayed_work(root->update_wq, &root->update_work, + msecs_to_jiffies(HOT_UPDATE_INTERVAL * MSEC_PER_SEC)); return root; } @@ -412,6 +673,9 @@ static void hot_tree_exit(struct hot_info *root) { struct rb_node *node; + cancel_delayed_work_sync(&root->update_work); + destroy_workqueue(root->update_wq); + spin_lock(&root->t_lock); node = rb_first(&root->hot_inode_tree); while (node) { diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h index bb4cb16..0be7621 100644 --- a/fs/hot_tracking.h +++ b/fs/hot_tracking.h @@ -12,10 +12,34 @@ #ifndef __HOT_TRACKING__ #define __HOT_TRACKING__ +#include #include +#define HOT_UPDATE_INTERVAL 150 + /* size of sub-file ranges */ #define RANGE_BITS 20 #define FREQ_POWER 4 +/* NRR/NRW heat unit = 2^X accesses */ +#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */ +#define NRR_COEFF_POWER 0 +#define NRW_MULTIPLIER_POWER 20 /* NRW - number of writes since mount */ +#define NRW_COEFF_POWER 0 + +/* LTR/LTW heat unit = 2^X ns of age */ +#define LTR_DIVIDER_POWER 30 /* LTR - time elapsed since last read(ns) */ +#define LTR_COEFF_POWER 1 +#define LTW_DIVIDER_POWER 30 /* LTW - time elapsed since last write(ns) */ +#define LTW_COEFF_POWER 1 + +/* + * AVR/AVW cold unit = 2^X ns of average delta + * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit + 
*/ +#define AVR_DIVIDER_POWER 40 /* AVR - average delta between recent reads(ns) */ +#define AVR_COEFF_POWER 0 +#define AVW_DIVIDER_POWER 40 /* AVW - average delta between recent writes(ns) */ +#define AVW_COEFF_POWER 0 + #endif /* __HOT_TRACKING__ */ diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h index e2a9d50..9095859 100644 --- a/include/linux/hot_tracking.h +++ b/include/linux/hot_tracking.h @@ -55,6 +55,7 @@ struct hot_inode_item { struct kref refs; struct rb_node rb_node; /* rbtree index */ struct rcu_head rcu; + struct list_head track_list; /* link to *_map[] */ struct rb_root hot_range_tree; /* tree of ranges */ spinlock_t i_lock; /* protect above tree */ struct hot_info *hot_root; /* associated hot_info */ @@ -70,6 +71,7 @@ struct hot_range_item { struct kref refs; struct rb_node rb_node; /* rbtree index */ struct rcu_head rcu; + struct list_head track_list; /* link to *_map[] */ struct hot_inode_item *hot_inode; /* associated hot_inode_item */ loff_t start; /* offset in bytes */ size_t len; /* length in bytes */ @@ -77,7 +79,11 @@ struct hot_range_item { struct hot_info { struct rb_root hot_inode_tree; - spinlock_t t_lock; /* protect above tree */ + struct list_head hot_map[MAX_TYPES][MAP_SIZE]; /* map of inode temp */ + spinlock_t t_lock; /* protect tree and map for inode item */ + spinlock_t m_lock; /* protect map for range item */ + struct workqueue_struct *update_wq; + struct delayed_work update_work; }; extern void __init hot_cache_init(void); @@ -96,6 +102,7 @@ extern struct hot_inode_item *hot_inode_item_lookup(struct hot_info *root, u64 ino, int alloc); extern void hot_inode_item_unlink(struct inode *inode); +extern u32 hot_temp_calc(struct hot_freq *freq); extern inline bool hot_track_enabled(struct inode *inode, size_t len); static inline u64 hot_bit_shift(u64 counter, u32 bits, bool dir) -- 1.7.11.7