linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] KSM: numa awareness sysfs knob
@ 2012-01-23 10:29 Petr Holasek
  2012-01-25  0:03 ` Andrew Morton
  0 siblings, 1 reply; 5+ messages in thread
From: Petr Holasek @ 2012-01-23 10:29 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Andrea Arcangeli, Chris Wright, Izik Eidus,
	linux-kernel, linux-mm, Anton Arapov, Petr Holasek

This patch is based on RFC

https://lkml.org/lkml/2011/11/30/91

Introduces new sysfs binary knob /sys/kernel/mm/ksm/merge_nodes
which controls merging of pages across different NUMA nodes.
When it is set to zero only pages from the same node are merged,
otherwise pages from all nodes can be merged together (default behavior).

Typical use-case could be a lot of KVM guests on NUMA machine
where cpus from more distant nodes would have significant increase
of access latency to the merged ksm page. Switching merge_nodes
to 1 will result in these steps:

	1) unmerging all ksm pages
	2) re-merging all pages from VM_MERGEABLE vmas only within
		their NUMA nodes.
	3) lower average access latency to merged pages at the
	   expense of higher memory usage.

Every numa node has its own stable & unstable trees because
of faster searching and inserting. Changing of merge_nodes
value breaks COW on all current ksm pages.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
---
 Documentation/vm/ksm.txt |    3 +
 mm/ksm.c                 |  124 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 108 insertions(+), 19 deletions(-)

diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index b392e49..ac9fc42 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -58,6 +58,9 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
                    e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
                    Default: 20 (chosen for demonstration purposes)
 
+merge_nodes      - specifies if pages from different numa nodes can be merged
+                   Default: 1
+
 run              - set 0 to stop ksmd from running but keep merged pages,
                    set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
                    set 2 to stop ksmd and unmerge all pages currently merged,
diff --git a/mm/ksm.c b/mm/ksm.c
index 1925ffb..402e4bc 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -37,6 +37,7 @@
 #include <linux/hash.h>
 #include <linux/freezer.h>
 #include <linux/oom.h>
+#include <linux/numa.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -121,6 +122,7 @@ struct stable_node {
 	struct rb_node node;
 	struct hlist_head hlist;
 	unsigned long kpfn;
+	struct rb_root *root;
 };
 
 /**
@@ -141,7 +143,10 @@ struct rmap_item {
 	unsigned long address;		/* + low bits used for flags below */
 	unsigned int oldchecksum;	/* when unstable */
 	union {
-		struct rb_node node;	/* when node of unstable tree */
+		struct {
+			struct rb_node node;	/* when node of unstable tree */
+			struct rb_root *root;
+		};
 		struct {		/* when listed from stable tree */
 			struct stable_node *head;
 			struct hlist_node hlist;
@@ -154,8 +159,8 @@ struct rmap_item {
 #define STABLE_FLAG	0x200	/* is listed from the stable tree */
 
 /* The stable and unstable tree heads */
-static struct rb_root root_stable_tree = RB_ROOT;
-static struct rb_root root_unstable_tree = RB_ROOT;
+static struct rb_root root_unstable_tree[MAX_NUMNODES] = { RB_ROOT, };
+static struct rb_root root_stable_tree[MAX_NUMNODES] = { RB_ROOT, };
 
 #define MM_SLOTS_HASH_SHIFT 10
 #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
@@ -190,6 +195,9 @@ static unsigned int ksm_thread_pages_to_scan = 100;
 /* Milliseconds ksmd should sleep between batches */
 static unsigned int ksm_thread_sleep_millisecs = 20;
 
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_nodes = 1;
+
 #define KSM_RUN_STOP	0
 #define KSM_RUN_MERGE	1
 #define KSM_RUN_UNMERGE	2
@@ -459,7 +467,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 		cond_resched();
 	}
 
-	rb_erase(&stable_node->node, &root_stable_tree);
+	rb_erase(&stable_node->node, stable_node->root);
 	free_stable_node(stable_node);
 }
 
@@ -557,7 +565,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
 		BUG_ON(age > 1);
 		if (!age)
-			rb_erase(&rmap_item->node, &root_unstable_tree);
+			rb_erase(&rmap_item->node, rmap_item->root);
 
 		ksm_pages_unshared--;
 		rmap_item->address &= PAGE_MASK;
@@ -986,8 +994,9 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
  */
 static struct page *stable_tree_search(struct page *page)
 {
-	struct rb_node *node = root_stable_tree.rb_node;
+	struct rb_node *node;
 	struct stable_node *stable_node;
+	int nid;
 
 	stable_node = page_stable_node(page);
 	if (stable_node) {			/* ksm page forked */
@@ -995,6 +1004,13 @@ static struct page *stable_tree_search(struct page *page)
 		return page;
 	}
 
+	if (ksm_merge_nodes)
+		nid = 0;
+	else
+		nid = page_to_nid(page);
+
+	node = root_stable_tree[nid].rb_node;
+
 	while (node) {
 		struct page *tree_page;
 		int ret;
@@ -1029,10 +1045,18 @@ static struct page *stable_tree_search(struct page *page)
  */
 static struct stable_node *stable_tree_insert(struct page *kpage)
 {
-	struct rb_node **new = &root_stable_tree.rb_node;
+	int nid;
+	struct rb_node **new = NULL;
 	struct rb_node *parent = NULL;
 	struct stable_node *stable_node;
 
+	if (ksm_merge_nodes)
+		nid = 0;
+	else
+		nid = page_to_nid(kpage);
+
+	new = &root_stable_tree[nid].rb_node;
+
 	while (*new) {
 		struct page *tree_page;
 		int ret;
@@ -1066,12 +1090,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 		return NULL;
 
 	rb_link_node(&stable_node->node, parent, new);
-	rb_insert_color(&stable_node->node, &root_stable_tree);
+	rb_insert_color(&stable_node->node, &root_stable_tree[nid]);
 
 	INIT_HLIST_HEAD(&stable_node->hlist);
 
 	stable_node->kpfn = page_to_pfn(kpage);
+	stable_node->root = &root_stable_tree[nid];
 	set_page_stable_node(kpage, stable_node);
+	printk(KERN_DEBUG "Stable node was inserted into tree %d\n", nid);
 
 	return stable_node;
 }
@@ -1094,11 +1120,18 @@ static
 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 					      struct page *page,
 					      struct page **tree_pagep)
-
 {
-	struct rb_node **new = &root_unstable_tree.rb_node;
+	struct rb_node **new = NULL;
+	struct rb_root *root;
 	struct rb_node *parent = NULL;
 
+	if (ksm_merge_nodes)
+		root = &root_unstable_tree[0];
+	else
+		root = &root_unstable_tree[page_to_nid(page)];
+
+	new = &root->rb_node;
+
 	while (*new) {
 		struct rmap_item *tree_rmap_item;
 		struct page *tree_page;
@@ -1135,8 +1168,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 
 	rmap_item->address |= UNSTABLE_FLAG;
 	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+	rmap_item->root = root;
 	rb_link_node(&rmap_item->node, parent, new);
-	rb_insert_color(&rmap_item->node, &root_unstable_tree);
+	rb_insert_color(&rmap_item->node, root);
 
 	ksm_pages_unshared++;
 	return NULL;
@@ -1279,6 +1313,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 	struct mm_slot *slot;
 	struct vm_area_struct *vma;
 	struct rmap_item *rmap_item;
+	int i;
 
 	if (list_empty(&ksm_mm_head.mm_list))
 		return NULL;
@@ -1297,7 +1332,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		 */
 		lru_add_drain_all();
 
-		root_unstable_tree = RB_ROOT;
+		for (i = 0; i < MAX_NUMNODES; i++)
+			root_unstable_tree[i] = RB_ROOT;
 
 		spin_lock(&ksm_mmlist_lock);
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1775,15 +1811,19 @@ static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
 						 unsigned long end_pfn)
 {
 	struct rb_node *node;
+	int i;
 
-	for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
-		struct stable_node *stable_node;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		for (node = rb_first(&root_stable_tree[i]); node;
+				node = rb_next(node)) {
+			struct stable_node *stable_node;
+
+			stable_node = rb_entry(node, struct stable_node, node);
+			if (stable_node->kpfn >= start_pfn &&
+			    stable_node->kpfn < end_pfn)
+				return stable_node;
+		}
 
-		stable_node = rb_entry(node, struct stable_node, node);
-		if (stable_node->kpfn >= start_pfn &&
-		    stable_node->kpfn < end_pfn)
-			return stable_node;
-	}
 	return NULL;
 }
 
@@ -1933,6 +1973,49 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 KSM_ATTR(run);
 
+static ssize_t merge_nodes_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", ksm_merge_nodes);
+}
+
+static ssize_t merge_nodes_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	long unsigned int knob;
+
+	err = kstrtoul(buf, 10, &knob);
+	if (err)
+		return err;
+	if (knob > 1)
+		return -EINVAL;
+
+	mutex_lock(&ksm_thread_mutex);
+	if (ksm_merge_nodes != knob) {
+		/* NUMA mode is changing, so re-merge all */
+		int oom_score_adj;
+
+		oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+		err = unmerge_and_remove_all_rmap_items();
+		compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+							oom_score_adj);
+		if (err) {
+			ksm_run = KSM_RUN_STOP;
+			count = err;
+		}
+	}
+	ksm_merge_nodes = knob;
+	mutex_unlock(&ksm_thread_mutex);
+
+	if (ksm_run & KSM_RUN_MERGE)
+		wake_up_interruptible(&ksm_thread_wait);
+
+	return count;
+}
+KSM_ATTR(merge_nodes);
+
 static ssize_t pages_shared_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
 {
@@ -1987,6 +2070,9 @@ static struct attribute *ksm_attrs[] = {
 	&pages_unshared_attr.attr,
 	&pages_volatile_attr.attr,
 	&full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+	&merge_nodes_attr.attr,
+#endif
 	NULL,
 };
 
-- 
1.7.6.5


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] KSM: numa awareness sysfs knob
  2012-01-23 10:29 [PATCH] KSM: numa awareness sysfs knob Petr Holasek
@ 2012-01-25  0:03 ` Andrew Morton
  2012-01-25 16:40   ` Petr Holasek
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Morton @ 2012-01-25  0:03 UTC (permalink / raw)
  To: Petr Holasek
  Cc: Hugh Dickins, Andrea Arcangeli, Chris Wright, Izik Eidus,
	linux-kernel, linux-mm, Anton Arapov

On Mon, 23 Jan 2012 11:29:28 +0100
Petr Holasek <pholasek@redhat.com> wrote:

> This patch is based on RFC
> 
> https://lkml.org/lkml/2011/11/30/91
> 
> Introduces new sysfs binary knob /sys/kernel/mm/ksm/merge_nodes

It's not binary - it's ascii text!  "boolean" is a better term here ;)

> which control merging pages across different numa nodes.
> When it is set to zero only pages from the same node are merged,
> otherwise pages from all nodes can be merged together (default behavior).
> 
> Typical use-case could be a lot of KVM guests on NUMA machine
> where cpus from more distant nodes would have significant increase
> of access latency to the merged ksm page. Switching merge_nodes
> to 1 will result into these steps:
> 
> 	1) unmerging all ksm pages
> 	2) re-merging all pages from VM_MERGEABLE vmas only within
> 		their NUMA nodes.
> 	3) lower average access latency to merged pages at the
> 	   expense of higher memory usage.
> 
> Every numa node has its own stable & unstable trees because
> of faster searching and inserting. Changing of merge_nodes
> value breaks COW on all current ksm pages.
> 

How useful is this code?  Do you have any performance testing results
to help make the case for merging it?

Should the unmerged case be made permanent and not configurable?  IOW,
what is the argument for continuing to permit the user to merge across
nodes?

Should the code bother doing this unmerge when
/sys/kernel/mm/ksm/merge_nodes is written to?  It would be simpler to
expect the user to configure /sys/kernel/mm/ksm/merge_nodes prior to
using KSM at all?

> @@ -58,6 +58,9 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
>                     e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
>                     Default: 20 (chosen for demonstration purposes)
>  
> +merge_nodes      - specifies if pages from different numa nodes can be merged
> +                   Default: 1

This documentation would be better if it informed the user about how to
use merge_nodes.  What are the effects of altering it and why might
they wish to do this?

>
> ...
>
> +static ssize_t merge_nodes_store(struct kobject *kobj,
> +				   struct kobj_attribute *attr,
> +				   const char *buf, size_t count)
> +{
> +	int err;
> +	long unsigned int knob;

Plain old "unsigned long" is more usual.

Better would be to make this "unsigned", to match ksm_merge_nodes.  Use
kstrtouint() below.

>
> ...
>
> @@ -1987,6 +2070,9 @@ static struct attribute *ksm_attrs[] = {
>  	&pages_unshared_attr.attr,
>  	&pages_volatile_attr.attr,
>  	&full_scans_attr.attr,
> +#ifdef CONFIG_NUMA
> +	&merge_nodes_attr.attr,
> +#endif

One might think that with CONFIG_NUMA=n, we just added a pile of
useless codebloat to vmlinux.  But gcc is sneaky and removes the
unreferenced functions.

However while doing so, gcc shows that it reads
Documentation/SubmitChecklist, section 2:

mm/ksm.c:2017: warning: 'merge_nodes_attr' defined but not used

So...

diff -puN mm/ksm.c~ksm-numa-awareness-sysfs-knob-fix mm/ksm.c
--- a/mm/ksm.c~ksm-numa-awareness-sysfs-knob-fix
+++ a/mm/ksm.c
@@ -1973,6 +1973,7 @@ static ssize_t run_store(struct kobject 
 }
 KSM_ATTR(run);
 
+#ifdef CONFIG_NUMA
 static ssize_t merge_nodes_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf)
 {
@@ -2015,6 +2016,7 @@ static ssize_t merge_nodes_store(struct 
 	return count;
 }
 KSM_ATTR(merge_nodes);
+#endif
 
 static ssize_t pages_shared_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
_


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: KSM: numa awareness sysfs knob
  2012-01-25  0:03 ` Andrew Morton
@ 2012-01-25 16:40   ` Petr Holasek
  0 siblings, 0 replies; 5+ messages in thread
From: Petr Holasek @ 2012-01-25 16:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Andrea Arcangeli, Chris Wright, Izik Eidus,
	linux-kernel, linux-mm, Anton Arapov

On Tue, 24 Jan 2012, Andrew Morton wrote:
> On Mon, 23 Jan 2012 11:29:28 +0100
> Petr Holasek <pholasek@redhat.com> wrote:
> 
> > This patch is based on RFC
> > 
> > https://lkml.org/lkml/2011/11/30/91
> > 
> > Introduces new sysfs binary knob /sys/kernel/mm/ksm/merge_nodes
> 
> It's not binary - it's ascii text!  "boolean" is a better term here ;)
> 

Of course, I'll fix it :)

> > which control merging pages across different numa nodes.
> > When it is set to zero only pages from the same node are merged,
> > otherwise pages from all nodes can be merged together (default behavior).
> > 
> > Typical use-case could be a lot of KVM guests on NUMA machine
> > where cpus from more distant nodes would have significant increase
> > of access latency to the merged ksm page. Switching merge_nodes
> > to 1 will result into these steps:
> > 
> > 	1) unmerging all ksm pages
> > 	2) re-merging all pages from VM_MERGEABLE vmas only within
> > 		their NUMA nodes.
> > 	3) lower average access latency to merged pages at the
> > 	   expense of higher memory usage.
> > 
> > Every numa node has its own stable & unstable trees because
> > of faster searching and inserting. Changing of merge_nodes
> > value breaks COW on all current ksm pages.
> > 
> 
> How useful is this code?  Do you have any performance testing results
> to help make the case for merging it?

I haven't done any performance testing yet, but the number of nodes is still
the same; the only difference is that they are distributed among more trees,
so searching is faster within a specified NUMA node. Every node includes a
pointer to the tree's root, but I assume that is a quite small payload for
faster searching. Or not?

> 
> Should the unmerged case be made permanent and not configurable?  IOW,
> what is the argument for continuing to permit the user to merge across
> nodes?
> 
> Should the code bother doing this unmerge when
> /sys/kernel/mm/ksm/merge_nodes is written to?  It would be simpler to
> expect the user to configure /sys/kernel/mm/ksm/merge_nodes prior to
> using KSM at all?

The only reason for this feature is being more user-friendly. But if
we find some issue in doing merging/unmerging interactively, forcing the user
to set the merge_nodes value before the first ksm run will be safer.

> 
> > @@ -58,6 +58,9 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
> >                     e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
> >                     Default: 20 (chosen for demonstration purposes)
> >  
> > +merge_nodes      - specifies if pages from different numa nodes can be merged
> > +                   Default: 1
> 
> This documentation would be better if it informed the user about how to
> use merge_nodes.  What are the effects of altering it and why might
> they wish to do this?
> 
> >
> > ...
> >
> > +static ssize_t merge_nodes_store(struct kobject *kobj,
> > +				   struct kobj_attribute *attr,
> > +				   const char *buf, size_t count)
> > +{
> > +	int err;
> > +	long unsigned int knob;
> 
> Plain old "unsigned long" is more usual.
> 
> Better would be to make this "unsigned", to match ksm_merge_nodes.  Use
> kstrtouint() below.
> 
> >
> > ...
> >
> > @@ -1987,6 +2070,9 @@ static struct attribute *ksm_attrs[] = {
> >  	&pages_unshared_attr.attr,
> >  	&pages_volatile_attr.attr,
> >  	&full_scans_attr.attr,
> > +#ifdef CONFIG_NUMA
> > +	&merge_nodes_attr.attr,
> > +#endif
> 
> One might think that with CONFIG_NUMA=n, we just added a pile of
> useless codebloat to vmlinux.  But gcc is sneaky and removes the
> unreferenced functions.
> 
> However while doing so, gcc shows that it reads
> Documentation/SubmitChecklist, section 2:
> 
> mm/ksm.c:2017: warning: 'merge_nodes_attr' defined but not used
> 
> So...
> 
> diff -puN mm/ksm.c~ksm-numa-awareness-sysfs-knob-fix mm/ksm.c
> --- a/mm/ksm.c~ksm-numa-awareness-sysfs-knob-fix
> +++ a/mm/ksm.c
> @@ -1973,6 +1973,7 @@ static ssize_t run_store(struct kobject 
>  }
>  KSM_ATTR(run);
>  
> +#ifdef CONFIG_NUMA
>  static ssize_t merge_nodes_show(struct kobject *kobj,
>  				struct kobj_attribute *attr, char *buf)
>  {
> @@ -2015,6 +2016,7 @@ static ssize_t merge_nodes_store(struct 
>  	return count;
>  }
>  KSM_ATTR(merge_nodes);
> +#endif
>  
>  static ssize_t pages_shared_show(struct kobject *kobj,
>  				 struct kobj_attribute *attr, char *buf)
> _
> 

Apologies, I overlooked that. I'll fix it and the other issues you pointed out
above in the next version.

Many thanks for reviewing!

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: KSM: numa awareness sysfs knob
       [not found]     ` <201112011940.19022.nai.xia@gmail.com>
@ 2011-12-01 13:04       ` Petr Holasek
  0 siblings, 0 replies; 5+ messages in thread
From: Petr Holasek @ 2011-12-01 13:04 UTC (permalink / raw)
  To: Nai Xia
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli, linux-kernel,
	linux-mm, Anton Arapov

On Thu, 01 Dec 2011, Nai Xia wrote:

> Date: Thu, 1 Dec 2011 19:40:18 +0800
> From: Nai Xia <nai.xia@gmail.com>
> To: Petr Holasek <pholasek@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>, Hugh Dickins
>  <hughd@google.com>, Andrea Arcangeli <aarcange@redhat.com>,
>  linux-kernel@vger.kernel.org, linux-mm@kvack.org, Anton Arapov
>  <anton@redhat.com>
> Subject: Re: KSM: numa awareness sysfs knob
> Reply-To: nai.xia@gmail.com
> 
> On Thursday 01 December 2011 18:16:40 Petr Holasek wrote:
> > On Wed, 30 Nov 2011, Andrew Morton wrote:
> > 
> > > Date: Wed, 30 Nov 2011 15:47:19 -0800
> > > From: Andrew Morton <akpm@linux-foundation.org>
> > > To: Petr Holasek <pholasek@redhat.com>
> > > Cc: Hugh Dickins <hughd@google.com>, Andrea Arcangeli
> > >  <aarcange@redhat.com>, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
> > >  Anton Arapov <anton@redhat.com>
> > > Subject: Re: [PATCH] [RFC] KSM: numa awareness sysfs knob
> > > 
> > > On Wed, 30 Nov 2011 11:37:26 +0100
> > > Petr Holasek <pholasek@redhat.com> wrote:
> > > 
> > > > Introduce a new sysfs knob /sys/kernel/mm/ksm/max_node_dist, whose
> > > > value will be used as the limitation for node distance of merged pages.
> > > > 
> > > 
> > > The changelog doesn't really describe why you think Linux needs this
> > > feature?  What's the reasoning?  Use cases?  What value does it provide?
> > 
> > Typical use-case could be a lot of KVM guests on NUMA machine and cpus from
> > more distant nodes would have significant increase of access latency to the
> > merged ksm page. I chose sysfs knob for higher scalability.
> 
> Seems this consideration for NUMA is sound. 
> 
> > 
> > > 
> > > > index b392e49..b882140 100644
> > > > --- a/Documentation/vm/ksm.txt
> > > > +++ b/Documentation/vm/ksm.txt
> > > > @@ -58,6 +58,10 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
> > > >                     e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
> > > >                     Default: 20 (chosen for demonstration purposes)
> > > >  
> > > > +max_node_dist    - maximum node distance between two pages which could be
> > > > +                   merged.
> > > > +                   Default: 255 (without any limitations)
> > > 
> > > And this doesn't explain to our users why they might want to alter it,
> > > and what effects they would see from doing so.  Maybe that's obvious to
> > > them...
> > 
> > Now I can't figure out more extensive description of this feature, but we
> > could explain it deeply, of course.
> 
> However, if we don't know what the number fed into this knob really means, 
> seems nobody would think of using this knob...
> 
> Then why not make this NUMA feature automatically adjusted by some algorithm
> instread of dropping it to userland?
> 
> BTW, the algrothim you already include in this patch seems unstable itself:
> 
> Suppose we have three duplicated pages in order: Page_a, Page_b, Page_c with 
> distance(Page_a, Page_b) == distance(Page_b, Page_c) == 3, 
> but distance(Page_a, Page_c) == 6 and if max_node_dist == 3, 
> a stable algorithm should result in Page_a and Page_c being merged to Page_b,
> independent of the order these pages get scanned. 
> 
> But with your patch, if ksmd goes Page_b --> Page_c --> Page_a, will it 
> result in Page_b being merged to Page_c but Page_a not merged since its 
> distance to Page_c is 6?

Yes, you're right. With this patch, merge order depends only on the order of
scanning. Use of some algorithm (maybe from the graph-theory field?) is a
really good point. Although the complexity of the code will rise a lot, it
may be the best solution for most use cases if this algorithm were able to do
some heuristics and determine the max_distance for merging on its own without
any userspace input.

> 
> It may easy to further deduce that maybe a worst case(or even many cases?) 
> for your patch will get many many could-be-merged pages not merged simply 
> because of the sequence they are scanned.
> 
> The problem you plan to solve maybe worthwhile, but it may also be much more
> complex than you expected ;-)

That's the reason why it is only RFC, I mainly wanted to gather your opinions:)

>   
> 
> BR,
> 
> Nai

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: KSM: numa awareness sysfs knob
  2011-11-30 23:47 ` Andrew Morton
@ 2011-12-01 10:16   ` Petr Holasek
       [not found]     ` <201112011940.19022.nai.xia@gmail.com>
  0 siblings, 1 reply; 5+ messages in thread
From: Petr Holasek @ 2011-12-01 10:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Andrea Arcangeli, linux-kernel, linux-mm, Anton Arapov

On Wed, 30 Nov 2011, Andrew Morton wrote:

> Date: Wed, 30 Nov 2011 15:47:19 -0800
> From: Andrew Morton <akpm@linux-foundation.org>
> To: Petr Holasek <pholasek@redhat.com>
> Cc: Hugh Dickins <hughd@google.com>, Andrea Arcangeli
>  <aarcange@redhat.com>, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
>  Anton Arapov <anton@redhat.com>
> Subject: Re: [PATCH] [RFC] KSM: numa awareness sysfs knob
> 
> On Wed, 30 Nov 2011 11:37:26 +0100
> Petr Holasek <pholasek@redhat.com> wrote:
> 
> > Introduce a new sysfs knob /sys/kernel/mm/ksm/max_node_dist, whose
> > value will be used as the limitation for node distance of merged pages.
> > 
> 
> The changelog doesn't really describe why you think Linux needs this
> feature?  What's the reasoning?  Use cases?  What value does it provide?

Typical use-case could be a lot of KVM guests on NUMA machine and cpus from
more distant nodes would have significant increase of access latency to the
merged ksm page. I chose sysfs knob for higher scalability.

> 
> > index b392e49..b882140 100644
> > --- a/Documentation/vm/ksm.txt
> > +++ b/Documentation/vm/ksm.txt
> > @@ -58,6 +58,10 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
> >                     e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
> >                     Default: 20 (chosen for demonstration purposes)
> >  
> > +max_node_dist    - maximum node distance between two pages which could be
> > +                   merged.
> > +                   Default: 255 (without any limitations)
> 
> And this doesn't explain to our users why they might want to alter it,
> and what effects they would see from doing so.  Maybe that's obvious to
> them...

Now I can't figure out more extensive description of this feature, but we
could explain it deeply, of course.

> 
> >  run              - set 0 to stop ksmd from running but keep merged pages,
> >                     set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
> >                     set 2 to stop ksmd and unmerge all pages currently merged,
> >
> > ...
> >
> > +#ifdef CONFIG_NUMA
> > +static inline int node_dist(int from, int to)
> > +{
> > +	int dist = node_distance(from, to);
> > +
> > +	return dist == -1 ? 0 : dist;
> > +}
> 
> So I spent some time grubbing around trying to work out what a return
> value of -1 from node_distance() means, and wasn't successful.  Perhaps
> an explanatory comment here would have helped.

Yes, apologies, my mistake. I've overlooked that it should never return a value
lower than LOCAL_DISTANCE even with CONFIG_NUMA unset. So those wrappers are
pointless and I'll remove them.

> 
> > +#else
> > +static inline int node_dist(int from, int to)
> > +{
> > +	return 0;
> > +}
> > +#endif
> >
> > ...
> >
> > +static ssize_t max_node_dist_store(struct kobject *kobj,
> > +				   struct kobj_attribute *attr,
> > +				   const char *buf, size_t count)
> > +{
> > +	int err;
> > +	unsigned long node_dist;
> > +
> > +	err = kstrtoul(buf, 10, &node_dist);
> > +	if (err || node_dist > 255)
> > +		return -EINVAL;
> 
> If kstrtoul() returned an errno we should propagate that back rather than
> overwriting it with -EINVAL.

Ok, I'll fix it.

> 
> > +	ksm_node_distance = node_dist;
> > +
> > +	return count;
> > +}
> >
> > ...
> >
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2012-01-25 16:41 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-23 10:29 [PATCH] KSM: numa awareness sysfs knob Petr Holasek
2012-01-25  0:03 ` Andrew Morton
2012-01-25 16:40   ` Petr Holasek
  -- strict thread matches above, loose matches on Subject: below --
2011-11-30 10:37 [PATCH] [RFC] " Petr Holasek
2011-11-30 23:47 ` Andrew Morton
2011-12-01 10:16   ` Petr Holasek
     [not found]     ` <201112011940.19022.nai.xia@gmail.com>
2011-12-01 13:04       ` Petr Holasek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).