All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
@ 2013-08-12 14:13 Yechen Li
  0 siblings, 0 replies; 11+ messages in thread
From: Yechen Li @ 2013-08-12 14:13 UTC (permalink / raw)
  To: xen-devel; +Cc: dario.faggioli, linux-kernel, Yechen Li

  This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
  It's just a RFC version, since I'm waiting for the interface of numa topology.
  The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
/target_nid, and settle the memory increase/decrease operation on specified
p-nodeID. 
  To achieve this, I expand the page-list: ballooned_pages to an array:
ballooned_pages[MAX_BALLOONNODES], so that balloon can distinguish pages from 
different node.
  For the guest without numa, this MAX_BALLOONNODES = 1 so that the balloon falls
back to a no-numa version.
  The small functions mark //todo: is the interface to numa topology. Now they
looks stupid because I'm still testing this code. The balloon works well (at
least it seems to) with this small debug interface. Please ignore the more stupid
commemts, I'll remove them in some version later...
  the patch of libxl is here: http://lists.xenproject.org/archives/html/xen-devel/2013-08/msg01157.html
  It's my first time submitting a patch, please point out the problems so that
I could work better in future, thanks very much!
  

---
 drivers/xen/balloon.c     | 358 ++++++++++++++++++++++++++++++++++++++++------
 drivers/xen/xen-balloon.c |  21 ++-
 include/xen/balloon.h     |  17 +++
 3 files changed, 345 insertions(+), 51 deletions(-)

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2a2ef97..09ca1eb 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -36,8 +36,6 @@
  * IN THE SOFTWARE.
  */
 
-#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
@@ -53,6 +51,9 @@
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 
+//lcc:
+#include <linux/numa.h>
+
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -81,18 +82,43 @@ enum bp_state {
 	BP_EAGAIN,
 	BP_ECANCELED
 };
+struct bp_rt{
+    unsigned long donepages;
+    enum bp_state state;
+};
+#define DECLARE_BP_RT(bp_rt)     \
+    struct bp_rt bp_rt = {       \
+        .donepages = 0,          \
+        .state = BP_DONE        \
+    }
 
 
 static DEFINE_MUTEX(balloon_mutex);
 
+//lcc todo: should this balloon_stats change to balloon_stats[MAX_BALLOONNODES]?
 struct balloon_stats balloon_stats;
 EXPORT_SYMBOL_GPL(balloon_stats);
 
 /* We increase/decrease in batches which fit in a page */
 static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
+#ifdef CONFIG_HIGHMEM
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() do {} while (0)
+#define dec_totalhigh_pages() do {} while (0)
+#endif
+
 /* List of ballooned pages, threaded through the mem_map array. */
-static LIST_HEAD(ballooned_pages);
+//static LIST_HEAD(ballooned_pages);
+/*
+ * lcc:
+ * this array is index by vnid,
+ * because we need to use alloc_pages_node(xxx)
+ */
+static struct list_head ballooned_pages[MAX_BALLOONNODES];
+long long ballooned_pages_cnt[MAX_BALLOONNODES];
 
 /* Main work function, always executed in process context. */
 static void balloon_process(struct work_struct *work);
@@ -110,60 +136,115 @@ static void scrub_page(struct page *page)
 #endif
 }
 
+void ballooned_pages_init(void)
+{
+    int i;
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        INIT_LIST_HEAD(&ballooned_pages[i]);
+        ballooned_pages_cnt[i] = 0;
+    }
+}
+EXPORT_SYMBOL_GPL(ballooned_pages_init);
+
+unsigned long long xen_mnid_to_vnidmask(int mnid)
+{
+    //todo:
+    unsigned long long rc = 1;
+    return rc<<mnid;
+}
+
+int xen_vnid_to_mnid(int vnid)
+{
+    //todo:
+    return vnid % MAX_BALLOONNODES;
+}
+
+int balloon_page_to_vnid(struct page *page)
+{
+    //todo:
+    //return page_to_nid(page);
+    return ((unsigned long long)page & (1<<13))?0:1;
+}
+
+struct page* xen_alloc_pages_node(int vnid)
+{
+    //todo: vnid = 0 is for debug:
+    vnid = 0;
+    return alloc_pages_node(vnid, GFP_BALLOON, balloon_order);
+}
+
 /* balloon_append: add the given page to the balloon. */
 static void __balloon_append(struct page *page)
 {
+    //lcc:notice that this nid is of domU, not of Xen!
+    int vnid = balloon_page_to_vnid(page);
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
-		list_add_tail(&page->lru, &ballooned_pages);
+		list_add_tail(&page->lru, &ballooned_pages[vnid]);
 		balloon_stats.balloon_high++;
 	} else {
-		list_add(&page->lru, &ballooned_pages);
+		list_add(&page->lru, &ballooned_pages[vnid]);
 		balloon_stats.balloon_low++;
 	}
+    ballooned_pages_cnt[vnid]++;
 }
 
 static void balloon_append(struct page *page)
 {
 	__balloon_append(page);
-	adjust_managed_page_count(page, -1);
+	if (PageHighMem(page))
+		dec_totalhigh_pages();
+	totalram_pages--;
 }
 
-/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
-static struct page *balloon_retrieve(bool prefer_highmem)
+/* balloon_retrieve_node: rescue a page from virtual node vnid */
+static struct page *balloon_retrieve_node(int vnid, bool prefer_highmem)
 {
 	struct page *page;
 
-	if (list_empty(&ballooned_pages))
+	if (list_empty(&(ballooned_pages[vnid])))
 		return NULL;
 
 	if (prefer_highmem)
-		page = list_entry(ballooned_pages.prev, struct page, lru);
+		page = list_entry(ballooned_pages[vnid].prev, struct page, lru);
 	else
-		page = list_entry(ballooned_pages.next, struct page, lru);
+		page = list_entry(ballooned_pages[vnid].next, struct page, lru);
 	list_del(&page->lru);
+    ballooned_pages_cnt[vnid]--;
 
-	if (PageHighMem(page))
+	if (PageHighMem(page)) {
 		balloon_stats.balloon_high--;
-	else
+		inc_totalhigh_pages();
+	} else
 		balloon_stats.balloon_low--;
 
-	adjust_managed_page_count(page, 1);
+	totalram_pages++;
 
 	return page;
 }
 
-static struct page *balloon_first_page(void)
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
+static struct page *balloon_retrieve(bool prefer_highmem)
 {
-	if (list_empty(&ballooned_pages))
+    int i;
+    struct page *page = NULL;
+    for (i = 0; i<MAX_BALLOONNODES && !page; i++)
+        page = balloon_retrieve_node(i, prefer_highmem);
+
+    return page;
+}
+
+static struct page *balloon_first_page(int vnid)
+{
+	if (list_empty(&ballooned_pages[vnid]))
 		return NULL;
-	return list_entry(ballooned_pages.next, struct page, lru);
+	return list_entry(ballooned_pages[vnid].next, struct page, lru);
 }
 
-static struct page *balloon_next_page(struct page *page)
+static struct page *balloon_next_page(int vnid, struct page *page)
 {
 	struct list_head *next = page->lru.next;
-	if (next == &ballooned_pages)
+	if (next == &ballooned_pages[vnid])
 		return NULL;
 	return list_entry(next, struct page, lru);
 }
@@ -233,7 +314,7 @@ static enum bp_state reserve_additional_memory(long credit)
 	rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
 
 	if (rc) {
-		pr_info("%s: add_memory() failed: %i\n", __func__, rc);
+		pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc);
 		return BP_EAGAIN;
 	}
 
@@ -301,47 +382,60 @@ static enum bp_state reserve_additional_memory(long credit)
 }
 #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
 
-static enum bp_state increase_reservation(unsigned long nr_pages)
+//lcc: mnid means machine_node_id, differ from vid:virtual_node_id in guest
+/* lcc: but I think this function never called by xen. xen just change
+ * balloon_stats.target_pages, and balloon will autoly call increase_reservation
+ * and decrease_reservation to do the job.
+ */
+static struct bp_rt __increase_reservation_nodeonly(int vnid, unsigned long nr_pages)
 {
-	int rc;
+	long rc;
 	unsigned long  pfn, i;
 	struct page   *page;
+    //lcc: debug, below 0 should be mnid
+    int mnid = xen_vnid_to_mnid(vnid);
 	struct xen_memory_reservation reservation = {
-		.address_bits = 0,
+		.address_bits = MEMF_node(mnid) | MEMF_exact_node,
 		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
-
+    DECLARE_BP_RT(bp_rt);
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 	if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) {
 		nr_pages = min(nr_pages, balloon_stats.balloon_hotplug);
 		balloon_stats.hotplug_pages += nr_pages;
 		balloon_stats.balloon_hotplug -= nr_pages;
-		return BP_DONE;
+        bp_rt.donepages = nr_pages;
+		return bp_rt;
 	}
 #endif
 
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
-	page = balloon_first_page();
+	page = balloon_first_page(vnid);
 	for (i = 0; i < nr_pages; i++) {
 		if (!page) {
 			nr_pages = i;
 			break;
 		}
 		frame_list[i] = page_to_pfn(page);
-		page = balloon_next_page(page);
+		page = balloon_next_page(vnid, page);
 	}
 
+    if (nr_pages == 0)
+        return bp_rt;
+
 	set_xen_guest_handle(reservation.extent_start, frame_list);
 	reservation.nr_extents = nr_pages;
 	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
-	if (rc <= 0)
-		return BP_EAGAIN;
+	if (rc <= 0){
+        bp_rt.state = BP_EAGAIN;
+        return bp_rt;
+    }
 
 	for (i = 0; i < rc; i++) {
-		page = balloon_retrieve(false);
+		page = balloon_retrieve_node(vnid, false);
 		BUG_ON(page == NULL);
 
 		pfn = page_to_pfn(page);
@@ -363,17 +457,92 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 #endif
 
 		/* Relinquish the page back to the allocator. */
-		__free_reserved_page(page);
+		ClearPageReserved(page);
+		init_page_count(page);
+		__free_page(page);
 	}
 
 	balloon_stats.current_pages += rc;
 
-	return BP_DONE;
+    printk(KERN_ALERT "lcc: __increase rc = %ld\n", rc);
+
+    bp_rt.donepages = rc;
+
+	return bp_rt;
 }
 
-static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
+/*
+ * notice that __increase_reservation_nodeonly is a batcher.
+ * it can only do with length(frame_list[]) pages at a time
+ * so run an loop, while still positive pages return (rc>0)
+ *  go on with another batcher
+ */
+static struct bp_rt increase_reservation_nodeonly(int vnid,
+            unsigned long nr_pages)
 {
-	enum bp_state state = BP_DONE;
+    unsigned long ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    while (nr_pages>0){
+        bp_rt = __increase_reservation_nodeonly(vnid, nr_pages);
+        nr_pages -= bp_rt.donepages;
+        if (bp_rt.donepages == 0 || bp_rt.state != BP_DONE)
+            break;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    printk(KERN_ALERT "lcc: increase nodeonly vnid = %d, donepages = %lu\n",
+                vnid, bp_rt.donepages);
+    return bp_rt;
+}
+
+static struct bp_rt increase_reservation_nodemask(unsigned long long vnidmask,
+            unsigned long nr_pages)
+{
+    int i;
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+
+    if (vnidmask == 0)
+        return bp_rt;
+
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        if (vnidmask & (1<<i)){
+            bp_rt = increase_reservation_nodeonly(i, nr_pages);
+            nr_pages -= bp_rt.donepages;
+            if (bp_rt.state != BP_DONE){
+                break;
+            }
+        }
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static struct bp_rt increase_reservation_numa(unsigned long long vnidmask,
+            bool nodeexact, unsigned long nr_pages)
+{
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    bp_rt = increase_reservation_nodemask(vnidmask, nr_pages);
+    nr_pages -= bp_rt.donepages;
+    if (nodeexact == false){
+        vnidmask = ((unsigned long long)1<<MAX_BALLOONNODES)-1;
+        bp_rt = increase_reservation_nodemask(vnidmask, nr_pages);
+        nr_pages -= bp_rt.donepages;
+    }
+    bp_rt.donepages  = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+/*
+static enum bp_state increase_reservation(unsigned long nr_pages){
+    struct bp_rt bp_rt = increase_reservation_numa(0,false,nr_pages);
+    return bp_rt.state;
+}
+*/
+
+static struct bp_rt __decrease_reservation_nodeonly(int vnid,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    DECLARE_BP_RT(bp_rt);
 	unsigned long  pfn, i;
 	struct page   *page;
 	int ret;
@@ -382,13 +551,13 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
-
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 	if (balloon_stats.hotplug_pages) {
 		nr_pages = min(nr_pages, balloon_stats.hotplug_pages);
 		balloon_stats.hotplug_pages -= nr_pages;
 		balloon_stats.balloon_hotplug += nr_pages;
-		return BP_DONE;
+        bp_rt.donepages = nr_pages;
+        return bp_rt;
 	}
 #endif
 
@@ -396,10 +565,10 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 		nr_pages = ARRAY_SIZE(frame_list);
 
 	for (i = 0; i < nr_pages; i++) {
-		page = alloc_page(gfp);
-		if (page == NULL) {
+        //lcc:
+        if ((page = xen_alloc_pages_node(vnid)) == NULL){
 			nr_pages = i;
-			state = BP_EAGAIN;
+			bp_rt.state = BP_EAGAIN;
 			break;
 		}
 
@@ -436,7 +605,74 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 
 	balloon_stats.current_pages -= nr_pages;
 
-	return state;
+    printk(KERN_ALERT "lcc: __decrease nr_pages = %ld\n", nr_pages);
+
+    bp_rt.donepages = nr_pages;
+    return bp_rt;
+}
+
+/*
+ * the same reason to increase_reservaton_readonly
+ * run a loop for another batcher if rc > 0
+ */
+static struct bp_rt decrease_reservation_nodeonly(int vnid,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    while (nr_pages>0){
+        bp_rt = __decrease_reservation_nodeonly(vnid, nr_pages, gfp);
+        nr_pages -= bp_rt.donepages;
+        if (bp_rt.donepages == 0 || bp_rt.state != BP_DONE)
+            break;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    printk(KERN_ALERT "lcc: decrease nodeonly vnid = %d, donepages = %lu\n",
+            vnid, bp_rt.donepages);
+    return bp_rt;
+}
+static struct bp_rt decrease_reservation_nodemask(unsigned long long vnidmask,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    int i;
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+
+    if (vnidmask == 0)
+        return bp_rt;
+
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        if (vnidmask & (1<<i)){
+            bp_rt = decrease_reservation_nodeonly(i, nr_pages, gfp);
+            nr_pages -= bp_rt.donepages;
+            if (bp_rt.state != BP_DONE)
+                break;
+        }
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static struct bp_rt decrease_reservation_numa(unsigned long long vnidmask,
+            bool nodeexact, unsigned long nr_pages, gfp_t gfp)
+{
+    unsigned long ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    bp_rt = decrease_reservation_nodemask(vnidmask, nr_pages, gfp);
+    nr_pages -= bp_rt.donepages;
+    if (nodeexact == false){
+        vnidmask = ((unsigned long long)1<<MAX_BALLOONNODES)-1;
+        bp_rt = decrease_reservation_nodemask(vnidmask, nr_pages, gfp);
+        nr_pages -= bp_rt.donepages;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
+{
+    struct bp_rt bp_rt = decrease_reservation_numa(0, false, nr_pages, gfp);
+    return bp_rt.state;
 }
 
 /*
@@ -449,6 +685,11 @@ static void balloon_process(struct work_struct *work)
 {
 	enum bp_state state = BP_DONE;
 	long credit;
+    int mnid = balloon_stats.numa_mnid;
+    bool nodeexact = balloon_stats.numa_nodeexact;
+    int counter = 0;
+    int i;
+    unsigned long long vnidmask = xen_mnid_to_vnidmask(mnid);
 
 	mutex_lock(&balloon_mutex);
 
@@ -457,13 +698,25 @@ static void balloon_process(struct work_struct *work)
 
 		if (credit > 0) {
 			if (balloon_is_inflated())
-				state = increase_reservation(credit);
+                state = increase_reservation_numa(vnidmask,
+                            nodeexact, credit).state;
 			else
 				state=reserve_additional_memory(credit);
 		}
 
-		if (credit < 0)
-			state = decrease_reservation(-credit, GFP_BALLOON);
+		if (credit < 0){
+            state = decrease_reservation_numa(vnidmask, nodeexact,
+                        -credit, GFP_BALLOON).state;
+        }
+
+//lcc: debug
+        printk(KERN_ALERT "lcc: balloon nodeexact=%d retry counter = %d\n",
+                nodeexact, counter);
+        for (i = 0; i<MAX_BALLOONNODES; i++){
+            printk(KERN_ALERT "lcc: balloon node %d has %lld pages\n", i, ballooned_pages_cnt[i]);
+        }
+
+//--debug end
 
 		state = update_schedule(state);
 
@@ -471,6 +724,10 @@ static void balloon_process(struct work_struct *work)
 		if (need_resched())
 			schedule();
 #endif
+        counter++;
+        if (nodeexact && counter >= NUMA_BALLOON_RETRY_MAX)
+            break;
+
 	} while (credit && state == BP_DONE);
 
 	/* Schedule more work if there is some still to be done. */
@@ -480,13 +737,22 @@ static void balloon_process(struct work_struct *work)
 	mutex_unlock(&balloon_mutex);
 }
 
-/* Resets the Xen limit, sets new target, and kicks off processing. */
-void balloon_set_new_target(unsigned long target)
+void balloon_set_new_target_numa(unsigned long target, int mnid, bool nodeexact)
 {
 	/* No need for lock. Not read-modify-write updates. */
 	balloon_stats.target_pages = target;
+    balloon_stats.numa_mnid = mnid;
+    balloon_stats.numa_nodeexact = nodeexact;
+    printk(KERN_ALERT "lcc: target = %lu, mnid = %d, nodeexact= %d\n", target, mnid, nodeexact);
 	schedule_delayed_work(&balloon_worker, 0);
 }
+EXPORT_SYMBOL_GPL(balloon_set_new_target_numa);
+
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+void balloon_set_new_target(unsigned long target)
+{
+    balloon_set_new_target_numa(target, -1, false);
+}
 EXPORT_SYMBOL_GPL(balloon_set_new_target);
 
 /**
@@ -580,7 +846,7 @@ static int __init balloon_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	pr_info("Initialising balloon driver\n");
+	pr_info("xen/balloon: Initialising balloon driver.\n");
 
 	balloon_stats.current_pages = xen_pv_domain()
 		? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index e555845..28fa728 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -30,8 +30,6 @@
  * IN THE SOFTWARE.
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/capability.h>
@@ -56,6 +54,8 @@ static void watch_target(struct xenbus_watch *watch,
 			 const char **vec, unsigned int len)
 {
 	unsigned long long new_target;
+    int mnid;
+    int focus;
 	int err;
 
 	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
@@ -63,11 +63,20 @@ static void watch_target(struct xenbus_watch *watch,
 		/* This is ok (for domain0 at least) - so just return */
 		return;
 	}
+	err = xenbus_scanf(XBT_NIL, "memory", "target_nid", "%d %d", &mnid, &focus);
+    if (err != 2){
+        mnid = -1;
+    }
+    /* no numa node specify, set focus = false*/
+    if (mnid == -1){
+        mnid = 0;
+        focus = false;
+    }
 
 	/* The given memory/target value is in KiB, so it needs converting to
 	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 	 */
-	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+	balloon_set_new_target_numa(new_target >> (PAGE_SHIFT - 10), mnid, focus);
 }
 static struct xenbus_watch target_watch = {
 	.node = "memory/target",
@@ -83,7 +92,7 @@ static int balloon_init_watcher(struct notifier_block *notifier,
 
 	err = register_xenbus_watch(&target_watch);
 	if (err)
-		pr_err("Failed to set balloon watcher\n");
+		printk(KERN_ERR "Failed to set balloon watcher\n");
 
 	return NOTIFY_DONE;
 }
@@ -97,7 +106,9 @@ static int __init balloon_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	pr_info("Initialising balloon driver\n");
+	pr_info("xen-balloon: Initialising balloon driver.\n");
+
+    ballooned_pages_init();
 
 	register_balloon(&balloon_dev);
 
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
index cc2e1a7..80dc8d3 100644
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -3,11 +3,23 @@
  */
 
 #define RETRY_UNLIMITED	0
+#define NUMA_BALLOON_RETRY_MAX 20
+
+#define balloon_order 0
+//todo: numa support
+//xensource/xen/include/xen/mm.h
+//#define MEMF_exact_node (1U<<4)
+#define MEMF_exact_node (0U<<4)
+#define  MEMF_node(n)     ((((n)+1)&0xff)<<8)
+#define MAX_BALLOONNODES 2
 
 struct balloon_stats {
 	/* We aim for 'current allocation' == 'target allocation'. */
 	unsigned long current_pages;
 	unsigned long target_pages;
+    /* numa support */
+    int numa_mnid;
+    bool numa_nodeexact;
 	/* Number of pages in high- and low-memory balloons. */
 	unsigned long balloon_low;
 	unsigned long balloon_high;
@@ -23,6 +35,11 @@ struct balloon_stats {
 
 extern struct balloon_stats balloon_stats;
 
+void ballooned_pages_init(void);
+
+void balloon_set_new_target_numa(unsigned long target, int mnid,
+            bool nodeexact);
+
 void balloon_set_new_target(unsigned long target);
 
 int alloc_xenballooned_pages(int nr_pages, struct page **pages,
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-13 13:18     ` Konrad Rzeszutek Wilk
@ 2013-08-13 19:00       ` Dario Faggioli
  0 siblings, 0 replies; 11+ messages in thread
From: Dario Faggioli @ 2013-08-13 19:00 UTC (permalink / raw)
  To: konrad; +Cc: linux-kernel, Yechen Li, David Vrabel, Jan Beulich, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 2121 bytes --]

On mar, 2013-08-13 at 09:18 -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Aug 13, 2013 at 8:53 AM, David Vrabel <david.vrabel@citrix.com> wrote:
> > On 12/08/13 21:26, Dario Faggioli wrote:
> >>
> >> Mmm... Is that the case? I tried to apply it there and I have one failed
> >> hunk (#12). Anyway, could you please rebase it on the tip of some
> >> relevant git tree?
> >>
> >> Linus' tree would be fine, I guess:
> >>
> >> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git
> >
> > This is the one I use for most of my development work.  I wouldn't use a
> > subsystem specific tree unless there were specific conflicts to resolve
> > before hand.
> 
> Which we do:
> 
> commit cd9151e26d31048b2b5e00fd02e110e07d2200c9
> Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Date:   Sun Aug 4 15:39:40 2013 +0100
>     xen/balloon: set a mapping for ballooned out pages
> 
Ok... So, working on Linus' tree on a daily basis, but at lease try to
rebase the changes on top of .../xen/tip.git sounds like a sane enough
workflow to me. :-)

Yechen, would you be fine with something like that?

How familiar are you with concepts like branches, merging and rebasing
in the context of git?

Allow me to give some (pseudo random) pointers:

http://git-scm.com/documentation
http://git-scm.com/book/en/Git-Branching-Basic-Branching-and-Merging
http://git-scm.com/book/en/Git-Branching-Rebasing
http://gitref.org/branching/
http://stackoverflow.com/questions/315911/git-for-beginners-the-definitive-practical-guide
http://sixrevisions.com/resources/git-tutorials-beginners/
http://www.sbf5.com/~cduan/technical/git/
  http://www.sbf5.com/~cduan/technical/git/git-2.shtml
  http://www.sbf5.com/~cduan/technical/git/git-3.shtml
  http://www.sbf5.com/~cduan/technical/git/git-5.shtml

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-13 12:53   ` David Vrabel
@ 2013-08-13 13:18     ` Konrad Rzeszutek Wilk
  2013-08-13 19:00       ` Dario Faggioli
  2013-08-13 13:18     ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 11+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-13 13:18 UTC (permalink / raw)
  To: David Vrabel
  Cc: Dario Faggioli, Yechen Li, xen-devel, linux-kernel, Jan Beulich

On Tue, Aug 13, 2013 at 8:53 AM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 12/08/13 21:26, Dario Faggioli wrote:
>>
>> Mmm... Is that the case? I tried to apply it there and I have one failed
>> hunk (#12). Anyway, could you please rebase it on the tip of some
>> relevant git tree?
>>
>> Linus' tree would be fine, I guess:
>>
>> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git
>
> This is the one I use for most of my development work.  I wouldn't use a
> subsystem specific tree unless there were specific conflicts to resolve
> before hand.

Which we do:

commit cd9151e26d31048b2b5e00fd02e110e07d2200c9
Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date:   Sun Aug 4 15:39:40 2013 +0100
    xen/balloon: set a mapping for ballooned out pages

>
> David

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-13 12:53   ` David Vrabel
  2013-08-13 13:18     ` Konrad Rzeszutek Wilk
@ 2013-08-13 13:18     ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 11+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-13 13:18 UTC (permalink / raw)
  To: David Vrabel
  Cc: Dario Faggioli, xen-devel, linux-kernel, Jan Beulich, Yechen Li

On Tue, Aug 13, 2013 at 8:53 AM, David Vrabel <david.vrabel@citrix.com> wrote:
> On 12/08/13 21:26, Dario Faggioli wrote:
>>
>> Mmm... Is that the case? I tried to apply it there and I have one failed
>> hunk (#12). Anyway, could you please rebase it on the tip of some
>> relevant git tree?
>>
>> Linus' tree would be fine, I guess:
>>
>> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git
>
> This is the one I use for most of my development work.  I wouldn't use a
> subsystem specific tree unless there were specific conflicts to resolve
> before hand.

Which we do:

commit cd9151e26d31048b2b5e00fd02e110e07d2200c9
Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date:   Sun Aug 4 15:39:40 2013 +0100
    xen/balloon: set a mapping for ballooned out pages

>
> David

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 20:26 ` Dario Faggioli
  2013-08-13 12:46   ` Konrad Rzeszutek Wilk
  2013-08-13 12:46   ` Konrad Rzeszutek Wilk
@ 2013-08-13 12:53   ` David Vrabel
  2013-08-13 13:18     ` Konrad Rzeszutek Wilk
  2013-08-13 13:18     ` Konrad Rzeszutek Wilk
  2 siblings, 2 replies; 11+ messages in thread
From: David Vrabel @ 2013-08-13 12:53 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: konrad.r.wilk, xen-devel, linux-kernel, Jan Beulich, Yechen Li

On 12/08/13 21:26, Dario Faggioli wrote:
> 
> Mmm... Is that the case? I tried to apply it there and I have one failed
> hunk (#12). Anyway, could you please rebase it on the tip of some
> relevant git tree?
> 
> Linus' tree would be fine, I guess:
> 
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git

This is the one I use for most of my development work.  I wouldn't use a
subsystem specific tree unless there were specific conflicts to resolve
before hand.

David

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 20:26 ` Dario Faggioli
@ 2013-08-13 12:46   ` Konrad Rzeszutek Wilk
  2013-08-13 12:46   ` Konrad Rzeszutek Wilk
  2013-08-13 12:53   ` David Vrabel
  2 siblings, 0 replies; 11+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-13 12:46 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Yechen Li, xen-devel, linux-kernel, David Vrabel, Jan Beulich

On Mon, Aug 12, 2013 at 4:26 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> Hi Yechen.
>
> Thanks for doing and sharing this part of the thing too.
>
> See what I already told you on xen-devel about Cc-ing the relevant
> people. For Linux as well, you can check at the MAINTAINERS file. This
> exact thing (i.e., who the actual maintainers are) is changing a bit
> right in these days... See this commit:
>
> http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/commit/?h=devel/for-jens-3.12&id=3eeef8f72a9365fe20f2c9d84b0afe60a20788cd
>
> On lun, 2013-08-12 at 22:13 +0800, Yechen Li wrote:
>>   This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
>>
> Mmm... Is that the case? I tried to apply it there and I have one failed
> hunk (#12). Anyway, could you please rebase it on the tip of some
> relevant git tree?
>
> Linus' tree would be fine, I guess:
>
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git
>
> However, since these kind of patches should probably go through Konrad's
> (or other Linux Xen maintainers) tree anyway, you could well base on top
> of:
>
> http://git.kernel.org/cgit/linux/kernel/git/konrad/xen.git/
>
> Or perhaps on some branch of this one here:
>
> http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/
>
> Konrad, David, would you find 5 minutes to explain Yechen which tree and
> which branch he should use, or point him to the appropriate
> documentation for understanding that? Thanks...

I would suggest he base it on top of devel/for-linus-3.12 for right now. That
branch will become 'stable/for-linus-3.12' which is then going to be
sent to Linus
for v3.12 merge window.

Obviously the patches will need to be reviewed before they go in.
>
>>   It's just a RFC version, since I'm waiting for the interface of numa topology.
>>   The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
>> /target_nid, and settle the memory increase/decrease operation on specified
>> p-nodeID.
>>   To achieve this, I expand the page-list: ballooned_pages to an array:
>> ballooned_pages[MAX_BALLOONNODES], so that balloon can distinguish pages from
>> different node.
>>   For the guest without numa, this MAX_BALLOONNODES = 1 so that the balloon falls
>> back to a no-numa version.
>>

You also need to be backwards compatible in case those keys don't exist, or not
all of them exist. Or they have bogus values.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 20:26 ` Dario Faggioli
  2013-08-13 12:46   ` Konrad Rzeszutek Wilk
@ 2013-08-13 12:46   ` Konrad Rzeszutek Wilk
  2013-08-13 12:53   ` David Vrabel
  2 siblings, 0 replies; 11+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-08-13 12:46 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: David Vrabel, xen-devel, linux-kernel, Jan Beulich, Yechen Li

On Mon, Aug 12, 2013 at 4:26 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> Hi Yechen.
>
> Thanks for doing and sharing this part of the thing too.
>
> See what I already told you on xen-devel about Cc-ing the relevant
> people. For Linux as well, you can check at the MAINTAINERS file. This
> exact thing (i.e., who the actual maintainers are) is changing a bit
> right in these days... See this commit:
>
> http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/commit/?h=devel/for-jens-3.12&id=3eeef8f72a9365fe20f2c9d84b0afe60a20788cd
>
> On lun, 2013-08-12 at 22:13 +0800, Yechen Li wrote:
>>   This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
>>
> Mmm... Is that the case? I tried to apply it there and I have one failed
> hunk (#12). Anyway, could you please rebase it on the tip of some
> relevant git tree?
>
> Linus' tree would be fine, I guess:
>
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git
>
> However, since these kind of patches should probably go through Konrad's
> (or other Linux Xen maintainers) tree anyway, you could well base on top
> of:
>
> http://git.kernel.org/cgit/linux/kernel/git/konrad/xen.git/
>
> Or perhaps on some branch of this one here:
>
> http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/
>
> Konrad, David, would you find 5 minutes to explain Yechen which tree and
> which branch he should use, or point him to the appropriate
> documentation for understanding that? Thanks...

I would suggest he base it on top of devel/for-linus-3.12 for right now. That
branch will become 'stable/for-linus-3.12' which is then going to be
sent to Linus
for v3.12 merge window.

Obviously the patches will need to be reviewed before they go in.
>
>>   It's just a RFC version, since I'm waiting for the interface of numa topology.
>>   The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
>> /target_nid, and settle the memory increase/decrease operation on specified
>> p-nodeID.
>>   To achieve this, I expand the page-list: ballooned_pages to an array:
>> ballooned_pages[MAX_BALLOONNODES], so that balloon can distinguish pages from
>> different node.
>>   For the guest without numa, this MAX_BALLOONNODES = 1 so that the balloon falls
>> back to a no-numa version.
>>

You also need to be backwards compatible in case those keys don't exist, or not
all of them exist. Or they have bogus values.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 14:13 Yechen Li
  2013-08-12 18:44 ` David Vrabel
  2013-08-12 18:44 ` [Xen-devel] " David Vrabel
@ 2013-08-12 20:26 ` Dario Faggioli
  2013-08-13 12:46   ` Konrad Rzeszutek Wilk
                     ` (2 more replies)
  2 siblings, 3 replies; 11+ messages in thread
From: Dario Faggioli @ 2013-08-12 20:26 UTC (permalink / raw)
  To: Yechen Li
  Cc: konrad.r.wilk, David Vrabel, linux-kernel, Jan Beulich, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 7606 bytes --]

Hi Yechen.

Thanks for doing and sharing this part of the thing too.

See what I already told you on xen-devel about Cc-ing the relevant
people. For Linux as well, you can check at the MAINTAINERS file. This
exact thing (i.e., who the actual maintainers are) is changing a bit
right in these days... See this commit:

http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/commit/?h=devel/for-jens-3.12&id=3eeef8f72a9365fe20f2c9d84b0afe60a20788cd

On lun, 2013-08-12 at 22:13 +0800, Yechen Li wrote:
>   This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
>
Mmm... Is that the case? I tried to apply it there and I have one failed
hunk (#12). Anyway, could you please rebase it on the tip of some
relevant git tree?

Linus' tree would be fine, I guess:

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git

However, since these kind of patches should probably go through Konrad's
(or other Linux Xen maintainers) tree anyway, you could well base on top
of:

http://git.kernel.org/cgit/linux/kernel/git/konrad/xen.git/

Or perhaps on some branch of this one here:

http://git.kernel.org/cgit/linux/kernel/git/xen/tip.git/

Konrad, David, would you find 5 minutes to explain Yechen which tree and
which branch he should use, or point him to the appropriate
documentation for understanding that? Thanks...

>   It's just a RFC version, since I'm waiting for the interface of numa topology.
>   The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
> /target_nid, and settle the memory increase/decrease operation on specified
> p-nodeID. 
>   To achieve this, I expand the page-list: ballooned_pages to an array:
> ballooned_pages[MAX_BALLOONNODES], so that balloon can distinguish pages from 
> different node.
>   For the guest without numa, this MAX_BALLOONNODES = 1 so that the balloon falls
> back to a no-numa version.
>
This one you have here is already a better explanation about what the
big picture is, as compared to what you did put in the Xen patch. You
still could explain a bit better, I think, especially the interactions
between virtual and physical NUMA.

It is true that is someone else that is doing the core of that job, but
at the same time, if you want people to give feedback to your work, it
is your call to put them in a position where they can understand as much
as possible of it.

>   The small functions mark //todo: is the interface to numa topology. Now they
> looks stupid because I'm still testing this code. The balloon works well (at
> least it seems to) with this small debug interface. Please ignore the more stupid
> commemts, I'll remove them in some version later...
>   the patch of libxl is here: http://lists.xenproject.org/archives/html/xen-devel/2013-08/msg01157.html
>
I would have done the other way around, i.e., send the Linux patch
(this) and then send the Xen patch and reference this one there.

>   It's my first time submitting a patch, please point out the problems so that
> I could work better in future, thanks very much!
>
That is the right attitude, man. Much appreciated! :-D

> ---
>  drivers/xen/balloon.c     | 358 ++++++++++++++++++++++++++++++++++++++++------
>  drivers/xen/xen-balloon.c |  21 ++-
>  include/xen/balloon.h     |  17 +++
>  3 files changed, 345 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index 2a2ef97..09ca1eb 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -36,8 +36,6 @@
>   * IN THE SOFTWARE.
>   */
>  
> -#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
> -
>
So you are killing this... Why?

>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/errno.h>
> @@ -53,6 +51,9 @@
>  #include <linux/memory.h>
>  #include <linux/memory_hotplug.h>
>  
> +//lcc:
> +#include <linux/numa.h>
> +
>
Get rid of these "lcc tags". They're done with '//' commenting, which is
bad. Also, git does a good enough job already in tracking what are your
own changes! :-P

>  #include <asm/page.h>
>  #include <asm/pgalloc.h>
>  #include <asm/pgtable.h>
> @@ -81,18 +82,43 @@ enum bp_state {
>  	BP_EAGAIN,
>  	BP_ECANCELED
>  };
> +struct bp_rt{
> +    unsigned long donepages;
> +    enum bp_state state;
> +};
> +#define DECLARE_BP_RT(bp_rt)     \
> +    struct bp_rt bp_rt = {       \
> +        .donepages = 0,          \
> +        .state = BP_DONE        \
> +    }
>  
Coding style! Do you remember when I said to you that Linux and Xen
coding styles are different? :-)

In this specific case, the thing is that you need to use 'tab' (and set
the width for that to 8) instead than hard coding 4 spaces. In general,
have a look below for a complete description of Linux's style, and make
sure your patch complies to that:

https://www.kernel.org/doc/Documentation/CodingStyle

Also, do you remember when I mention the SubmittingPatches document:

https://www.kernel.org/doc/Documentation/SubmittingPatches

Have another look there, and see the section called "Style check your
changes". It mentions running your patch through scripts/checkpatch.pl,
which is something really really helpful. :-)

BTW, what exactly is this BP_RT thing? How about adding a few words on
comment about it?

> 
>  static DEFINE_MUTEX(balloon_mutex);
>  
> +//lcc todo: should this balloon_stats change to balloon_stats[MAX_BALLOONNODES]?
>
Comments like this are fine in an RFC patch. While at it, do that
properly, i.e., something like:

/*
 * TODO: will this nee to be converted in
 *  balloon_stats[MAX_BALLOONNODES] ?
 */


>  struct balloon_stats balloon_stats;
>  EXPORT_SYMBOL_GPL(balloon_stats);
>  
>  /* We increase/decrease in batches which fit in a page */
>  static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
>  
> +#ifdef CONFIG_HIGHMEM
> +#define inc_totalhigh_pages() (totalhigh_pages++)
> +#define dec_totalhigh_pages() (totalhigh_pages--)
> +#else
> +#define inc_totalhigh_pages() do {} while (0)
> +#define dec_totalhigh_pages() do {} while (0)
> +#endif
> +
>
Oh, I see... This is the stuff Jan was talking about in Xen devel, that
you probably are reintroducing after it was removed. So, I'm sorry, but
it is indeed quite hard to review something without knowing what hunks
are really new/needed.

For instance, see what happened in this changeset:

$ git show 3dcc0571cd64816309765b7c7e4691a4cadf2ee7

commit 3dcc0571cd64816309765b7c7e4691a4cadf2ee7
Author: Jiang Liu <liuj97@gmail.com>
Date:   Wed Jul 3 15:03:21 2013 -0700

    mm: correctly update zone->managed_pages
    
    ...
    
    Remove inc_totalhigh_pages() and dec_totalhigh_pages() from xen/balloon
    driver bacause adjust_managed_page_count() has already adjusted
    totalhigh_pages.

    ...

So, could you perhaps remove these? It is probably just a matter of
rebasing your changes (and only them) on a more recent git tree (e.g.,
from above)?

Once you did that, please, resend the patch, if possible with all the
additional information about the design and intended usage that both Jan
and David are asking, and we'll perform a proper review of it (or at
least, I will :-P).

Thanks again and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 18:44 ` [Xen-devel] " David Vrabel
@ 2013-08-12 20:14   ` Dario Faggioli
  0 siblings, 0 replies; 11+ messages in thread
From: Dario Faggioli @ 2013-08-12 20:14 UTC (permalink / raw)
  To: David Vrabel; +Cc: xen-devel, linux-kernel, Yechen Li


[-- Attachment #1.1: Type: text/plain, Size: 2470 bytes --]

On lun, 2013-08-12 at 19:44 +0100, David Vrabel wrote:
> On 12/08/13 15:13, Yechen Li wrote:
> >   This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
> >   It's just a RFC version, since I'm waiting for the interface of numa topology.
> >   The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
> > /target_nid, and settle the memory increase/decrease operation on specified
> > p-nodeID.
> 
> Its is difficult to review an ABI change without any documentation for
> the new ABI.
> 
Indeed.

> I would also like to see a design document explaining the overall
> approach planned to be used here.  It's not clear why explicitly
> specifying nodes is preferable to (e.g.) the guest releasing/populating
> evenly across all its nodes (this would certainly be better for the guest).
> 
I see what you mean. Personally, I think they're different things. There
might be the need, from the host system administrator, to make as much
room as possible on one (or perhaps a few) nodes, in which case, the
possibility of specifying that explicitly would be a plus.

That would allow --if used wisely, I agree with you on this-- for better
resource utilization, in the long run.

In absence of this information, it is probably true that the guest would
benefit from a more even approach. What we want to achieve here,
however, is as follows: suppose that a virtual NUMA enabled guest (i.e.,
a guest with a virtual NUMA topology), has guest page X, which is on
virtual node g1 in the guest itself, backed by a frame from host node
h0. Well, we really would like to try having page X always backed by a
frame on host node h1, even after ballooning down and up.

> It seems like unless this is used carefully, all VMs will end up with
> suboptimal memory layouts as they are repeatedly balloon up and down to
> satisfy the whims of the latest VM being started etc.
> 
I'm not sure I see entirely what you mean, but for sure I repeat that I
agree that more information about the design and intended usage patterns
are needed... Let's see whether Yechen is up for providing that. :-)

Thanks for having a look anyway,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
  2013-08-12 14:13 Yechen Li
@ 2013-08-12 18:44 ` David Vrabel
  2013-08-12 18:44 ` [Xen-devel] " David Vrabel
  2013-08-12 20:26 ` Dario Faggioli
  2 siblings, 0 replies; 11+ messages in thread
From: David Vrabel @ 2013-08-12 18:44 UTC (permalink / raw)
  To: Yechen Li; +Cc: dario.faggioli, linux-kernel, xen-devel

On 12/08/13 15:13, Yechen Li wrote:
>   This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
>   It's just a RFC version, since I'm waiting for the interface of numa topology.
>   The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
> /target_nid, and settle the memory increase/decrease operation on specified
> p-nodeID.

Its is difficult to review an ABI change without any documentation for
the new ABI.

I would also like to see a design document explaining the overall
approach planned to be used here.  It's not clear why explicitly
specifying nodes is preferable to (e.g.) the guest releasing/populating
evenly across all its nodes (this would certainly be better for the guest).

It seems like unless this is used carefully, all VMs will end up with
suboptimal memory layouts as they are repeatedly balloon up and down to
satisfy the whims of the latest VM being started etc.

David

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel
@ 2013-08-12 14:13 Yechen Li
  2013-08-12 18:44 ` David Vrabel
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Yechen Li @ 2013-08-12 14:13 UTC (permalink / raw)
  To: xen-devel; +Cc: linux-kernel, dario.faggioli, Yechen Li

  This small patch adds numa support for balloon driver. Kernel version: 3.11-rc5
  It's just a RFC version, since I'm waiting for the interface of numa topology.
  The balloon driver will read arguments from xenstore: /local/domain/(id)/memory
/target_nid, and settle the memory increase/decrease operation on specified
p-nodeID. 
  To achieve this, I expand the page-list: ballooned_pages to an array:
ballooned_pages[MAX_BALLOONNODES], so that balloon can distinguish pages from 
different node.
  For the guest without numa, this MAX_BALLOONNODES = 1 so that the balloon falls
back to a no-numa version.
  The small functions mark //todo: is the interface to numa topology. Now they
looks stupid because I'm still testing this code. The balloon works well (at
least it seems to) with this small debug interface. Please ignore the more stupid
commemts, I'll remove them in some version later...
  the patch of libxl is here: http://lists.xenproject.org/archives/html/xen-devel/2013-08/msg01157.html
  It's my first time submitting a patch, please point out the problems so that
I could work better in future, thanks very much!
  

---
 drivers/xen/balloon.c     | 358 ++++++++++++++++++++++++++++++++++++++++------
 drivers/xen/xen-balloon.c |  21 ++-
 include/xen/balloon.h     |  17 +++
 3 files changed, 345 insertions(+), 51 deletions(-)

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2a2ef97..09ca1eb 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -36,8 +36,6 @@
  * IN THE SOFTWARE.
  */
 
-#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
@@ -53,6 +51,9 @@
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 
+//lcc:
+#include <linux/numa.h>
+
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -81,18 +82,43 @@ enum bp_state {
 	BP_EAGAIN,
 	BP_ECANCELED
 };
+struct bp_rt{
+    unsigned long donepages;
+    enum bp_state state;
+};
+#define DECLARE_BP_RT(bp_rt)     \
+    struct bp_rt bp_rt = {       \
+        .donepages = 0,          \
+        .state = BP_DONE        \
+    }
 
 
 static DEFINE_MUTEX(balloon_mutex);
 
+//lcc todo: should this balloon_stats change to balloon_stats[MAX_BALLOONNODES]?
 struct balloon_stats balloon_stats;
 EXPORT_SYMBOL_GPL(balloon_stats);
 
 /* We increase/decrease in batches which fit in a page */
 static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
 
+#ifdef CONFIG_HIGHMEM
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() do {} while (0)
+#define dec_totalhigh_pages() do {} while (0)
+#endif
+
 /* List of ballooned pages, threaded through the mem_map array. */
-static LIST_HEAD(ballooned_pages);
+//static LIST_HEAD(ballooned_pages);
+/*
+ * lcc:
+ * this array is index by vnid,
+ * because we need to use alloc_pages_node(xxx)
+ */
+static struct list_head ballooned_pages[MAX_BALLOONNODES];
+long long ballooned_pages_cnt[MAX_BALLOONNODES];
 
 /* Main work function, always executed in process context. */
 static void balloon_process(struct work_struct *work);
@@ -110,60 +136,115 @@ static void scrub_page(struct page *page)
 #endif
 }
 
+void ballooned_pages_init(void)
+{
+    int i;
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        INIT_LIST_HEAD(&ballooned_pages[i]);
+        ballooned_pages_cnt[i] = 0;
+    }
+}
+EXPORT_SYMBOL_GPL(ballooned_pages_init);
+
+unsigned long long xen_mnid_to_vnidmask(int mnid)
+{
+    //todo:
+    unsigned long long rc = 1;
+    return rc<<mnid;
+}
+
+int xen_vnid_to_mnid(int vnid)
+{
+    //todo:
+    return vnid % MAX_BALLOONNODES;
+}
+
+int balloon_page_to_vnid(struct page *page)
+{
+    //todo:
+    //return page_to_nid(page);
+    return ((unsigned long long)page & (1<<13))?0:1;
+}
+
+struct page* xen_alloc_pages_node(int vnid)
+{
+    //todo: vnid = 0 is for debug:
+    vnid = 0;
+    return alloc_pages_node(vnid, GFP_BALLOON, balloon_order);
+}
+
 /* balloon_append: add the given page to the balloon. */
 static void __balloon_append(struct page *page)
 {
+    //lcc:notice that this nid is of domU, not of Xen!
+    int vnid = balloon_page_to_vnid(page);
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
-		list_add_tail(&page->lru, &ballooned_pages);
+		list_add_tail(&page->lru, &ballooned_pages[vnid]);
 		balloon_stats.balloon_high++;
 	} else {
-		list_add(&page->lru, &ballooned_pages);
+		list_add(&page->lru, &ballooned_pages[vnid]);
 		balloon_stats.balloon_low++;
 	}
+    ballooned_pages_cnt[vnid]++;
 }
 
 static void balloon_append(struct page *page)
 {
 	__balloon_append(page);
-	adjust_managed_page_count(page, -1);
+	if (PageHighMem(page))
+		dec_totalhigh_pages();
+	totalram_pages--;
 }
 
-/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
-static struct page *balloon_retrieve(bool prefer_highmem)
+/* balloon_retrieve_node: rescue a page from virtual node vnid */
+static struct page *balloon_retrieve_node(int vnid, bool prefer_highmem)
 {
 	struct page *page;
 
-	if (list_empty(&ballooned_pages))
+	if (list_empty(&(ballooned_pages[vnid])))
 		return NULL;
 
 	if (prefer_highmem)
-		page = list_entry(ballooned_pages.prev, struct page, lru);
+		page = list_entry(ballooned_pages[vnid].prev, struct page, lru);
 	else
-		page = list_entry(ballooned_pages.next, struct page, lru);
+		page = list_entry(ballooned_pages[vnid].next, struct page, lru);
 	list_del(&page->lru);
+    ballooned_pages_cnt[vnid]--;
 
-	if (PageHighMem(page))
+	if (PageHighMem(page)) {
 		balloon_stats.balloon_high--;
-	else
+		inc_totalhigh_pages();
+	} else
 		balloon_stats.balloon_low--;
 
-	adjust_managed_page_count(page, 1);
+	totalram_pages++;
 
 	return page;
 }
 
-static struct page *balloon_first_page(void)
+/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
+static struct page *balloon_retrieve(bool prefer_highmem)
 {
-	if (list_empty(&ballooned_pages))
+    int i;
+    struct page *page = NULL;
+    for (i = 0; i<MAX_BALLOONNODES && !page; i++)
+        page = balloon_retrieve_node(i, prefer_highmem);
+
+    return page;
+}
+
+static struct page *balloon_first_page(int vnid)
+{
+	if (list_empty(&ballooned_pages[vnid]))
 		return NULL;
-	return list_entry(ballooned_pages.next, struct page, lru);
+	return list_entry(ballooned_pages[vnid].next, struct page, lru);
 }
 
-static struct page *balloon_next_page(struct page *page)
+static struct page *balloon_next_page(int vnid, struct page *page)
 {
 	struct list_head *next = page->lru.next;
-	if (next == &ballooned_pages)
+	if (next == &ballooned_pages[vnid])
 		return NULL;
 	return list_entry(next, struct page, lru);
 }
@@ -233,7 +314,7 @@ static enum bp_state reserve_additional_memory(long credit)
 	rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
 
 	if (rc) {
-		pr_info("%s: add_memory() failed: %i\n", __func__, rc);
+		pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc);
 		return BP_EAGAIN;
 	}
 
@@ -301,47 +382,60 @@ static enum bp_state reserve_additional_memory(long credit)
 }
 #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
 
-static enum bp_state increase_reservation(unsigned long nr_pages)
+//lcc: mnid means machine_node_id, differ from vid:virtual_node_id in guest
+/* lcc: but I think this function never called by xen. xen just change
+ * balloon_stats.target_pages, and balloon will autoly call increase_reservation
+ * and decrease_reservation to do the job.
+ */
+static struct bp_rt __increase_reservation_nodeonly(int vnid, unsigned long nr_pages)
 {
-	int rc;
+	long rc;
 	unsigned long  pfn, i;
 	struct page   *page;
+    //lcc: debug, below 0 should be mnid
+    int mnid = xen_vnid_to_mnid(vnid);
 	struct xen_memory_reservation reservation = {
-		.address_bits = 0,
+		.address_bits = MEMF_node(mnid) | MEMF_exact_node,
 		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
-
+    DECLARE_BP_RT(bp_rt);
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 	if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) {
 		nr_pages = min(nr_pages, balloon_stats.balloon_hotplug);
 		balloon_stats.hotplug_pages += nr_pages;
 		balloon_stats.balloon_hotplug -= nr_pages;
-		return BP_DONE;
+        bp_rt.donepages = nr_pages;
+		return bp_rt;
 	}
 #endif
 
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
-	page = balloon_first_page();
+	page = balloon_first_page(vnid);
 	for (i = 0; i < nr_pages; i++) {
 		if (!page) {
 			nr_pages = i;
 			break;
 		}
 		frame_list[i] = page_to_pfn(page);
-		page = balloon_next_page(page);
+		page = balloon_next_page(vnid, page);
 	}
 
+    if (nr_pages == 0)
+        return bp_rt;
+
 	set_xen_guest_handle(reservation.extent_start, frame_list);
 	reservation.nr_extents = nr_pages;
 	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
-	if (rc <= 0)
-		return BP_EAGAIN;
+	if (rc <= 0){
+        bp_rt.state = BP_EAGAIN;
+        return bp_rt;
+    }
 
 	for (i = 0; i < rc; i++) {
-		page = balloon_retrieve(false);
+		page = balloon_retrieve_node(vnid, false);
 		BUG_ON(page == NULL);
 
 		pfn = page_to_pfn(page);
@@ -363,17 +457,92 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 #endif
 
 		/* Relinquish the page back to the allocator. */
-		__free_reserved_page(page);
+		ClearPageReserved(page);
+		init_page_count(page);
+		__free_page(page);
 	}
 
 	balloon_stats.current_pages += rc;
 
-	return BP_DONE;
+    printk(KERN_ALERT "lcc: __increase rc = %ld\n", rc);
+
+    bp_rt.donepages = rc;
+
+	return bp_rt;
 }
 
-static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
+/*
+ * notice that __increase_reservation_nodeonly is a batcher.
+ * it can only do with length(frame_list[]) pages at a time
+ * so run an loop, while still positive pages return (rc>0)
+ *  go on with another batcher
+ */
+static struct bp_rt increase_reservation_nodeonly(int vnid,
+            unsigned long nr_pages)
 {
-	enum bp_state state = BP_DONE;
+    unsigned long ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    while (nr_pages>0){
+        bp_rt = __increase_reservation_nodeonly(vnid, nr_pages);
+        nr_pages -= bp_rt.donepages;
+        if (bp_rt.donepages == 0 || bp_rt.state != BP_DONE)
+            break;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    printk(KERN_ALERT "lcc: increase nodeonly vnid = %d, donepages = %lu\n",
+                vnid, bp_rt.donepages);
+    return bp_rt;
+}
+
+static struct bp_rt increase_reservation_nodemask(unsigned long long vnidmask,
+            unsigned long nr_pages)
+{
+    int i;
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+
+    if (vnidmask == 0)
+        return bp_rt;
+
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        if (vnidmask & (1<<i)){
+            bp_rt = increase_reservation_nodeonly(i, nr_pages);
+            nr_pages -= bp_rt.donepages;
+            if (bp_rt.state != BP_DONE){
+                break;
+            }
+        }
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static struct bp_rt increase_reservation_numa(unsigned long long vnidmask,
+            bool nodeexact, unsigned long nr_pages)
+{
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    bp_rt = increase_reservation_nodemask(vnidmask, nr_pages);
+    nr_pages -= bp_rt.donepages;
+    if (nodeexact == false){
+        vnidmask = ((unsigned long long)1<<MAX_BALLOONNODES)-1;
+        bp_rt = increase_reservation_nodemask(vnidmask, nr_pages);
+        nr_pages -= bp_rt.donepages;
+    }
+    bp_rt.donepages  = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+/*
+static enum bp_state increase_reservation(unsigned long nr_pages){
+    struct bp_rt bp_rt = increase_reservation_numa(0,false,nr_pages);
+    return bp_rt.state;
+}
+*/
+
+static struct bp_rt __decrease_reservation_nodeonly(int vnid,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    DECLARE_BP_RT(bp_rt);
 	unsigned long  pfn, i;
 	struct page   *page;
 	int ret;
@@ -382,13 +551,13 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 		.extent_order = 0,
 		.domid        = DOMID_SELF
 	};
-
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 	if (balloon_stats.hotplug_pages) {
 		nr_pages = min(nr_pages, balloon_stats.hotplug_pages);
 		balloon_stats.hotplug_pages -= nr_pages;
 		balloon_stats.balloon_hotplug += nr_pages;
-		return BP_DONE;
+        bp_rt.donepages = nr_pages;
+        return bp_rt;
 	}
 #endif
 
@@ -396,10 +565,10 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 		nr_pages = ARRAY_SIZE(frame_list);
 
 	for (i = 0; i < nr_pages; i++) {
-		page = alloc_page(gfp);
-		if (page == NULL) {
+        //lcc:
+        if ((page = xen_alloc_pages_node(vnid)) == NULL){
 			nr_pages = i;
-			state = BP_EAGAIN;
+			bp_rt.state = BP_EAGAIN;
 			break;
 		}
 
@@ -436,7 +605,74 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 
 	balloon_stats.current_pages -= nr_pages;
 
-	return state;
+    printk(KERN_ALERT "lcc: __decrease nr_pages = %ld\n", nr_pages);
+
+    bp_rt.donepages = nr_pages;
+    return bp_rt;
+}
+
+/*
+ * the same reason to increase_reservaton_readonly
+ * run a loop for another batcher if rc > 0
+ */
+static struct bp_rt decrease_reservation_nodeonly(int vnid,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    while (nr_pages>0){
+        bp_rt = __decrease_reservation_nodeonly(vnid, nr_pages, gfp);
+        nr_pages -= bp_rt.donepages;
+        if (bp_rt.donepages == 0 || bp_rt.state != BP_DONE)
+            break;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    printk(KERN_ALERT "lcc: decrease nodeonly vnid = %d, donepages = %lu\n",
+            vnid, bp_rt.donepages);
+    return bp_rt;
+}
+static struct bp_rt decrease_reservation_nodemask(unsigned long long vnidmask,
+            unsigned long nr_pages, gfp_t gfp)
+{
+    int i;
+    int ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+
+    if (vnidmask == 0)
+        return bp_rt;
+
+    for (i = 0; i<MAX_BALLOONNODES; i++){
+        if (vnidmask & (1<<i)){
+            bp_rt = decrease_reservation_nodeonly(i, nr_pages, gfp);
+            nr_pages -= bp_rt.donepages;
+            if (bp_rt.state != BP_DONE)
+                break;
+        }
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static struct bp_rt decrease_reservation_numa(unsigned long long vnidmask,
+            bool nodeexact, unsigned long nr_pages, gfp_t gfp)
+{
+    unsigned long ori_nr_pages = nr_pages;
+    DECLARE_BP_RT(bp_rt);
+    bp_rt = decrease_reservation_nodemask(vnidmask, nr_pages, gfp);
+    nr_pages -= bp_rt.donepages;
+    if (nodeexact == false){
+        vnidmask = ((unsigned long long)1<<MAX_BALLOONNODES)-1;
+        bp_rt = decrease_reservation_nodemask(vnidmask, nr_pages, gfp);
+        nr_pages -= bp_rt.donepages;
+    }
+    bp_rt.donepages = ori_nr_pages - nr_pages;
+    return bp_rt;
+}
+
+static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
+{
+    struct bp_rt bp_rt = decrease_reservation_numa(0, false, nr_pages, gfp);
+    return bp_rt.state;
 }
 
 /*
@@ -449,6 +685,11 @@ static void balloon_process(struct work_struct *work)
 {
 	enum bp_state state = BP_DONE;
 	long credit;
+    int mnid = balloon_stats.numa_mnid;
+    bool nodeexact = balloon_stats.numa_nodeexact;
+    int counter = 0;
+    int i;
+    unsigned long long vnidmask = xen_mnid_to_vnidmask(mnid);
 
 	mutex_lock(&balloon_mutex);
 
@@ -457,13 +698,25 @@ static void balloon_process(struct work_struct *work)
 
 		if (credit > 0) {
 			if (balloon_is_inflated())
-				state = increase_reservation(credit);
+                state = increase_reservation_numa(vnidmask,
+                            nodeexact, credit).state;
 			else
 				state=reserve_additional_memory(credit);
 		}
 
-		if (credit < 0)
-			state = decrease_reservation(-credit, GFP_BALLOON);
+		if (credit < 0){
+            state = decrease_reservation_numa(vnidmask, nodeexact,
+                        -credit, GFP_BALLOON).state;
+        }
+
+//lcc: debug
+        printk(KERN_ALERT "lcc: balloon nodeexact=%d retry counter = %d\n",
+                nodeexact, counter);
+        for (i = 0; i<MAX_BALLOONNODES; i++){
+            printk(KERN_ALERT "lcc: balloon node %d has %lld pages\n", i, ballooned_pages_cnt[i]);
+        }
+
+//--debug end
 
 		state = update_schedule(state);
 
@@ -471,6 +724,10 @@ static void balloon_process(struct work_struct *work)
 		if (need_resched())
 			schedule();
 #endif
+        counter++;
+        if (nodeexact && counter >= NUMA_BALLOON_RETRY_MAX)
+            break;
+
 	} while (credit && state == BP_DONE);
 
 	/* Schedule more work if there is some still to be done. */
@@ -480,13 +737,22 @@ static void balloon_process(struct work_struct *work)
 	mutex_unlock(&balloon_mutex);
 }
 
-/* Resets the Xen limit, sets new target, and kicks off processing. */
-void balloon_set_new_target(unsigned long target)
+void balloon_set_new_target_numa(unsigned long target, int mnid, bool nodeexact)
 {
 	/* No need for lock. Not read-modify-write updates. */
 	balloon_stats.target_pages = target;
+    balloon_stats.numa_mnid = mnid;
+    balloon_stats.numa_nodeexact = nodeexact;
+    printk(KERN_ALERT "lcc: target = %lu, mnid = %d, nodeexact= %d\n", target, mnid, nodeexact);
 	schedule_delayed_work(&balloon_worker, 0);
 }
+EXPORT_SYMBOL_GPL(balloon_set_new_target_numa);
+
+/* Resets the Xen limit, sets new target, and kicks off processing. */
+void balloon_set_new_target(unsigned long target)
+{
+    balloon_set_new_target_numa(target, -1, false);
+}
 EXPORT_SYMBOL_GPL(balloon_set_new_target);
 
 /**
@@ -580,7 +846,7 @@ static int __init balloon_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	pr_info("Initialising balloon driver\n");
+	pr_info("xen/balloon: Initialising balloon driver.\n");
 
 	balloon_stats.current_pages = xen_pv_domain()
 		? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index e555845..28fa728 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -30,8 +30,6 @@
  * IN THE SOFTWARE.
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/capability.h>
@@ -56,6 +54,8 @@ static void watch_target(struct xenbus_watch *watch,
 			 const char **vec, unsigned int len)
 {
 	unsigned long long new_target;
+    int mnid;
+    int focus;
 	int err;
 
 	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
@@ -63,11 +63,20 @@ static void watch_target(struct xenbus_watch *watch,
 		/* This is ok (for domain0 at least) - so just return */
 		return;
 	}
+	err = xenbus_scanf(XBT_NIL, "memory", "target_nid", "%d %d", &mnid, &focus);
+    if (err != 2){
+        mnid = -1;
+    }
+    /* no numa node specify, set focus = false*/
+    if (mnid == -1){
+        mnid = 0;
+        focus = false;
+    }
 
 	/* The given memory/target value is in KiB, so it needs converting to
 	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 	 */
-	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+	balloon_set_new_target_numa(new_target >> (PAGE_SHIFT - 10), mnid, focus);
 }
 static struct xenbus_watch target_watch = {
 	.node = "memory/target",
@@ -83,7 +92,7 @@ static int balloon_init_watcher(struct notifier_block *notifier,
 
 	err = register_xenbus_watch(&target_watch);
 	if (err)
-		pr_err("Failed to set balloon watcher\n");
+		printk(KERN_ERR "Failed to set balloon watcher\n");
 
 	return NOTIFY_DONE;
 }
@@ -97,7 +106,9 @@ static int __init balloon_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	pr_info("Initialising balloon driver\n");
+	pr_info("xen-balloon: Initialising balloon driver.\n");
+
+    ballooned_pages_init();
 
 	register_balloon(&balloon_dev);
 
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
index cc2e1a7..80dc8d3 100644
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -3,11 +3,23 @@
  */
 
 #define RETRY_UNLIMITED	0
+#define NUMA_BALLOON_RETRY_MAX 20
+
+#define balloon_order 0
+//todo: numa support
+//xensource/xen/include/xen/mm.h
+//#define MEMF_exact_node (1U<<4)
+#define MEMF_exact_node (0U<<4)
+#define  MEMF_node(n)     ((((n)+1)&0xff)<<8)
+#define MAX_BALLOONNODES 2
 
 struct balloon_stats {
 	/* We aim for 'current allocation' == 'target allocation'. */
 	unsigned long current_pages;
 	unsigned long target_pages;
+    /* numa support */
+    int numa_mnid;
+    bool numa_nodeexact;
 	/* Number of pages in high- and low-memory balloons. */
 	unsigned long balloon_low;
 	unsigned long balloon_high;
@@ -23,6 +35,11 @@ struct balloon_stats {
 
 extern struct balloon_stats balloon_stats;
 
+void ballooned_pages_init(void);
+
+void balloon_set_new_target_numa(unsigned long target, int mnid,
+            bool nodeexact);
+
 void balloon_set_new_target(unsigned long target);
 
 int alloc_xenballooned_pages(int nr_pages, struct page **pages,
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2013-08-13 19:00 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-12 14:13 [PATCH v1][RFC] drivers/xen, balloon driver numa support in kernel Yechen Li
2013-08-12 14:13 Yechen Li
2013-08-12 18:44 ` David Vrabel
2013-08-12 18:44 ` [Xen-devel] " David Vrabel
2013-08-12 20:14   ` Dario Faggioli
2013-08-12 20:26 ` Dario Faggioli
2013-08-13 12:46   ` Konrad Rzeszutek Wilk
2013-08-13 12:46   ` Konrad Rzeszutek Wilk
2013-08-13 12:53   ` David Vrabel
2013-08-13 13:18     ` Konrad Rzeszutek Wilk
2013-08-13 19:00       ` Dario Faggioli
2013-08-13 13:18     ` Konrad Rzeszutek Wilk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.