From: Zi Yan <zi.yan@sent.com>
To: Dave Hansen <dave.hansen@linux.intel.com>,
	Yang Shi <yang.shi@linux.alibaba.com>,
	Keith Busch <keith.busch@intel.com>,
	Fengguang Wu <fengguang.wu@intel.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>,
	Michal Hocko <mhocko@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	John Hubbard <jhubbard@nvidia.com>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Nitin Gupta <nigupta@nvidia.com>,
	Javier Cabezas <jcabezas@nvidia.com>,
	David Nellans <dnellans@nvidia.com>, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH 13/25] exchange pages: add multi-threaded exchange pages.
Date: Wed,  3 Apr 2019 19:00:34 -0700
Message-ID: <20190404020046.32741-14-zi.yan@sent.com>
In-Reply-To: <20190404020046.32741-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

Exchange two pages using multiple threads. Exchange two lists of pages
using multiple threads.
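
The per-page path maps both pages, splits them into equal chunks, and
queues one chunk-swap work item per CPU of the destination node on the
high-priority system workqueue, then waits for the queue to drain. A
minimal sketch of that idea follows; the xchg_work, xchg_chunk_fn and
exchange_one_page_mt names are illustrative only, and it assumes that
nr_threads does not exceed the node's CPU count and evenly divides
PAGE_SIZE:

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

struct xchg_work {
	struct work_struct work;
	char *to;
	char *from;
	unsigned long len;
};

static void xchg_chunk_fn(struct work_struct *w)
{
	struct xchg_work *xw = container_of(w, struct xchg_work, work);
	unsigned long i;
	u64 tmp;

	/* Swap the two chunks word by word. */
	for (i = 0; i < xw->len; i += sizeof(tmp)) {
		tmp = *(u64 *)(xw->from + i);
		*(u64 *)(xw->from + i) = *(u64 *)(xw->to + i);
		*(u64 *)(xw->to + i) = tmp;
	}
}

static int exchange_one_page_mt(struct page *to, struct page *from,
				int nr_threads)
{
	char *vto = kmap(to);		/* assumes no highmem, as in the patch */
	char *vfrom = kmap(from);
	unsigned long chunk = PAGE_SIZE / nr_threads;
	struct xchg_work *works;
	int cpu, i = 0;

	works = kvzalloc(sizeof(*works) * nr_threads, GFP_KERNEL);
	if (!works) {
		kunmap(from);
		kunmap(to);
		return -ENOMEM;
	}

	/* Hand one chunk to each CPU of the destination node. */
	for_each_cpu(cpu, cpumask_of_node(page_to_nid(to))) {
		if (i >= nr_threads)
			break;
		INIT_WORK(&works[i].work, xchg_chunk_fn);
		works[i].to = vto + i * chunk;
		works[i].from = vfrom + i * chunk;
		works[i].len = chunk;
		queue_work_on(cpu, system_highpri_wq, &works[i].work);
		i++;
	}

	/* Wait until every queued chunk swap has completed. */
	flush_workqueue(system_highpri_wq);

	kunmap(from);
	kunmap(to);
	kvfree(works);
	return 0;
}

The list variant follows the same pattern, either spreading whole pages
across CPUs (when there are at least as many pages as threads) or
splitting each page across several CPUs (when there are fewer pages
than threads).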

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/Makefile        |   1 +
 mm/exchange.c      |  15 ++--
 mm/exchange_page.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/internal.h      |   5 ++
 4 files changed, 245 insertions(+), 5 deletions(-)
 create mode 100644 mm/exchange_page.c

diff --git a/mm/Makefile b/mm/Makefile
index 5e6c591..2f1f1ad 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -46,6 +46,7 @@ obj-y += memblock.o
 
 obj-y += copy_page.o
 obj-y += exchange.o
+obj-y += exchange_page.o
 
 ifdef CONFIG_MMU
 	obj-$(CONFIG_ADVISE_SYSCALLS)	+= madvise.o
diff --git a/mm/exchange.c b/mm/exchange.c
index 626bbea..ce2c899 100644
--- a/mm/exchange.c
+++ b/mm/exchange.c
@@ -345,11 +345,16 @@ static int exchange_from_to_pages(struct page *to_page, struct page *from_page,
 
 	rc = -EFAULT;
 
-	if (PageHuge(from_page) || PageTransHuge(from_page))
-		exchange_huge_page(to_page, from_page);
-	else
-		exchange_highpage(to_page, from_page);
-	rc = 0;
+	if (mode & MIGRATE_MT)
+		rc = exchange_page_mthread(to_page, from_page,
+				hpage_nr_pages(from_page));
+	if (rc) {
+		if (PageHuge(from_page) || PageTransHuge(from_page))
+			exchange_huge_page(to_page, from_page);
+		else
+			exchange_highpage(to_page, from_page);
+		rc = 0;
+	}
 
 	exchange_page_flags(to_page, from_page);
 
diff --git a/mm/exchange_page.c b/mm/exchange_page.c
new file mode 100644
index 0000000..6054697
--- /dev/null
+++ b/mm/exchange_page.c
@@ -0,0 +1,229 @@
+/*
+ * Exchange page copy routine.
+ *
+ * Copyright 2019 by NVIDIA.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Zi Yan <ziy@nvidia.com>
+ *
+ */
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/freezer.h>
+
+/*
+ * nr_copythreads can be the highest number of threads for a given node
+ * on any architecture. The actual number of copy threads will be
+ * limited by the cpumask weight of the target node.
+ */
+extern unsigned int limit_mt_num;
+
+struct copy_page_info {
+	struct work_struct copy_page_work;
+	char *to;
+	char *from;
+	unsigned long chunk_size;
+};
+
+static void exchange_page_routine(char *to, char *from, unsigned long chunk_size)
+{
+	u64 tmp;
+	int i;
+
+	for (i = 0; i < chunk_size; i += sizeof(tmp)) {
+		tmp = *((u64*)(from + i));
+		*((u64*)(from + i)) = *((u64*)(to + i));
+		*((u64*)(to + i)) = tmp;
+	}
+}
+
+static void exchange_page_work_queue_thread(struct work_struct *work)
+{
+	struct copy_page_info *my_work = (struct copy_page_info*)work;
+
+	exchange_page_routine(my_work->to,
+							  my_work->from,
+							  my_work->chunk_size);
+}
+
+int exchange_page_mthread(struct page *to, struct page *from, int nr_pages)
+{
+	int total_mt_num = limit_mt_num;
+	int to_node = page_to_nid(to);
+	int i;
+	struct copy_page_info *work_items;
+	char *vto, *vfrom;
+	unsigned long chunk_size;
+	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
+	int cpu_id_list[32] = {0};
+	int cpu;
+
+	total_mt_num = min_t(unsigned int, total_mt_num,
+						 cpumask_weight(per_node_cpumask));
+
+	if (total_mt_num > 1)
+		total_mt_num = (total_mt_num / 2) * 2;
+
+	if (total_mt_num > 32 || total_mt_num < 1)
+		return -ENODEV;
+
+	work_items = kvzalloc(sizeof(struct copy_page_info)*total_mt_num,
+						 GFP_KERNEL);
+	if (!work_items)
+		return -ENOMEM;
+
+	i = 0;
+	for_each_cpu(cpu, per_node_cpumask) {
+		if (i >= total_mt_num)
+			break;
+		cpu_id_list[i] = cpu;
+		++i;
+	}
+
+	/* XXX: assume no highmem  */
+	vfrom = kmap(from);
+	vto = kmap(to);
+	chunk_size = PAGE_SIZE*nr_pages / total_mt_num;
+
+	for (i = 0; i < total_mt_num; ++i) {
+		INIT_WORK((struct work_struct *)&work_items[i],
+				exchange_page_work_queue_thread);
+
+		work_items[i].to = vto + i * chunk_size;
+		work_items[i].from = vfrom + i * chunk_size;
+		work_items[i].chunk_size = chunk_size;
+
+		queue_work_on(cpu_id_list[i],
+					  system_highpri_wq,
+					  (struct work_struct *)&work_items[i]);
+	}
+
+	/* Wait until it finishes  */
+	flush_workqueue(system_highpri_wq);
+
+	kunmap(to);
+	kunmap(from);
+
+	kvfree(work_items);
+
+	return 0;
+}
+
+int exchange_page_lists_mthread(struct page **to, struct page **from, int nr_pages)
+{
+	int err = 0;
+	unsigned int total_mt_num = limit_mt_num;
+	int to_node = page_to_nid(*to);
+	int i;
+	struct copy_page_info *work_items;
+	int nr_pages_per_page = hpage_nr_pages(*from);
+	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
+	int cpu_id_list[32] = {0};
+	int cpu;
+	int item_idx;
+
+
+	total_mt_num = min_t(unsigned int, total_mt_num,
+						 cpumask_weight(per_node_cpumask));
+
+	if (total_mt_num > 32 || total_mt_num < 1)
+		return -ENODEV;
+
+	if (nr_pages < total_mt_num) {
+		int residual_nr_pages = nr_pages - rounddown_pow_of_two(nr_pages);
+
+		if (residual_nr_pages) {
+			for (i = 0; i < residual_nr_pages; ++i) {
+				BUG_ON(hpage_nr_pages(to[i]) != hpage_nr_pages(from[i]));
+				err = exchange_page_mthread(to[i], from[i], hpage_nr_pages(to[i]));
+				VM_BUG_ON(err);
+			}
+			nr_pages = rounddown_pow_of_two(nr_pages);
+			to = &to[residual_nr_pages];
+			from = &from[residual_nr_pages];
+		}
+
+		work_items = kvzalloc(sizeof(struct copy_page_info)*total_mt_num,
+							 GFP_KERNEL);
+	} else
+		work_items = kvzalloc(sizeof(struct copy_page_info)*nr_pages,
+							 GFP_KERNEL);
+	if (!work_items)
+		return -ENOMEM;
+
+	i = 0;
+	for_each_cpu(cpu, per_node_cpumask) {
+		if (i >= total_mt_num)
+			break;
+		cpu_id_list[i] = cpu;
+		++i;
+	}
+
+	if (nr_pages < total_mt_num) {
+		for (cpu = 0; cpu < total_mt_num; ++cpu)
+			INIT_WORK((struct work_struct *)&work_items[cpu],
+					  exchange_page_work_queue_thread);
+		cpu = 0;
+		for (item_idx = 0; item_idx < nr_pages; ++item_idx) {
+			unsigned long chunk_size = nr_pages * PAGE_SIZE * hpage_nr_pages(from[item_idx]) / total_mt_num;
+			char *vfrom = kmap(from[item_idx]);
+			char *vto = kmap(to[item_idx]);
+			VM_BUG_ON(PAGE_SIZE * hpage_nr_pages(from[item_idx]) % total_mt_num);
+			VM_BUG_ON(total_mt_num % nr_pages);
+			BUG_ON(hpage_nr_pages(to[item_idx]) !=
+				   hpage_nr_pages(from[item_idx]));
+
+			for (i = 0; i < (total_mt_num/nr_pages); ++cpu, ++i) {
+				work_items[cpu].to = vto + chunk_size * i;
+				work_items[cpu].from = vfrom + chunk_size * i;
+				work_items[cpu].chunk_size = chunk_size;
+			}
+		}
+		if (cpu != total_mt_num)
+			pr_err("%s: only %d out of %d work items are populated\n", __func__,
+				cpu, total_mt_num);
+
+		for (cpu = 0; cpu < total_mt_num; ++cpu)
+			queue_work_on(cpu_id_list[cpu],
+						  system_highpri_wq,
+						  (struct work_struct *)&work_items[cpu]);
+	} else {
+		for (i = 0; i < nr_pages; ++i) {
+			int thread_idx = i % total_mt_num;
+
+			INIT_WORK((struct work_struct *)&work_items[i], exchange_page_work_queue_thread);
+
+			/* XXX: assume no highmem  */
+			work_items[i].to = kmap(to[i]);
+			work_items[i].from = kmap(from[i]);
+			work_items[i].chunk_size = PAGE_SIZE * hpage_nr_pages(from[i]);
+
+			BUG_ON(hpage_nr_pages(to[i]) != hpage_nr_pages(from[i]));
+
+			queue_work_on(cpu_id_list[thread_idx], system_highpri_wq, (struct work_struct *)&work_items[i]);
+		}
+	}
+
+	/* Wait until it finishes  */
+	flush_workqueue(system_highpri_wq);
+
+	for (i = 0; i < nr_pages; ++i) {
+			kunmap(to[i]);
+			kunmap(from[i]);
+	}
+
+	kvfree(work_items);
+
+	return err;
+}
+
diff --git a/mm/internal.h b/mm/internal.h
index 51f5e1b..a039459 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -561,4 +561,9 @@ extern int copy_page_lists_dma_always(struct page **to,
 extern int copy_page_lists_mt(struct page **to,
 			struct page **from, int nr_pages);
 
+extern int exchange_page_mthread(struct page *to, struct page *from,
+			int nr_pages);
+extern int exchange_page_lists_mthread(struct page **to,
+						  struct page **from, 
+						  int nr_pages);
 #endif	/* __MM_INTERNAL_H */
-- 
2.7.4



Thread overview: 29+ messages
2019-04-04  2:00 [RFC PATCH 00/25] Accelerate page migration and use memcg for PMEM management Zi Yan
2019-04-04  2:00 ` [RFC PATCH 01/25] mm: migrate: Change migrate_mode to support combination migration modes Zi Yan
2019-04-04  2:00 ` [RFC PATCH 02/25] mm: migrate: Add mode parameter to support future page copy routines Zi Yan
2019-04-04  2:00 ` [RFC PATCH 03/25] mm: migrate: Add a multi-threaded page migration function Zi Yan
2019-04-04  2:00 ` [RFC PATCH 04/25] mm: migrate: Add copy_page_multithread into migrate_pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 05/25] mm: migrate: Add vm.accel_page_copy in sysfs to control page copy acceleration Zi Yan
2019-04-04  2:00 ` [RFC PATCH 06/25] mm: migrate: Make the number of copy threads adjustable via sysctl Zi Yan
2019-04-04  2:00 ` [RFC PATCH 07/25] mm: migrate: Add copy_page_dma to use DMA Engine to copy pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 08/25] mm: migrate: Add copy_page_dma into migrate_page_copy Zi Yan
2019-04-04  2:00 ` [RFC PATCH 09/25] mm: migrate: Add copy_page_lists_dma_always to support copy a list of pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 10/25] mm: migrate: copy_page_lists_mt() to copy a page list using multi-threads Zi Yan
2019-04-04  2:00 ` [RFC PATCH 11/25] mm: migrate: Add concurrent page migration into move_pages syscall Zi Yan
2019-04-04  2:00 ` [RFC PATCH 12/25] exchange pages: new page migration mechanism: exchange_pages() Zi Yan
2019-04-04  2:00 ` [RFC PATCH 13/25] exchange pages: add multi-threaded exchange pages Zi Yan [this message]
2019-04-04  2:00 ` [RFC PATCH 14/25] exchange pages: concurrent exchange pages Zi Yan
2019-04-04  2:00 ` [RFC PATCH 15/25] exchange pages: exchange anonymous page and file-backed page Zi Yan
2019-04-04  2:00 ` [RFC PATCH 16/25] exchange page: Add THP exchange support Zi Yan
2019-04-04  2:00 ` [RFC PATCH 17/25] exchange page: Add exchange_page() syscall Zi Yan
2019-04-04  2:00 ` [RFC PATCH 18/25] memcg: Add per node memory usage&max stats in memcg Zi Yan
2019-04-04  2:00 ` [RFC PATCH 19/25] mempolicy: add MPOL_F_MEMCG flag, enforcing memcg memory limit Zi Yan
2019-04-04  2:00 ` [RFC PATCH 20/25] memory manage: Add memory manage syscall Zi Yan
2019-04-04  2:00 ` [RFC PATCH 21/25] mm: move update_lru_sizes() to mm_inline.h for broader use Zi Yan
2019-04-04  2:00 ` [RFC PATCH 22/25] memory manage: active/inactive page list manipulation in memcg Zi Yan
2019-04-04  2:00 ` [RFC PATCH 23/25] memory manage: page migration based page manipulation between NUMA nodes Zi Yan
2019-04-04  2:00 ` [RFC PATCH 24/25] memory manage: limit migration batch size Zi Yan
2019-04-04  2:00 ` [RFC PATCH 25/25] memory manage: use exchange pages to memory manage to improve throughput Zi Yan
2019-04-04  7:13 ` [RFC PATCH 00/25] Accelerate page migration and use memcg for PMEM management Michal Hocko
2019-04-05  0:32 ` Yang Shi
2019-04-05 17:20   ` Zi Yan
