From: Aaron Lu <aaron.lu@intel.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>,
	Tim Chen <tim.c.chen@intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Ying Huang <ying.huang@intel.com>, Aaron Lu <aaron.lu@intel.com>
Subject: [PATCH v2 2/5] mm: parallel free pages
Date: Wed, 15 Mar 2017 17:00:01 +0800
Message-ID: <1489568404-7817-3-git-send-email-aaron.lu@intel.com>
In-Reply-To: <1489568404-7817-1-git-send-email-aaron.lu@intel.com>

For a regular process, the time taken in its exit() path to free its used
memory is not a problem. But there are heavy processes that consume several
terabytes of memory, and for them the time taken to free that memory can
last more than ten minutes.

To optimize this use case, a parallel free method is proposed, built on top
of the current gather batch free.

The current gather batch free works like this:
For each struct mmu_gather *tlb, there is a static buffer to store the
to-be-freed page pointers. Its size is MMU_GATHER_BUNDLE, which is defined
to be 8. So if a tlb tear down frees no more than 8 pages, that buffer is
all we need. If more than 8 pages are to be freed, new pages need to be
allocated to store the additional to-be-freed page pointers.

The structure used to describe the saved page pointers is
struct mmu_gather_batch, and tlb->local is of this type. tlb->local differs
from the other struct mmu_gather_batch(es) in that its page pointer array
points to the previously described static buffer, while the page pointer
arrays of the other struct mmu_gather_batch(es) point to dynamically
allocated pages.
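
For reference, a simplified sketch of the structures involved (based on the
generic mmu_gather code in include/asm-generic/tlb.h; exact fields may
differ slightly between kernel versions):

	struct mmu_gather_batch {
		struct mmu_gather_batch	*next;	/* next batch in the list */
		unsigned int		nr;	/* pages stored in pages[] */
		unsigned int		max;	/* capacity of pages[] */
		struct page		*pages[0];	/* page pointer array */
	};

	struct mmu_gather {
		...
		struct mmu_gather_batch	*active;	/* batch being filled */
		struct mmu_gather_batch	local;		/* embedded first batch */
		struct page		*__pages[MMU_GATHER_BUNDLE];	/* 8 slots */
		...
	};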

These batches will form a singly linked list, starting from &tlb->local.

tlb->local.pages  => tlb->pages(8 pointers)
      \|/
      next => batch1->pages => about 510 pointers
                \|/
                next => batch2->pages => about 510 pointers
                          \|/
                          next => batch3->pages => about 510 pointers
                                    ... ...
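
(The "about 510 pointers" per dynamically allocated batch follows from
MAX_GATHER_BATCH; as a rough worked example, assuming 4K pages, 8 byte page
pointers and a 16 byte struct mmu_gather_batch header:

	MAX_GATHER_BATCH = (PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)
	                 = (4096 - 16) / 8
	                 = 510 page pointers per batch.)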

The proposed parallel free does this: if the process has many pages to be
freed, accumulate them in these struct mmu_gather_batch(es) one after
another until 256K pages are accumulated. Then take the singly linked list
starting at tlb->local.next off struct mmu_gather *tlb and free it in a
worker thread. The main thread can return and continue zapping other pages
(after freeing the pages pointed to by tlb->local.pages).
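
To put the 256K threshold in perspective, here is a rough calculation
assuming 4K pages:

	ASYNC_FREE_THRESHOLD = 256 * 1024 pages    -> 1GB of memory per hand-off
	batches needed       = DIV_ROUND_UP(256K, 510) = 515
	batch page overhead  = 515 * 4KB           -> about 2MB

This is also why MAX_GATHER_BATCH_COUNT is redefined below in terms of
ASYNC_FREE_THRESHOLD: enough batches must be allowed to accumulate before a
hand-off can happen.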

Note that since we may now be accumulating as many as 256K pages, the soft
lockup on !CONFIG_PREEMPT issue fixed by commit 53a59fc67f97 ("mm: limit
mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT") can reappear.
To avoid that, add a cond_resched() in tlb_flush_mmu_free_batches, where
many pages can be freed.
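
For context, a rough sketch (not part of this patch, simplified from the
exit path in mm/mmap.c) of where the gather/free calls sit; with this patch,
pages gathered beyond tlb->local can be freed by workers that
tlb_finish_mmu() waits for:

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, 0, -1);	/* page_nr = 0, worker_list init */
	unmap_vmas(&tlb, vma, 0, -1);		/* gather pages, may queue workers */
	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
	tlb_finish_mmu(&tlb, 0, -1);		/* flush_work() + kfree() the workers */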

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 include/asm-generic/tlb.h | 15 +++++++------
 mm/memory.c               | 57 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4329bc6ef04b..7c2ac179cc47 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -78,13 +78,10 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH	\
 	((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
 
-/*
- * Limit the maximum number of mmu_gather batches to reduce a risk of soft
- * lockups for non-preemptible kernels on huge machines when a lot of memory
- * is zapped during unmapping.
- * 10K pages freed at once should be safe even without a preemption point.
- */
-#define MAX_GATHER_BATCH_COUNT	(10000UL/MAX_GATHER_BATCH)
+#define ASYNC_FREE_THRESHOLD (256*1024UL)
+#define MAX_GATHER_BATCH_COUNT	\
+	DIV_ROUND_UP(ASYNC_FREE_THRESHOLD, MAX_GATHER_BATCH)
+#define PAGE_FREE_NR_TO_YIELD (10000UL)
 
 /* struct mmu_gather is an opaque type used by the mm code for passing around
  * any data needed by arch specific code for tlb_remove_page.
@@ -108,6 +105,10 @@ struct mmu_gather {
 	struct page		*__pages[MMU_GATHER_BUNDLE];
 	unsigned int		batch_count;
 	int page_size;
+	/* how many pages we have gathered to be freed */
+	unsigned int            page_nr;
+	/* list for spawned workers that do the free jobs */
+	struct list_head        worker_list;
 };
 
 #define HAVE_GENERIC_MMU_GATHER
diff --git a/mm/memory.c b/mm/memory.c
index cdb2a53f251f..001c7720d773 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -228,6 +228,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
 	tlb->batch_count = 0;
+	tlb->page_nr    = 0;
+
+	INIT_LIST_HEAD(&tlb->worker_list);
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
@@ -254,22 +257,65 @@ static void tlb_flush_mmu_free_batches(struct mmu_gather_batch *batch_start,
 				       bool free_batch_page)
 {
 	struct mmu_gather_batch *batch, *next;
+	int nr = 0;
 
 	for (batch = batch_start; batch; batch = next) {
 		next = batch->next;
 		if (batch->nr) {
 			free_pages_and_swap_cache(batch->pages, batch->nr);
+			nr += batch->nr;
 			batch->nr = 0;
 		}
-		if (free_batch_page)
+		if (free_batch_page) {
 			free_pages((unsigned long)batch, 0);
+			nr++;
+		}
+		if (nr >= PAGE_FREE_NR_TO_YIELD) {
+			cond_resched();
+			nr = 0;
+		}
 	}
 }
 
+struct batch_free_struct {
+	struct work_struct work;
+	struct mmu_gather_batch *batch_start;
+	struct list_head list;
+};
+
+static void batch_free_work(struct work_struct *work)
+{
+	struct batch_free_struct *batch_free = container_of(work,
+						struct batch_free_struct, work);
+	tlb_flush_mmu_free_batches(batch_free->batch_start, true);
+}
+
 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 {
+	struct batch_free_struct *batch_free = NULL;
+
+	if (tlb->page_nr >= ASYNC_FREE_THRESHOLD)
+		batch_free = kmalloc(sizeof(*batch_free),
+				     GFP_NOWAIT | __GFP_NOWARN);
+
+	if (batch_free) {
+		/*
+		 * Start a worker to free pages stored
+		 * in batches following tlb->local.
+		 */
+		batch_free->batch_start = tlb->local.next;
+		INIT_WORK(&batch_free->work, batch_free_work);
+		list_add_tail(&batch_free->list, &tlb->worker_list);
+		queue_work(system_unbound_wq, &batch_free->work);
+
+		tlb->batch_count = 0;
+		tlb->local.next = NULL;
+		/* fall through to free pages stored in tlb->local */
+	}
+
 	tlb_flush_mmu_free_batches(&tlb->local, false);
 	tlb->active = &tlb->local;
+	tlb->page_nr = 0;
 }
 
 void tlb_flush_mmu(struct mmu_gather *tlb)
@@ -284,11 +330,18 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  */
 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
+	struct batch_free_struct *batch_free, *n;
+
 	tlb_flush_mmu(tlb);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
 
+	list_for_each_entry_safe(batch_free, n, &tlb->worker_list, list) {
+		flush_work(&batch_free->work);
+		kfree(batch_free);
+	}
+
 	tlb_flush_mmu_free_batches(tlb->local.next, true);
 	tlb->local.next = NULL;
 }
@@ -307,6 +360,8 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	VM_BUG_ON(!tlb->end);
 	VM_WARN_ON(tlb->page_size != page_size);
 
+	tlb->page_nr++;
+
 	batch = tlb->active;
 	/*
 	 * Add the page and check if we are full. If so
-- 
2.7.4
