* [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
@ 2014-08-01 17:37 ` Naoya Horiguchi
  0 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-01 17:37 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins
  Cc: David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

We have a race condition between move_pages() and hugepage freeing:
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to take a refcount on the page without preventing concurrent
freeing. This race crashes the kernel, so this patch fixes it by moving
the FOLL_GET code for hugepages into follow_huge_pmd() so that the
refcount is taken under the page table lock.
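
The window being closed is roughly the following interleaving (an
illustrative sketch, not part of the original posting; the function names
follow the mainline call path, but the sequence is simplified):

  CPU A: move_pages()                  CPU B: munmap()/process exit
  -----------------------------------  ------------------------------------
  follow_page_mask()
    follow_huge_pmd()
      returns the struct page          __unmap_hugepage_range()
                                         put_page() -> refcount drops to 0
                                         free_huge_page()
    get_page(page)  <-- refcount taken on an already-freed page, crash

Holding the page table lock across the pmd lookup and get_page() prevents
the freeing side from clearing the pmd and releasing the page in between.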

This patch passes the following test, and the libhugetlbfs tests show no
regression.

  $ cat movepages.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <err.h>
  #include <numa.h>
  #include <numaif.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000
  #define PS              0x1000

  int main(int argc, char *argv[]) {
          int i;
          int nr_hp = strtol(argv[1], NULL, 0);
          int nr_p  = nr_hp * HPS / PS;
          int ret;
          void **addrs;
          int *status;
          int *nodes;
          pid_t pid;

          pid = strtol(argv[2], NULL, 0);
          addrs  = malloc(sizeof(char *) * nr_p + 1);
          status = malloc(sizeof(char *) * nr_p + 1);
          nodes  = malloc(sizeof(char *) * nr_p + 1);

          while (1) {
                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 1;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");

                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 0;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");
          }
          return 0;
  }

  $ cat hugepage.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000

  int main(int argc, char *argv[]) {
          int nr_hp = strtol(argv[1], NULL, 0);
          char *p;

          while (1) {
                  p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                  if (p != (void *)ADDR_INPUT) {
                          perror("mmap");
                          break;
                  }
                  memset(p, 0, nr_hp * HPS);
                  munmap(p, nr_hp * HPS);
          }
  }
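
The reproducers can be built roughly as follows (assumed commands, not part
of the original posting; movepages.c needs the libnuma development package):

  $ gcc -O2 -o movepages movepages.c -lnuma
  $ gcc -O2 -o hugepage hugepage.c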

  $ sysctl vm.nr_hugepages=40
  $ ./hugepage 10 &
  $ ./movepages 10 $(pgrep -f hugepage)

Note for stable inclusion:
  This patch fixes e632a938d914 ("mm: migrate: add hugepage migration code
  to move_pages()"), so is applicable to -stable kernels which includes it.

ChangeLog v2:
- introduce follow_huge_pmd_lock() to do locking in arch-independent code.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>  # [3.12+]
---
 include/linux/hugetlb.h |  3 +++
 mm/gup.c                | 17 ++---------------
 mm/hugetlb.c            | 27 +++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h mmotm-2014-07-22-15-58/include/linux/hugetlb.h
index 41272bcf73f8..194834853d6f 100644
--- mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h
+++ mmotm-2014-07-22-15-58/include/linux/hugetlb.h
@@ -101,6 +101,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 				pud_t *pud, int write);
+struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
+				unsigned long address, pmd_t *pmd, int flags);
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pmd);
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -134,6 +136,7 @@ static inline void hugetlb_show_meminfo(void)
 }
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
 #define follow_huge_pud(mm, addr, pud, write)	NULL
+#define follow_huge_pmd_lock(vma, addr, pmd, flags)	NULL
 #define prepare_hugepage_range(file, addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define pud_huge(x)	0
diff --git mmotm-2014-07-22-15-58.orig/mm/gup.c mmotm-2014-07-22-15-58/mm/gup.c
index 91d044b1600d..e4bd59efe686 100644
--- mmotm-2014-07-22-15-58.orig/mm/gup.c
+++ mmotm-2014-07-22-15-58/mm/gup.c
@@ -174,21 +174,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		return no_page_table(vma, flags);
-	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
-		if (flags & FOLL_GET) {
-			/*
-			 * Refcount on tail pages are not well-defined and
-			 * shouldn't be taken. The caller should handle a NULL
-			 * return when trying to follow tail pages.
-			 */
-			if (PageHead(page))
-				get_page(page);
-			else
-				page = NULL;
-		}
-		return page;
-	}
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB)
+		return follow_huge_pmd_lock(vma, address, pmd, flags);
 	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
 		return no_page_table(vma, flags);
 	if (pmd_trans_huge(*pmd)) {
diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
index 7263c770e9b3..4437896cd6ed 100644
--- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
+++ mmotm-2014-07-22-15-58/mm/hugetlb.c
@@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
+struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
+				unsigned long address, pmd_t *pmd, int flags)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	if (flags & FOLL_GET)
+		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+
+	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
+
+	if (flags & FOLL_GET) {
+		/*
+		 * Refcount on tail pages are not well-defined and
+		 * shouldn't be taken. The caller should handle a NULL
+		 * return when trying to follow tail pages.
+		 */
+		if (PageHead(page))
+			get_page(page);
+		else
+			page = NULL;
+		spin_unlock(ptl);
+	}
+
+	return page;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 
 /* Should be called in hugetlb_lock */
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 2/3] mm/hugetlb: use get_page_unless_zero() in hugetlb_fault()
  2014-08-01 17:37 ` Naoya Horiguchi
@ 2014-08-01 17:37   ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-01 17:37 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins
  Cc: David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

After fixing the locking in follow_page(FOLL_GET) for hugepages, I started
to observe a "get_page() on refcount 0 page" BUG in hugetlb_fault() under
the same test.

I'm not exactly sure how this race is triggered, but hugetlb_fault() calls
pte_page() and get_page() outside the page table lock, so it's not safe.
This patch checks the refcount of the page it obtained, and aborts the page
fault if the refcount is 0, expecting the fault to be retried.
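
For reference, get_page_unless_zero() is the usual primitive for taking a
speculative reference: it refuses to raise a refcount that has already
dropped to zero, so a page that has started to be freed is never revived.
Conceptually it boils down to the following (simplified sketch, not part of
this patch):

  static inline int get_page_unless_zero(struct page *page)
  {
          /* increment the refcount only if it is currently non-zero */
          return atomic_inc_not_zero(&page->_count);
  }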

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>  # [3.12+]
---
 mm/hugetlb.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
index 4437896cd6ed..863f45f63cd5 100644
--- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
+++ mmotm-2014-07-22-15-58/mm/hugetlb.c
@@ -3189,7 +3189,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * so no worry about deadlock.
 	 */
 	page = pte_page(entry);
-	get_page(page);
+	if (!get_page_unless_zero(page))
+		goto out_put_pagecache;
 	if (page != pagecache_page)
 		lock_page(page);
 
@@ -3215,15 +3216,14 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 out_ptl:
 	spin_unlock(ptl);
-
+	if (page != pagecache_page)
+		unlock_page(page);
+	put_page(page);
+out_put_pagecache:
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
-	if (page != pagecache_page)
-		unlock_page(page);
-	put_page(page);
-
 out_mutex:
 	mutex_unlock(&htlb_fault_mutex_table[hash]);
 	return ret;
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 3/3] mm/hugetlb: add migration entry check in hugetlb_change_protection
  2014-08-01 17:37 ` Naoya Horiguchi
@ 2014-08-01 17:37   ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-01 17:37 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins
  Cc: David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

There is a race condition between hugepage migration and change_protection(),
where hugetlb_change_protection() doesn't care about migration entries and
wrongly overwrites them. That causes unexpected results such as a kernel
crash.

This patch adds an is_hugetlb_entry_(migration|hwpoisoned) check to this
function and skips all such entries.
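
For context, a hugetlb migration entry is a non-present, swap-format entry
pointing at the page under migration, so feeding it to huge_pte_modify()
and writing it back would corrupt it. The check relied on here looks
roughly like this in mainline at the time (shown only for reference, not
part of this diff):

  static int is_hugetlb_entry_migration(pte_t pte)
  {
          swp_entry_t swp;

          if (huge_pte_none(pte) || pte_present(pte))
                  return 0;
          swp = pte_to_swp_entry(pte);
          if (non_swap_entry(swp) && is_migration_entry(swp))
                  return 1;
          else
                  return 0;
  }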

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>  # [3.12+]
---
 mm/hugetlb.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
index 863f45f63cd5..1da7ca2e2a02 100644
--- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
+++ mmotm-2014-07-22-15-58/mm/hugetlb.c
@@ -3355,7 +3355,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			spin_unlock(ptl);
 			continue;
 		}
-		if (!huge_pte_none(huge_ptep_get(ptep))) {
+		pte = huge_ptep_get(ptep);
+		if (unlikely(is_hugetlb_entry_migration(pte) ||
+			     is_hugetlb_entry_hwpoisoned(pte))) {
+			spin_unlock(ptl);
+			continue;
+		}
+		if (!huge_pte_none(pte)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
  2014-08-01 17:37 ` Naoya Horiguchi
@ 2014-08-01 21:53   ` Andrew Morton
  -1 siblings, 0 replies; 26+ messages in thread
From: Andrew Morton @ 2014-08-01 21:53 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Hugh Dickins, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Fri,  1 Aug 2014 13:37:41 -0400 Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> wrote:

> We have a race condition between move_pages() and freeing hugepages,
> where move_pages() calls follow_page(FOLL_GET) for hugepages internally
> and tries to get its refcount without preventing concurrent freeing.
> This race crashes the kernel, so this patch fixes it by moving FOLL_GET
> code for hugepages into follow_huge_pmd() with taking the page table lock.
> 
> This patch passes the following test. And libhugetlbfs test shows no
> regression.
> 
> ...

How were these bugs discovered?  Are we missing some Reported-by's?

> --- mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h
> +++ mmotm-2014-07-22-15-58/include/linux/hugetlb.h
> @@ -101,6 +101,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>  				pmd_t *pmd, int write);
>  struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
>  				pud_t *pud, int write);
> +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> +				unsigned long address, pmd_t *pmd, int flags);
>  int pmd_huge(pmd_t pmd);
>  int pud_huge(pud_t pmd);
>  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>
> ...
>
> --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> @@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
>  
>  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>  
> +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> +				unsigned long address, pmd_t *pmd, int flags)

Some documentation here wouldn't hurt.  Why it exists, what it does. 
And especially: any preconditions to calling it (ie: locking).

> +{
> +	struct page *page;
> +	spinlock_t *ptl;
> +
> +	if (flags & FOLL_GET)
> +		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
> +
> +	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
> +
> +	if (flags & FOLL_GET) {
> +		/*
> +		 * Refcount on tail pages are not well-defined and
> +		 * shouldn't be taken. The caller should handle a NULL
> +		 * return when trying to follow tail pages.
> +		 */
> +		if (PageHead(page))
> +			get_page(page);
> +		else
> +			page = NULL;
> +		spin_unlock(ptl);
> +	}
> +
> +	return page;
> +}
> +
>  #ifdef CONFIG_MEMORY_FAILURE

I can't find an implementation of follow_huge_pmd() which actually uses
the fourth argument "write".  Zap?

Ditto for follow_huge_pud().


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
  2014-08-01 21:53   ` Andrew Morton
@ 2014-08-01 21:58     ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-01 21:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Fri, Aug 01, 2014 at 02:53:58PM -0700, Andrew Morton wrote:
> On Fri,  1 Aug 2014 13:37:41 -0400 Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> wrote:
> 
> > We have a race condition between move_pages() and freeing hugepages,
> > where move_pages() calls follow_page(FOLL_GET) for hugepages internally
> > and tries to get its refcount without preventing concurrent freeing.
> > This race crashes the kernel, so this patch fixes it by moving FOLL_GET
> > code for hugepages into follow_huge_pmd() with taking the page table lock.
> > 
> > This patch passes the following test. And libhugetlbfs test shows no
> > regression.
> > 
> > ...
> 
> How were these bugs discovered?  Are we missing some Reported-by's?

Hugh pointed this out a few months ago, so I should have added his
Reported-by tag.

> > --- mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h
> > +++ mmotm-2014-07-22-15-58/include/linux/hugetlb.h
> > @@ -101,6 +101,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> >  				pmd_t *pmd, int write);
> >  struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> >  				pud_t *pud, int write);
> > +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> > +				unsigned long address, pmd_t *pmd, int flags);
> >  int pmd_huge(pmd_t pmd);
> >  int pud_huge(pud_t pmd);
> >  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> >
> > ...
> >
> > --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> > +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> > @@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
> >  
> >  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
> >  
> > +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> > +				unsigned long address, pmd_t *pmd, int flags)
> 
> Some documentation here wouldn't hurt.  Why it exists, what it does. 
> And especially: any preconditions to calling it (ie: locking).
> 
> > +{
> > +	struct page *page;
> > +	spinlock_t *ptl;
> > +
> > +	if (flags & FOLL_GET)
> > +		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
> > +
> > +	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
> > +
> > +	if (flags & FOLL_GET) {
> > +		/*
> > +		 * Refcount on tail pages are not well-defined and
> > +		 * shouldn't be taken. The caller should handle a NULL
> > +		 * return when trying to follow tail pages.
> > +		 */
> > +		if (PageHead(page))
> > +			get_page(page);
> > +		else
> > +			page = NULL;
> > +		spin_unlock(ptl);
> > +	}
> > +
> > +	return page;
> > +}
> > +
> >  #ifdef CONFIG_MEMORY_FAILURE
> 
> I can't find an implementation of follow_huge_pmd() which actually uses
> the fourth argument "write".  Zap?

OK, I'll post it later.

Thanks,
Naoya Horiguchi

> 
> Ditto for follow_huge_pud().

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
  2014-08-01 21:53   ` Andrew Morton
@ 2014-08-04 15:29     ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-04 15:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Fri, Aug 01, 2014 at 02:53:58PM -0700, Andrew Morton wrote:
...
> > --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> > +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> > @@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
> >  
> >  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
> >  
> > +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> > +				unsigned long address, pmd_t *pmd, int flags)
> 
> Some documentation here wouldn't hurt.  Why it exists, what it does. 
> And especially: any preconditions to calling it (ie: locking).

Sorry, I missed this comment.
How about this?

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1da7ca2e2a02..923465c0b47f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3693,6 +3693,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
+/*
+ * This function calls the architecture-dependent variant follow_huge_pmd()
+ * while holding the page table lock when FOLL_GET is specified.
+ * Whether or not hugepage migration is supported, follow_page() can be
+ * called with FOLL_GET from do_move_page_to_node_array(), so we need to
+ * do this in common code.
+ * Must be called with mmap_sem held for read.
+ */
 struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
 				unsigned long address, pmd_t *pmd, int flags)
 {

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] mm/hugetlb: remove unused argument of follow_huge_(pmd|pud)
  2014-08-01 21:58     ` Naoya Horiguchi
@ 2014-08-04 15:50       ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-04 15:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Fri, Aug 01, 2014 at 05:58:45PM -0400, Naoya Horiguchi wrote:
...
> > I can't find an implementation of follow_huge_pmd() which actually uses
> > the fourth argument "write".  Zap?
> 
> OK, I'll post it later.

Here is the patch.

Thanks,
Naoya Horiguchi
---
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Mon, 4 Aug 2014 11:30:00 -0400
Subject: [PATCH] mm/hugetlb: remove unused argument of follow_huge_(pmd|pud)

There is no implementation of follow_huge_pmd() which actually uses
the fourth argument "write". So let's zap it.

Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
---
 arch/ia64/mm/hugetlbpage.c    |  2 +-
 arch/metag/mm/hugetlbpage.c   |  2 +-
 arch/mips/mm/hugetlbpage.c    |  2 +-
 arch/powerpc/mm/hugetlbpage.c |  2 +-
 arch/s390/mm/hugetlbpage.c    |  2 +-
 arch/sh/mm/hugetlbpage.c      |  2 +-
 arch/sparc/mm/hugetlbpage.c   |  2 +-
 arch/tile/mm/hugetlbpage.c    |  4 ++--
 arch/x86/mm/hugetlbpage.c     |  2 +-
 include/linux/hugetlb.h       |  8 ++++----
 mm/gup.c                      |  2 +-
 mm/hugetlb.c                  | 11 ++++-------
 12 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 76069c18ee42..d14ae6804106 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -115,7 +115,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd)
 {
 	return NULL;
 }
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 3c52fa6d0f8e..e2559b12f6aa 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -111,7 +111,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
+			     pmd_t *pmd)
 {
 	return NULL;
 }
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 4ec8ee10d371..690749184d0f 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -86,7 +86,7 @@ int pud_huge(pud_t pud)
 
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+		pmd_t *pmd)
 {
 	struct page *page;
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7e70ae968e5f..8339978033ad 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -700,7 +700,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+		pmd_t *pmd)
 {
 	BUG();
 	return NULL;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 0ff66a7e29bb..abbdee629790 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -221,7 +221,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmdp, int write)
+			     pmd_t *pmdp)
 {
 	struct page *page;
 
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index d7762349ea48..1f636ed3ffcd 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -84,7 +84,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
+			     pmd_t *pmd)
 {
 	return NULL;
 }
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index d329537739c6..4cb2b4820bd5 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -232,7 +232,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
+			     pmd_t *pmd)
 {
 	return NULL;
 }
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index e514899e1100..3c07a555b9b9 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -167,7 +167,7 @@ int pud_huge(pud_t pud)
 }
 
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
+			     pmd_t *pmd)
 {
 	struct page *page;
 
@@ -178,7 +178,7 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-			     pud_t *pud, int write)
+			     pud_t *pud)
 {
 	struct page *page;
 
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8b977ebf9388..fc72bb59141c 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -54,7 +54,7 @@ int pud_huge(pud_t pud)
 
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+		pmd_t *pmd)
 {
 	return NULL;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 194834853d6f..3a8b338ff84a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -98,9 +98,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int write);
+				pmd_t *pmd);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-				pud_t *pud, int write);
+				pud_t *pud);
 struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
 				unsigned long address, pmd_t *pmd, int flags);
 int pmd_huge(pmd_t pmd);
@@ -134,8 +134,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 static inline void hugetlb_show_meminfo(void)
 {
 }
-#define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define follow_huge_pud(mm, addr, pud, write)	NULL
+#define follow_huge_pmd(mm, addr, pmd)	NULL
+#define follow_huge_pud(mm, addr, pud)	NULL
 #define follow_huge_pmd_lock(vma, addr, pmd, flags)	NULL
 #define prepare_hugepage_range(file, addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
diff --git a/mm/gup.c b/mm/gup.c
index e4bd59efe686..57e394e33e66 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -165,7 +165,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 		if (flags & FOLL_GET)
 			return NULL;
-		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		page = follow_huge_pud(mm, address, pud);
 		return page;
 	}
 	if (unlikely(pud_bad(*pud)))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 923465c0b47f..4f6d1a4f5339 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3657,8 +3657,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 }
 
 struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd)
 {
 	struct page *page;
 
@@ -3669,8 +3668,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 
 struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int write)
+follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud)
 {
 	struct page *page;
 
@@ -3684,8 +3682,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 /* Can be overriden by architectures */
 struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-	       pud_t *pud, int write)
+follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud)
 {
 	BUG();
 	return NULL;
@@ -3710,7 +3707,7 @@ struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
 	if (flags & FOLL_GET)
 		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 
-	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
+	page = follow_huge_pmd(vma->vm_mm, address, pmd);
 
 	if (flags & FOLL_GET) {
 		/*
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
  2014-08-01 17:37 ` Naoya Horiguchi
@ 2014-08-09 23:01   ` Hugh Dickins
  -1 siblings, 0 replies; 26+ messages in thread
From: Hugh Dickins @ 2014-08-09 23:01 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Hugh Dickins, David Rientjes, linux-mm,
	linux-kernel, Naoya Horiguchi

On Fri, 1 Aug 2014, Naoya Horiguchi wrote:

> We have a race condition between move_pages() and freeing hugepages,
> where move_pages() calls follow_page(FOLL_GET) for hugepages internally
> and tries to get its refcount without preventing concurrent freeing.
> This race crashes the kernel, so this patch fixes it by moving FOLL_GET
> code for hugepages into follow_huge_pmd() with taking the page table lock.
> 
> This patch passes the following test. And libhugetlbfs test shows no
> regression.
> 
>   $ cat movepages.c
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <numaif.h>
> 
>   #define ADDR_INPUT      0x700000000000UL
>   #define HPS             0x200000
>   #define PS              0x1000
> 
>   int main(int argc, char *argv[]) {
>           int i;
>           int nr_hp = strtol(argv[1], NULL, 0);
>           int nr_p  = nr_hp * HPS / PS;
>           int ret;
>           void **addrs;
>           int *status;
>           int *nodes;
>           pid_t pid;
> 
>           pid = strtol(argv[2], NULL, 0);
>           addrs  = malloc(sizeof(char *) * nr_p + 1);
>           status = malloc(sizeof(char *) * nr_p + 1);
>           nodes  = malloc(sizeof(char *) * nr_p + 1);
> 
>           while (1) {
>                   for (i = 0; i < nr_p; i++) {
>                           addrs[i] = (void *)ADDR_INPUT + i * PS;
>                           nodes[i] = 1;
>                           status[i] = 0;
>                   }
>                   ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
>                                         MPOL_MF_MOVE_ALL);
>                   if (ret == -1)
>                           err("move_pages");
> 
>                   for (i = 0; i < nr_p; i++) {
>                           addrs[i] = (void *)ADDR_INPUT + i * PS;
>                           nodes[i] = 0;
>                           status[i] = 0;
>                   }
>                   ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
>                                         MPOL_MF_MOVE_ALL);
>                   if (ret == -1)
>                           err("move_pages");
>           }
>           return 0;
>   }
> 
>   $ cat hugepage.c
>   #include <stdio.h>
>   #include <sys/mman.h>
>   #include <string.h>
> 
>   #define ADDR_INPUT      0x700000000000UL
>   #define HPS             0x200000
> 
>   int main(int argc, char *argv[]) {
>           int nr_hp = strtol(argv[1], NULL, 0);
>           char *p;
> 
>           while (1) {
>                   p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
>                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
>                   if (p != (void *)ADDR_INPUT) {
>                           perror("mmap");
>                           break;
>                   }
>                   memset(p, 0, nr_hp * HPS);
>                   munmap(p, nr_hp * HPS);
>           }
>   }
> 
>   $ sysctl vm.nr_hugepages=40
>   $ ./hugepage 10 &
>   $ ./movepages 10 $(pgrep -f hugepage)
> 
> Note for stable inclusion:
>   This patch fixes e632a938d914 ("mm: migrate: add hugepage migration code
>   to move_pages()"), so is applicable to -stable kernels which includes it.
> 
> ChangeLog v2:
> - introduce follow_huge_pmd_lock() to do locking in arch-independent code.
> 
> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: <stable@vger.kernel.org>  # [3.12+]
> ---
>  include/linux/hugetlb.h |  3 +++
>  mm/gup.c                | 17 ++---------------
>  mm/hugetlb.c            | 27 +++++++++++++++++++++++++++
>  3 files changed, 32 insertions(+), 15 deletions(-)
> 
> diff --git mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h mmotm-2014-07-22-15-58/include/linux/hugetlb.h
> index 41272bcf73f8..194834853d6f 100644
> --- mmotm-2014-07-22-15-58.orig/include/linux/hugetlb.h
> +++ mmotm-2014-07-22-15-58/include/linux/hugetlb.h
> @@ -101,6 +101,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>  				pmd_t *pmd, int write);
>  struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
>  				pud_t *pud, int write);
> +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> +				unsigned long address, pmd_t *pmd, int flags);
>  int pmd_huge(pmd_t pmd);
>  int pud_huge(pud_t pmd);
>  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> @@ -134,6 +136,7 @@ static inline void hugetlb_show_meminfo(void)
>  }
>  #define follow_huge_pmd(mm, addr, pmd, write)	NULL
>  #define follow_huge_pud(mm, addr, pud, write)	NULL
> +#define follow_huge_pmd_lock(vma, addr, pmd, flags)	NULL
>  #define prepare_hugepage_range(file, addr, len)	(-EINVAL)
>  #define pmd_huge(x)	0
>  #define pud_huge(x)	0
> diff --git mmotm-2014-07-22-15-58.orig/mm/gup.c mmotm-2014-07-22-15-58/mm/gup.c
> index 91d044b1600d..e4bd59efe686 100644
> --- mmotm-2014-07-22-15-58.orig/mm/gup.c
> +++ mmotm-2014-07-22-15-58/mm/gup.c
> @@ -174,21 +174,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>  	pmd = pmd_offset(pud, address);
>  	if (pmd_none(*pmd))
>  		return no_page_table(vma, flags);
> -	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
> -		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
> -		if (flags & FOLL_GET) {
> -			/*
> -			 * Refcount on tail pages are not well-defined and
> -			 * shouldn't be taken. The caller should handle a NULL
> -			 * return when trying to follow tail pages.
> -			 */
> -			if (PageHead(page))
> -				get_page(page);
> -			else
> -				page = NULL;
> -		}
> -		return page;
> -	}
> +	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB)
> +		return follow_huge_pmd_lock(vma, address, pmd, flags);

Yes, that's good (except I don't like the _lock name).

>  	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
>  		return no_page_table(vma, flags);
>  	if (pmd_trans_huge(*pmd)) {
> diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
> index 7263c770e9b3..4437896cd6ed 100644
> --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> @@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
>  
>  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>  
> +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> +				unsigned long address, pmd_t *pmd, int flags)
> +{
> +	struct page *page;
> +	spinlock_t *ptl;
> +
> +	if (flags & FOLL_GET)
> +		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
> +

But this is not good enough, I'm afraid.

> +	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
> +
> +	if (flags & FOLL_GET) {
> +		/*
> +		 * Refcount on tail pages are not well-defined and
> +		 * shouldn't be taken. The caller should handle a NULL
> +		 * return when trying to follow tail pages.
> +		 */
> +		if (PageHead(page))
> +			get_page(page);
> +		else
> +			page = NULL;
> +		spin_unlock(ptl);
> +	}
> +
> +	return page;
> +}
> +
>  #ifdef CONFIG_MEMORY_FAILURE
>  
>  /* Should be called in hugetlb_lock */
> -- 
> 1.9.3

Thanks a lot for remembering this, but it's not enough, I think.

It is an improvement over the current code (except for the annoying new
level, and its confusing name follow_huge_pmd_lock); but I don't want to
keep on coming back, repeatedly sending new corrections to four or more
releases of -stable.  Please let's get it right and be done with it.

I see two problems with the above, but perhaps I'm mistaken.

One is hugetlb_vmtruncate(): follow_huge_pmd_lock() is only called
when we have observed pmd_huge(*pmd), fine, but how can we assume
that pmd_huge(*pmd) still holds after getting the necessary huge_pte_lock?
Truncation could have changed that *pmd to none, and then pte_page()
will supply an incorrect (but non-NULL) address.

(I observe the follow_huge_pmd()s all doing an "if (page)" after
their pte_page(), but when I checked at the time of the original
follow_huge_addr() problem, I could not find any architecture with
a pte_page() returning NULL for an invalid entry - pte_page() is
a simple blind translation in every architecture, I believe, but
please check.)

Two is x86-32 PAE (and perhaps some other architectures), in which
the pmd entry spans two machine words, loaded independently.  It's a
very narrow race window, but we cannot safely access the whole *pmd
without locking: we might pick up two mismatched halves.  Take a look
at pte_unmap_same() in mm/memory.c, it's handling that issue on ptes.
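
(For reference, pte_unmap_same() amounts to roughly this - quoted from
memory, comment mine, so please check the tree rather than trusting me:

static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		/* re-check under the lock: an unlocked read may have
		 * seen two mismatched halves of the entry */
		spinlock_t *ptl = pte_lockptr(mm, pmd);
		spin_lock(ptl);
		same = pte_same(*page_table, orig_pte);
		spin_unlock(ptl);
	}
#endif
	pte_unmap(page_table);
	return same;
}

Once we take huge_pte_lock before ever dereferencing *pmd, as in the
follow_huge_pmd() below, that problem disappears for free.)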

So, if I follow my distaste for the intermediate follow_huge_pmd_lock
level (and in patch 4/3 you are already changing all the declarations,
so no need to be deterred by that task), I think what we need is:

struct page *
follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags)
{
	struct page *page;
	spinlock_t *ptl;

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);

	if (!pmd_huge(*pmd)) {
		page = NULL;
		goto out;
	}

	page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);

	if (flags & FOLL_GET) {
		/*
		 * Refcount on tail pages are not well-defined and
		 * shouldn't be taken. The caller should handle a NULL
		 * return when trying to follow tail pages.
		 */
		if (PageHead(page))
			get_page(page);
		else
			page = NULL;
	}
out:
	spin_unlock(ptl);
	return page;
}

Yes, there are many !FOLL_GET cases which could use an ACCESS_ONCE(*pmd)
and avoid taking the lock; but follow_page_pte() is content to take its
lock in all cases, so I don't see any need to avoid it in this much less
common case.

And it looks to me as if this follow_huge_pmd() would be good for every
hugetlb architecture (but I may especially be wrong on that, having
compiled it for none but x86_64).  On some architectures, the ones which
currently present just a stub follow_huge_pmd(), the optimizer should
eliminate everything after the !pmd_huge test, and we won't be calling
it on those anyway.  On mips s390 and tile, I think the above represents
what they're currently doing, despite some saying HPAGE_MASK in place of
PMD_MASK, and having that funny "if (page)" condition after pte_page().

Please check carefully: I think the above follow_huge_pmd() can sit in
mm/hugetlb.c, for use on all architectures; and the variants be deleted;
and I think that would be an improvement.

I'm not sure what should happen to follow_huge_pud() if we go this way.
There's a good argument for adapting it in exactly the same way, but
that may not appeal to those wanting to remove the never used argument.

And, please, let's go just a little further, while we are having to
think of these issues.  Isn't what we're doing here much the same as
we need to do to follow_huge_addr(), to fix the May 28th issues which
led you to disable hugetlb migration on all but x86_64?

I'm not arguing to re-enable hugetlb migration on those architectures
which you cannot test, no, you did the right thing to leave that to
them.  But could we please update follow_huge_addr() (in a separate
patch) to make it consistent with this follow_huge_pmd(), so that at
least you can tell maintainers that you believe it is working now?

Uh oh.  I thought I had finished writing about this patch, but just
realized more.  Above you can see that I've faithfully copied your
"Refcount on tail pages are not well-defined" comment and !PageHead
NULL.  But that's nonsense, isn't it?  Refcount on tail pages is and
must be well-defined, and that's been handled in follow_hugetlb_page()
for, well, at least ten years.

But note the "Some archs" comment in follow_hugetlb_page(): I have
not followed it up, and it may prove to be irrelevant here; but it
suggests that in general some additional care might be needed for
the get_page()s - or perhaps they should now be get_page_folls()?

I guess the "not well-defined" comment was your guess as to why I had
put in the BUG_ON(flags & FOLL_GET)s: no, they were because nobody
required huge FOLL_GET at that time, and that case lacked the locking
which you are now supplying.

Hugh

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/3] mm/hugetlb: use get_page_unless_zero() in hugetlb_fault()
  2014-08-01 17:37   ` Naoya Horiguchi
@ 2014-08-09 23:11     ` Hugh Dickins
  -1 siblings, 0 replies; 26+ messages in thread
From: Hugh Dickins @ 2014-08-09 23:11 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Hugh Dickins, David Rientjes, Chris Metcalf,
	linux-mm, linux-kernel, Naoya Horiguchi

On Fri, 1 Aug 2014, Naoya Horiguchi wrote:

> After fixing locking in follow_page(FOLL_GET) for hugepages, I start to
> observe the BUG of "get_page() on refcount 0 page" in hugetlb_fault() in
> the same test.
> 
> I'm not exactly sure about how this race is triggered, but hugetlb_fault()
> calls pte_page() and get_page() outside page table lock, so it's not safe.
> This patch checks the refcount of the gotten page, and aborts the page fault
> if the refcount is 0, expecting to retry.
> 

Fixes: 66aebce747ea ("hugetlb: fix race condition in hugetlb_fault()")

> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: <stable@vger.kernel.org>  # [3.12+]


I disagree with your 3.12+ annotation there: you may have hit the issue
in testing your hugepage migration work, but it's older than that: the
problematic get_page() was introduced in 3.4, and has been backported
to 3.2-stable: so 3.2+.

I was suspicious of this patch at first, then on the point of giving it
an Ack, and then realized that I had been right to be suspicious of it.

You're not the first to get the sequence wrong here; and it won't be
surprising if there are other instances of subtle get_page_unless_zero()
misuse elsewhere in the tree (I dare not look!  someone else please do).

It's not the use of get_page_unless_zero() itself that is wrong, it's
the unjustified confidence in it: what's wrong is the lock_page() after.

As you have found, and acknowledged by using get_page_unless_zero(),
the page here may be stale: it might be already freed, it might
be already reused.  If reused, then its page_count will no longer be 0,
but the new user expects to have sole ownership of the page.  The new
owner might be using __set_page_locked() (or one of the other nonatomic
flags operations), or "if (!trylock_page(newpage)) BUG()" like
migration's move_to_new_page().

We are dealing with a recently-hugetlb page here: that might make the
race I'm describing even less likely than with usual order:0 pages,
but I don't think it eliminates it.

What to do instead?  The first answer that occurs to me is to move
the pte_page,get_page down after the pte_same check inside the spin_lock,
and only then do trylock_page(), backing out to wait_on_page_locked and
retry or refault if not.
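
In outline, something like this untested sketch (pagecache_page setup,
the fault mutex and the exact backout labels are omitted, and
need_wait_lock is just a hypothetical local flag):

	spin_lock(ptl);
	if (unlikely(!pte_same(huge_ptep_get(ptep), entry)))
		goto out_ptl;		/* pte changed: back out and refault */

	page = pte_page(entry);
	get_page(page);			/* now taken under the page table lock */

	if (page != pagecache_page && !trylock_page(page)) {
		/* must not sleep on the page lock while holding ptl */
		need_wait_lock = 1;
		goto out_ptl;
	}

with the backout path extended so that, after dropping all the locks,
it does wait_on_page_locked(page) and put_page(page) before returning 0
to let the fault be retried.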

Though if doing that, it might be more sensible only to trylock_page
before dropping ptl inside hugetlb_cow().  That would be a bigger,
maybe harder to backport, rearrangement.

What do you think?

Hugh

> ---
>  mm/hugetlb.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
> index 4437896cd6ed..863f45f63cd5 100644
> --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> @@ -3189,7 +3189,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	 * so no worry about deadlock.
>  	 */
>  	page = pte_page(entry);
> -	get_page(page);
> +	if (!get_page_unless_zero(page))
> +		goto out_put_pagecache;
>  	if (page != pagecache_page)
>  		lock_page(page);
>  
> @@ -3215,15 +3216,14 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  
>  out_ptl:
>  	spin_unlock(ptl);
> -
> +	if (page != pagecache_page)
> +		unlock_page(page);
> +	put_page(page);
> +out_put_pagecache:
>  	if (pagecache_page) {
>  		unlock_page(pagecache_page);
>  		put_page(pagecache_page);
>  	}
> -	if (page != pagecache_page)
> -		unlock_page(page);
> -	put_page(page);
> -
>  out_mutex:
>  	mutex_unlock(&htlb_fault_mutex_table[hash]);
>  	return ret;
> -- 
> 1.9.3

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 3/3] mm/hugetlb: add migration entry check in hugetlb_change_protection
  2014-08-01 17:37   ` Naoya Horiguchi
@ 2014-08-09 23:12     ` Hugh Dickins
  -1 siblings, 0 replies; 26+ messages in thread
From: Hugh Dickins @ 2014-08-09 23:12 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, Hugh Dickins, David Rientjes, linux-mm,
	linux-kernel, Naoya Horiguchi

On Fri, 1 Aug 2014, Naoya Horiguchi wrote:

> There is a race condition between hugepage migration and change_protection(),
> where hugetlb_change_protection() doesn't care about migration entries and
> wrongly overwrites them. That causes unexpected results like kernel crash.
> 
> This patch adds is_hugetlb_entry_(migration|hwpoisoned) check in this
> function and skip all such entries.
> 
> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: <stable@vger.kernel.org>  # [3.12+]
> ---
>  mm/hugetlb.c | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
> index 863f45f63cd5..1da7ca2e2a02 100644
> --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> @@ -3355,7 +3355,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>  			spin_unlock(ptl);
>  			continue;
>  		}
> -		if (!huge_pte_none(huge_ptep_get(ptep))) {
> +		pte = huge_ptep_get(ptep);
> +		if (unlikely(is_hugetlb_entry_migration(pte) ||
> +			     is_hugetlb_entry_hwpoisoned(pte))) {

Another instance of this pattern.  Oh well, perhaps we have to continue
this way while backporting fixes, but the repetition irritates me.
Or use is_swap_pte() as follow_hugetlb_page() does?

More importantly, the regular change_pte_range() has to
make_migration_entry_read() if is_migration_entry_write():
why is that not necessary here?
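
What I would have expected is something closer to this rough sketch
(untested, reusing the swapops helpers that change_pte_range() uses;
whether the hwpoisoned case should simply be skipped I leave to you):

		pte = huge_ptep_get(ptep);
		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
			spin_unlock(ptl);
			continue;
		}
		if (unlikely(is_hugetlb_entry_migration(pte))) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (is_write_migration_entry(entry)) {
				/* write-protect the migration entry too */
				make_migration_entry_read(&entry);
				set_huge_pte_at(mm, address, ptep,
						swp_entry_to_pte(entry));
			}
			spin_unlock(ptl);
			continue;
		}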

And have you compared hugetlb codepaths with normal codepaths, to see
if there are other huge places which need to check for a migration entry
now?  If you have checked, please reassure us in the commit message:
we would prefer not to have these fixes coming in one by one.

(I first thought __unmap_hugepage_range() would need it, but since
zap_pte_range() only checks it for rss stats, and hugetlb does not
participate in rss stats, it looks like no need.)

Hugh

> +			spin_unlock(ptl);
> +			continue;
> +		}
> +		if (!huge_pte_none(pte)) {
>  			pte = huge_ptep_get_and_clear(mm, address, ptep);
>  			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
>  			pte = arch_make_huge_pte(pte, vma, NULL, 0);
> -- 
> 1.9.3

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd()
  2014-08-09 23:01   ` Hugh Dickins
@ 2014-08-12 18:55     ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-12 18:55 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Sat, Aug 09, 2014 at 04:01:39PM -0700, Hugh Dickins wrote:
> On Fri, 1 Aug 2014, Naoya Horiguchi wrote:
...
> > diff --git mmotm-2014-07-22-15-58.orig/mm/gup.c mmotm-2014-07-22-15-58/mm/gup.c
> > index 91d044b1600d..e4bd59efe686 100644
> > --- mmotm-2014-07-22-15-58.orig/mm/gup.c
> > +++ mmotm-2014-07-22-15-58/mm/gup.c
> > @@ -174,21 +174,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
> >  	pmd = pmd_offset(pud, address);
> >  	if (pmd_none(*pmd))
> >  		return no_page_table(vma, flags);
> > -	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
> > -		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
> > -		if (flags & FOLL_GET) {
> > -			/*
> > -			 * Refcount on tail pages are not well-defined and
> > -			 * shouldn't be taken. The caller should handle a NULL
> > -			 * return when trying to follow tail pages.
> > -			 */
> > -			if (PageHead(page))
> > -				get_page(page);
> > -			else
> > -				page = NULL;
> > -		}
> > -		return page;
> > -	}
> > +	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB)
> > +		return follow_huge_pmd_lock(vma, address, pmd, flags);
> 
> Yes, that's good (except I don't like the _lock name).
> 
> >  	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
> >  		return no_page_table(vma, flags);
> >  	if (pmd_trans_huge(*pmd)) {
> > diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
> > index 7263c770e9b3..4437896cd6ed 100644
> > --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> > +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> > @@ -3687,6 +3687,33 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
> >  
> >  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
> >  
> > +struct page *follow_huge_pmd_lock(struct vm_area_struct *vma,
> > +				unsigned long address, pmd_t *pmd, int flags)
> > +{
> > +	struct page *page;
> > +	spinlock_t *ptl;
> > +
> > +	if (flags & FOLL_GET)
> > +		ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
> > +
> 
> But this is not good enough, I'm afraid.
> 
> > +	page = follow_huge_pmd(vma->vm_mm, address, pmd, flags & FOLL_WRITE);
> > +
> > +	if (flags & FOLL_GET) {
> > +		/*
> > +		 * Refcount on tail pages are not well-defined and
> > +		 * shouldn't be taken. The caller should handle a NULL
> > +		 * return when trying to follow tail pages.
> > +		 */
> > +		if (PageHead(page))
> > +			get_page(page);
> > +		else
> > +			page = NULL;
> > +		spin_unlock(ptl);
> > +	}
> > +
> > +	return page;
> > +}
> > +
> >  #ifdef CONFIG_MEMORY_FAILURE
> >  
> >  /* Should be called in hugetlb_lock */
> > -- 
> > 1.9.3
> 
> Thanks a lot for remembering this, but it's not enough, I think.

Thank you very much for detailed comments.

> It is an improvement over the current code (except for the annoying new
> level, and its confusing name follow_huge_pmd_lock); but I don't want to
> keep on coming back, repeatedly sending new corrections to four or more
> releases of -stable.  Please let's get it right and be done with it.
> 
> I see two problems with the above, but perhaps I'm mistaken.
> 
> One is hugetlb_vmtruncate(): follow_huge_pmd_lock() is only called
> when we have observed pmd_huge(*pmd), fine, but how can we assume
> that pmd_huge(*pmd) still after getting the necessary huge_pte_lock?
> Truncation could have changed that *pmd to none, and then pte_page()
> will supply an incorrect (but non-NULL) address.

OK, this race window is still open, so double-checking pmd_huge(*pmd)
after taking huge_pte_lock() should be necessary.

> (I observe the follow_huge_pmd()s all doing an "if (page)" after
> their pte_page(), but when I checked at the time of the original
> follow_huge_addr() problem, I could not find any architecture with
> a pte_page() returning NULL for an invalid entry - pte_page() is
> a simple blind translation in every architecture, I believe, but
> please check.)

I agree that no implementation of pte_page() has an invalid entry check.
They all essentially do (base address + pfn calculated from pte_val), so
they never return NULL for any regular input.

> Two is x86-32 PAE (and perhaps some other architectures), in which
> the pmd entry spans two machine words, loaded independently.  It's a
> very narrow race window, but we cannot safely access the whole *pmd
> without locking: we might pick up two mismatched halves.  Take a look
> at pte_unmap_same() in mm/memory.c, it's handling that issue on ptes.

OK, so ...

> So, if I follow my distaste for the intermediate follow_huge_pmd_lock
> level (and in patch 4/3 you are already changing all the declarations,
> so no need to be deterred by that task), I think what we need is:
> 
> struct page *
> follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
> 		pmd_t *pmd, unsigned int flags)
> {
> 	struct page *page;
> 	spinlock_t *ptl;
> 
> 	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
> 
> 	if (!pmd_huge(*pmd)) {

.. I guess that checking pte_same() is better than pmd_huge(), because
it also covers your 2nd concern above?

> 		page = NULL;
> 		goto out;
> 	}
> 
> 	page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> 
> 	if (flags & FOLL_GET) {
> 		/*
> 		 * Refcount on tail pages are not well-defined and
> 		 * shouldn't be taken. The caller should handle a NULL
> 		 * return when trying to follow tail pages.
> 		 */
> 		if (PageHead(page))
> 			get_page(page);
> 		else
> 			page = NULL;
> 	}
> out:
> 	spin_unlock(ptl);
> 	return page;
> }
> 
> Yes, there are many !FOLL_GET cases which could use an ACCESS_ONCE(*pmd)
> and avoid taking the lock; but follow_page_pte() is content to take its
> lock in all cases, so I don't see any need to avoid it in this much less
> common case.

OK.

> And it looks to me as if this follow_huge_pmd() would be good for every
> hugetlb architecture (but I may especially be wrong on that, having
> compiled it for none but x86_64).  On some architectures, the ones which
> currently present just a stub follow_huge_pmd(), the optimizer should
> eliminate everything after the !pmd_huge test, and we won't be calling
> it on those anyway.  On mips s390 and tile, I think the above represents
> what they're currently doing, despite some saying HPAGE_MASK in place of
> PMD_MASK, and having that funny "if (page)" condition after pte_page().

Yes, I think that we can replace all arch-dependent follow_huge_pmd()
with the above one.
Then we need to check hugepage_migration_supported() on the move_pages() side.

> Please check carefully: I think the above follow_huge_pmd() can sit in
> mm/hugetlb.c, for use on all architectures; and the variants be deleted;
> and I think that would be an improvement.

Yes, I'll try this.

> I'm not sure what should happen to follow_huge_pud() if we go this way.
> There's a good argument for adapting it in exactly the same way, but
> that may not appeal to those wanting to remove the never used argument.
> 
> And, please, let's go just a little further, while we are having to
> think of these issues.  Isn't what we're doing here much the same as
> we need to do to follow_huge_addr(), to fix the May 28th issues which
> led you to disable hugetlb migration on all but x86_64?

Currently follow_huge_addr() ignores FOLL_GET, so just fixing the locking
problem as this patch does is not enough for follow_huge_addr().
But when we re-enable hugepage migration, for example on powerpc, we will
need to consider exactly the same problem.

> I'm not arguing to re-enable hugetlb migration on those architectures
> which you cannot test, no, you did the right thing to leave that to
> them.  But could we please update follow_huge_addr() (in a separate
> patch) to make it consistent with this follow_huge_pmd(), so that at
> least you can tell maintainers that you believe it is working now?

OK, I'll do it. What we can do is to take the page table lock around pte_page(),
and to remove the unnecessary "if (page)" check, I think.

> Uh oh.  I thought I had finished writing about this patch, but just
> realized more.  Above you can see that I've faithfully copied your
> "Refcount on tail pages are not well-defined" comment and !PageHead
> NULL.  But that's nonsense, isn't it?  Refcount on tail pages is and
> must be well-defined, and that's been handled in follow_hugetlb_page()
> for, well, at least ten years.

Ah, this "it's not well-defined" was completely wrong. I'll remove it.

> 
> But note the "Some archs" comment in follow_hugetlb_page(): I have
> not followed it up, and it may prove to be irrelevant here; but it
> suggests that in general some additional care might be needed for
> the get_page()s - or perhaps they should now be get_page_folls()?

Yes, it's get_page_foll() now, and it internally pins tail pages by
incrementing page->_mapcount, so using get_page_unless_zero() in
follow_huge_pmd_lock() might be better (to skip tails naturally).
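
i.e. the FOLL_GET part might become just (a sketch, not tested):

	if (flags & FOLL_GET) {
		/*
		 * Tail pages keep _count at 0 (follow_hugetlb_page() pins
		 * them via _mapcount with get_page_foll()), so this also
		 * refuses tail pages without an explicit PageHead() check.
		 */
		if (!get_page_unless_zero(page))
			page = NULL;
	}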

Thanks,
Naoya Horiguchi

> I guess the "not well-defined" comment was your guess as to why I had
> put in the BUG_ON(flags & FOLL_GET)s: no, they were because nobody
> required huge FOLL_GET at that time, and that case lacked the locking
> which you are now supplying.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 2/3] mm/hugetlb: use get_page_unless_zero() in hugetlb_fault()
  2014-08-09 23:11     ` Hugh Dickins
@ 2014-08-12 18:55       ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-12 18:55 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, David Rientjes, Chris Metcalf, linux-mm,
	linux-kernel, Naoya Horiguchi

On Sat, Aug 09, 2014 at 04:11:06PM -0700, Hugh Dickins wrote:
> On Fri, 1 Aug 2014, Naoya Horiguchi wrote:
> 
> > After fixing locking in follow_page(FOLL_GET) for hugepages, I start to
> > observe the BUG of "get_page() on refcount 0 page" in hugetlb_fault() in
> > the same test.
> > 
> > I'm not exactly sure about how this race is triggered, but hugetlb_fault()
> > calls pte_page() and get_page() outside page table lock, so it's not safe.
> > This patch checks the refcount of the gotten page, and aborts the page fault
> > if the refcount is 0, expecting to retry.
> > 
> 
> Fixes: 66aebce747ea ("hugetlb: fix race condition in hugetlb_fault()")
> 
> > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> > Cc: <stable@vger.kernel.org>  # [3.12+]
> 
> 
> I disagree with your 3.12+ annotation there: you may have hit the issue
> in testing your hugepage migration work, but it's older than that: the
> problematic get_page() was introduced in 3.4, and has been backported
> to 3.2-stable: so 3.2+.

Right, thanks.

> I was suspicious of this patch at first, then on the point of giving it
> an Ack, and then realized that I had been right to be suspicious of it.
> 
> You're not the first to get the sequence wrong here; and it won't be
> surprising if there are other instances of subtle get_page_unless_zero()
> misuse elsewhere in the tree (I dare not look!  someone else please do).
> 
> It's not the use of get_page_unless_zero() itself that is wrong, it's
> the unjustified confidence in it: what's wrong is the lock_page() after.
> 
> As you have found, and acknowledged by using get_page_unless_zero(),
> the page here may be stale: it might be already freed, it might
> be already reused.  If reused, then its page_count will no longer be 0,
> but the new user expects to have sole ownership of the page.  The new
> owner might be using __set_page_locked() (or one of the other nonatomic
> flags operations), or "if (!trylock_page(newpage)) BUG()" like
> migration's move_to_new_page().
> 
> We are dealing with a recently-hugetlb page here: that might make the
> race I'm describing even less likely than with usual order:0 pages,
> but I don't think it eliminates it.

I agree.

> What to do instead?  The first answer that occurs to me is to move
> the pte_page,get_page down after the pte_same check inside the spin_lock,
> and only then do trylock_page(), backing out to wait_on_page_locked and
> retry or refault if not.

I think that should work.
According to the lock ordering commented in mm/rmap.c, the page lock comes
before the page table lock, so we can't take the page lock while already
holding the page table lock. But trylock_page() only checks whether the
page lock is taken, so with it we can avoid the deadlock.
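
So the ordering in hugetlb_fault() would become something like this
(rough sketch only; variable names are borrowed from hugetlb_fault(),
but the surrounding code is simplified):

  ptl = huge_pte_lock(h, mm, ptep);
  /* only trust the pte after re-checking it under the lock */
  if (unlikely(!pte_same(huge_ptep_get(ptep), entry))) {
          spin_unlock(ptl);
          return 0;               /* the pte changed: just refault */
  }
  page = pte_page(entry);
  get_page(page);                 /* safe: the pte still maps this page */
  if (!trylock_page(page)) {
          /* cannot sleep for the page lock while holding ptl: back out */
          spin_unlock(ptl);
          wait_on_page_locked(page);
          put_page(page);
          return 0;               /* retry the fault */
  }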

> Though if doing that, it might be more sensible only to trylock_page
> before dropping ptl inside hugetlb_cow().  That would be a bigger,
> maybe harder to backport, rearrangement.

Yes, the patch will be somewhat complicated for stable, and we can't
avoid that.

Thanks,
Naoya Horiguchi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 3/3] mm/hugetlb: add migration entry check in hugetlb_change_protection
  2014-08-09 23:12     ` Hugh Dickins
@ 2014-08-12 18:55       ` Naoya Horiguchi
  -1 siblings, 0 replies; 26+ messages in thread
From: Naoya Horiguchi @ 2014-08-12 18:55 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, David Rientjes, linux-mm, linux-kernel, Naoya Horiguchi

On Sat, Aug 09, 2014 at 04:12:09PM -0700, Hugh Dickins wrote:
> On Fri, 1 Aug 2014, Naoya Horiguchi wrote:
> 
> > There is a race condition between hugepage migration and change_protection(),
> > where hugetlb_change_protection() doesn't care about migration entries and
> > wrongly overwrites them. That causes unexpected results like kernel crash.
> > 
> > This patch adds is_hugetlb_entry_(migration|hwpoisoned) check in this
> > function and skip all such entries.
> > 
> > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> > Cc: <stable@vger.kernel.org>  # [3.12+]
> > ---
> >  mm/hugetlb.c | 8 +++++++-
> >  1 file changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git mmotm-2014-07-22-15-58.orig/mm/hugetlb.c mmotm-2014-07-22-15-58/mm/hugetlb.c
> > index 863f45f63cd5..1da7ca2e2a02 100644
> > --- mmotm-2014-07-22-15-58.orig/mm/hugetlb.c
> > +++ mmotm-2014-07-22-15-58/mm/hugetlb.c
> > @@ -3355,7 +3355,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> >  			spin_unlock(ptl);
> >  			continue;
> >  		}
> > -		if (!huge_pte_none(huge_ptep_get(ptep))) {
> > +		pte = huge_ptep_get(ptep);
> > +		if (unlikely(is_hugetlb_entry_migration(pte) ||
> > +			     is_hugetlb_entry_hwpoisoned(pte))) {
> 
> Another instance of this pattern.  Oh well, perhaps we have to continue
> this way while backporting fixes, but the repetition irritates me.

Yes, I thought about the repetition too, so at some point (hopefully
in this patchset?) it would be nice to fix up all the similar code.
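One option would be a small helper so that callers do not repeat the two
checks (hypothetical name, not an existing function):

  /* would live next to the two checks in mm/hugetlb.c */
  static inline bool is_hugetlb_entry_special(pte_t pte)
  {
          return is_hugetlb_entry_migration(pte) ||
                 is_hugetlb_entry_hwpoisoned(pte);
  }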

> Or use is_swap_pte() as follow_hugetlb_page() does?
>
> More importantly, the regular change_pte_range() has to
> make_migration_entry_read() if is_migration_entry_write():
> why is that not necessary here?

It's necessary for a migration entry. For a hwpoison entry, just unlocking
is ok.
(I focused on avoiding the bug and didn't think enough about the proper
fix, sorry.)
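
For the migration entry the fix would be along these lines, modelled on
change_pte_range() (rough sketch of the body inside the loop of
hugetlb_change_protection(), not the final patch):

  pte = huge_ptep_get(ptep);
  if (unlikely(is_hugetlb_entry_migration(pte))) {
          swp_entry_t entry = pte_to_swp_entry(pte);

          if (is_write_migration_entry(entry)) {
                  /* the entry must lose write permission as well */
                  make_migration_entry_read(&entry);
                  set_huge_pte_at(mm, address, ptep,
                                  swp_entry_to_pte(entry));
          }
          spin_unlock(ptl);
          continue;
  }
  if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
          spin_unlock(ptl);       /* nothing to convert for hwpoison */
          continue;
  }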

> And have you compared hugetlb codepaths with normal codepaths, to see
> if there are other huge places which need to check for a migration entry
> now?  If you have checked, please reassure us in the commit message:
> we would prefer not to have these fixes coming in one by one.

I've not checked all hugetlb codepaths yet, so I will do this.
(For example, free_pgtables() may need a check for a migration pmd entry.)

> (I first thought __unmap_hugepage_range() would need it, but since
> zap_pte_range() only checks it for rss stats, and hugetlb does not
> participate in rss stats, it looks like no need.)

You caught the point. I thought that the is_hugetlb_entry_migration()
check is necessary in __unmap_hugepage_range(), but didn't include it in
this patch just because it's not related to this specific problem.
That is an inefficient way to develop, though, so I'll include it in
the next version.

Thanks,
Naoya Horiguchi

> Hugh
> 
> > +			spin_unlock(ptl);
> > +			continue;
> > +		}
> > +		if (!huge_pte_none(pte)) {
> >  			pte = huge_ptep_get_and_clear(mm, address, ptep);
> >  			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
> >  			pte = arch_make_huge_pte(pte, vma, NULL, 0);
> > -- 
> > 1.9.3
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2014-08-12 19:53 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-01 17:37 [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd() Naoya Horiguchi
2014-08-01 17:37 ` Naoya Horiguchi
2014-08-01 17:37 ` [PATCH v2 2/3] mm/hugetlb: use get_page_unless_zero() in hugetlb_fault() Naoya Horiguchi
2014-08-01 17:37   ` Naoya Horiguchi
2014-08-09 23:11   ` Hugh Dickins
2014-08-09 23:11     ` Hugh Dickins
2014-08-12 18:55     ` Naoya Horiguchi
2014-08-12 18:55       ` Naoya Horiguchi
2014-08-01 17:37 ` [PATCH v2 3/3] mm/hugetlb: add migration entry check in hugetlb_change_protection Naoya Horiguchi
2014-08-01 17:37   ` Naoya Horiguchi
2014-08-09 23:12   ` Hugh Dickins
2014-08-09 23:12     ` Hugh Dickins
2014-08-12 18:55     ` Naoya Horiguchi
2014-08-12 18:55       ` Naoya Horiguchi
2014-08-01 21:53 ` [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd() Andrew Morton
2014-08-01 21:53   ` Andrew Morton
2014-08-01 21:58   ` Naoya Horiguchi
2014-08-01 21:58     ` Naoya Horiguchi
2014-08-04 15:50     ` [PATCH] mm/hugetlb: remove unused argument of follow_huge_(pmd|pud) Naoya Horiguchi
2014-08-04 15:50       ` Naoya Horiguchi
2014-08-04 15:29   ` [PATCH v2 1/3] mm/hugetlb: take refcount under page table lock in follow_huge_pmd() Naoya Horiguchi
2014-08-04 15:29     ` Naoya Horiguchi
2014-08-09 23:01 ` Hugh Dickins
2014-08-09 23:01   ` Hugh Dickins
2014-08-12 18:55   ` Naoya Horiguchi
2014-08-12 18:55     ` Naoya Horiguchi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.