* [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
@ 2010-10-14 20:56 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2010-10-14 20:56 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich

Take mm->page_table_lock while syncing the vmalloc region.  This prevents
a race with the Xen pagetable pin/unpin code, which expects that the
page_table_lock is already held.  If this race occurs, then Xen can see
an inconsistent page type (a page can either be read/write or a pagetable
page, and pin/unpin converts it between them), which will cause either
the pin or the set_p[gm]d to fail; either will crash the kernel.
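
Roughly, the race looks like this (an illustrative sketch, not a
captured trace; the right-hand side stands for any Xen pin/unpin of
this mm's pagetable):

  CPU 0: vmalloc_sync_all()        CPU 1: Xen pin/unpin
  -------------------------        --------------------
  spin_lock_irqsave(&pgd_lock)     spin_lock(&mm->page_table_lock)
  vmalloc_sync_one()               converts each pagetable page
    set_p[gm]d() writes into a     between read/write and
    page whose type is still       pagetable type
    mid-conversion                 ...

Holding mm->page_table_lock across the sync serializes the set_p[gm]d
against the type conversion, so Xen always sees a consistent type.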

vmalloc_sync_all() should be called rarely, so this extra use of
page_table_lock should not interfere with its normal users.

The mm pointer is stashed in the pgd page's index field, as that won't
be otherwise used for pgd pages.

Bug reported by Ian Campbell <ian.campbell@eu.citrix.com>
Derived from a patch by Jan Beulich <jbeulich@novell.com>

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785..422b363 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e..b7f9ae1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			int ret;
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
@@ -341,11 +350,19 @@ void vmalloc_sync_all(void)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee42..c70e57d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);




^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2010-10-14 20:56 ` Jeremy Fitzhardinge
@ 2010-10-15 17:07   ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2010-10-15 17:07 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Jan Beulich, Ian Campbell

 On 10/14/2010 01:56 PM, Jeremy Fitzhardinge wrote:
> Take mm->page_table_lock while syncing the vmalloc region.  This prevents
> a race with the Xen pagetable pin/unpin code, which expects that the
> page_table_lock is already held.  If this race occurs, then Xen can see
> an inconsistent page type (a page can either be read/write or a pagetable
> page, and pin/unpin converts it between them), which will cause either
> the pin or the set_p[gm]d to fail; either will crash the kernel.

I've merged this into tip/x86/mm, which had some conflicting changes
(and fixed up some whitespace issues in one of those changes):

The following changes since commit a416e9e1dde0fbcf20cda59df284cc0dcf2aadc4:

  x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation (2010-10-07 16:36:17 -0700)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git x86-mm

Jeremy Fitzhardinge (2):
      x86/mm: fix bogus whitespace in sync_global_pgds()
      x86: hold mm->page_table_lock while doing vmalloc_sync

 arch/x86/include/asm/pgtable.h |    2 +
 arch/x86/mm/fault.c            |   11 ++++++++-
 arch/x86/mm/init_64.c          |   51 ++++++++++++++++++++++-----------------
 arch/x86/mm/pgtable.c          |   20 +++++++++++++--
 4 files changed, 58 insertions(+), 26 deletions(-)

	J


^ permalink raw reply	[flat|nested] 68+ messages in thread

* [tip:x86/mm] x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  2010-10-15 17:07   ` Jeremy Fitzhardinge
  (?)
@ 2010-10-19 22:17   ` tip-bot for Jeremy Fitzhardinge
  2010-10-20 10:36     ` Borislav Petkov
  -1 siblings, 1 reply; 68+ messages in thread
From: tip-bot for Jeremy Fitzhardinge @ 2010-10-19 22:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ian.campbell, jbeulich, tglx, hpa,
	jeremy.fitzhardinge

Commit-ID:  617d34d9e5d8326ec8f188c616aa06ac59d083fe
Gitweb:     http://git.kernel.org/tip/617d34d9e5d8326ec8f188c616aa06ac59d083fe
Author:     Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
AuthorDate: Tue, 21 Sep 2010 12:01:51 -0700
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Tue, 19 Oct 2010 13:57:08 -0700

x86, mm: Hold mm->page_table_lock while doing vmalloc_sync

Take mm->page_table_lock while syncing the vmalloc region.  This prevents
a race with the Xen pagetable pin/unpin code, which expects that the
page_table_lock is already held.  If this race occurs, then Xen can see
an inconsistent page type (a page can either be read/write or a pagetable
page, and pin/unpin converts it between them), which will cause either
the pin or the set_p[gm]d to fail; either will crash the kernel.

vmalloc_sync_all() should be called rarely, so this extra use of
page_table_lock should not interfere with its normal users.

The mm pointer is stashed in the pgd page's index field, as that won't
be otherwise used for pgds.

Reported-by: Ian Campbell <ian.campbell@eu.citrix.com>
Originally-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4CB88A4C.1080305@goop.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/pgtable.h |    2 ++
 arch/x86/mm/fault.c            |   11 ++++++++++-
 arch/x86/mm/init_64.c          |    7 +++++++
 arch/x86/mm/pgtable.c          |   20 +++++++++++++++++---
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2d0a33b..ada823a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index caec229..6c27c39 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			if (!vmalloc_sync_one(page_address(page), address))
+			spinlock_t *pgt_lock;
+			int ret;
+
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
 				break;
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ad7c0f..4d323fb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee42..c70e57d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  2010-10-19 22:17   ` [tip:x86/mm] x86, mm: Hold " tip-bot for Jeremy Fitzhardinge
@ 2010-10-20 10:36     ` Borislav Petkov
  2010-10-20 19:31       ` [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all() tip-bot for tip-bot for Jeremy Fitzhardinge
  2010-10-20 19:58       ` tip-bot for Borislav Petkov
  0 siblings, 2 replies; 68+ messages in thread
From: Borislav Petkov @ 2010-10-20 10:36 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: linux-tip-commits, linux-kernel, hpa, mingo, ian.campbell,
	jbeulich, tglx, hpa

From: tip-bot for Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, Oct 19, 2010 at 10:17:37PM +0000

> Commit-ID:  617d34d9e5d8326ec8f188c616aa06ac59d083fe
> Gitweb:     http://git.kernel.org/tip/617d34d9e5d8326ec8f188c616aa06ac59d083fe
> Author:     Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> AuthorDate: Tue, 21 Sep 2010 12:01:51 -0700
> Committer:  H. Peter Anvin <hpa@linux.intel.com>
> CommitDate: Tue, 19 Oct 2010 13:57:08 -0700

--
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 Oct 2010 12:23:48 +0200
Subject: [PATCH] x86, mm: Fix build warning on 32-bit

arch/x86/mm/fault.c: In function 'vmalloc_sync_all':
arch/x86/mm/fault.c:238: warning: assignment makes integer from pointer without a cast

introduced by 617d34d9e5d8326ec8f188c616aa06ac59d083fe.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/mm/fault.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 85c2eb9..79b0b37 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -230,7 +230,7 @@ void vmalloc_sync_all(void)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
-			int ret;
+			pmd_t *ret;
 
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
-- 
1.7.3.1
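
(For context: vmalloc_sync_one() in the 32-bit arch/x86/mm/fault.c
returns a pointer rather than an int -- roughly

  static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address);

with NULL meaning "stop syncing" -- so storing its result in an int is
a pointer-to-integer conversion, hence the warning.)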


-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 10:36     ` Borislav Petkov
@ 2010-10-20 19:31       ` tip-bot for tip-bot for Jeremy Fitzhardinge
  2010-10-20 19:50         ` Borislav Petkov
  2010-10-20 19:58       ` tip-bot for Borislav Petkov
  1 sibling, 1 reply; 68+ messages in thread
From: tip-bot for tip-bot for Jeremy Fitzhardinge @ 2010-10-20 19:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, tglx, hpa, borislav.petkov,
	jeremy.fitzhardinge

Commit-ID:  bfac67208d6d2a84fe7980187d3d2f21424dde59
Gitweb:     http://git.kernel.org/tip/bfac67208d6d2a84fe7980187d3d2f21424dde59
Author:     tip-bot for Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
AuthorDate: Tue, 19 Oct 2010 22:17:37 +0000
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Wed, 20 Oct 2010 11:53:10 -0700

x86, mm: Fix incorrect data type in vmalloc_sync_all()

arch/x86/mm/fault.c: In function 'vmalloc_sync_all':
arch/x86/mm/fault.c:238: warning: assignment makes integer from pointer without a cast

introduced by 617d34d9e5d8326ec8f188c616aa06ac59d083fe.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20101020103642.GA3135@kryptos.osrc.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/fault.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6c27c39..0cdb8d4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -230,7 +230,7 @@ void vmalloc_sync_all(void)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
-			int ret;
+			pmd_t *ret;
 
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 19:31       ` [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all() tip-bot for tip-bot for Jeremy Fitzhardinge
@ 2010-10-20 19:50         ` Borislav Petkov
  2010-10-20 19:53           ` H. Peter Anvin
  0 siblings, 1 reply; 68+ messages in thread
From: Borislav Petkov @ 2010-10-20 19:50 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: mingo, linux-kernel, tglx, hpa, jeremy.fitzhardinge, linux-tip-commits

> Commit-ID:  bfac67208d6d2a84fe7980187d3d2f21424dde59
> Gitweb:     http://git.kernel.org/tip/bfac67208d6d2a84fe7980187d3d2f21424dde59
> Author:     tip-bot for Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

Looks like git (or your script) picked up the first From: in the mail as
author...

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 19:50         ` Borislav Petkov
@ 2010-10-20 19:53           ` H. Peter Anvin
  2010-10-20 20:10             ` Borislav Petkov
  2010-10-20 21:26             ` Ben Pfaff
  0 siblings, 2 replies; 68+ messages in thread
From: H. Peter Anvin @ 2010-10-20 19:53 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, mingo, linux-kernel, tglx, jeremy.fitzhardinge,
	linux-tip-commits

On 10/20/2010 12:50 PM, Borislav Petkov wrote:
>> Commit-ID:  bfac67208d6d2a84fe7980187d3d2f21424dde59
>> Gitweb:     http://git.kernel.org/tip/bfac67208d6d2a84fe7980187d3d2f21424dde59
>> Author:     tip-bot for Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> 
> Looks like git (or your script) picked up the first From: in the mail as
> author...
> 

Yes, your email started out:


From: tip-bot for Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, Oct 19, 2010 at 10:17:37PM +0000

> > Commit-ID:  617d34d9e5d8326ec8f188c616aa06ac59d083fe
> > Gitweb:
http://git.kernel.org/tip/617d34d9e5d8326ec8f188c616aa06ac59d083fe
> > Author:     Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> > AuthorDate: Tue, 21 Sep 2010 12:01:51 -0700
> > Committer:  H. Peter Anvin <hpa@linux.intel.com>
> > CommitDate: Tue, 19 Oct 2010 13:57:08 -0700
--
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 Oct 2010 12:23:48 +0200
Subject: [PATCH] x86, mm: Fix build warning on 32-bit

... and that confused the scripts.  The way git wants it, commentary
information should go *after* the "---" line.
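
For reference, the layout git-am likes looks roughly like this (a
sketch, with made-up file and author names):

  Subject: [PATCH] subsys: do something

  Real commit message text.

  Signed-off-by: A U Thor <author@example.com>
  ---
  Commentary for reviewers goes here; everything between the "---"
  and the diff is discarded when the patch is applied.

   some/file.c |    2 +-
   1 files changed, 1 insertions(+), 1 deletions(-)

  diff --git a/some/file.c b/some/file.c
  ...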

(This is a constant problem, by the way: people seem to want to prefix,
not suffix, their commentary; "cover letter" style.)  I suspect git
needs to change, but it's hard to know exactly how to change it.

	-hpa

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 10:36     ` Borislav Petkov
  2010-10-20 19:31       ` [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all() tip-bot for tip-bot for Jeremy Fitzhardinge
@ 2010-10-20 19:58       ` tip-bot for Borislav Petkov
  1 sibling, 0 replies; 68+ messages in thread
From: tip-bot for Borislav Petkov @ 2010-10-20 19:58 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, tglx, hpa, borislav.petkov

Commit-ID:  f01f7c56a1425b9749a99af821e1de334fb64d7e
Gitweb:     http://git.kernel.org/tip/f01f7c56a1425b9749a99af821e1de334fb64d7e
Author:     Borislav Petkov <borislav.petkov@amd.com>
AuthorDate: Tue, 19 Oct 2010 22:17:37 +0000
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Wed, 20 Oct 2010 12:54:04 -0700

x86, mm: Fix incorrect data type in vmalloc_sync_all()

arch/x86/mm/fault.c: In function 'vmalloc_sync_all':
arch/x86/mm/fault.c:238: warning: assignment makes integer from pointer without a cast

introduced by 617d34d9e5d8326ec8f188c616aa06ac59d083fe.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20101020103642.GA3135@kryptos.osrc.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/fault.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6c27c39..0cdb8d4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -230,7 +230,7 @@ void vmalloc_sync_all(void)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
-			int ret;
+			pmd_t *ret;
 
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 19:53           ` H. Peter Anvin
@ 2010-10-20 20:10             ` Borislav Petkov
  2010-10-20 20:13               ` H. Peter Anvin
  2010-10-20 21:26             ` Ben Pfaff
  1 sibling, 1 reply; 68+ messages in thread
From: Borislav Petkov @ 2010-10-20 20:10 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, H. Peter Anvin, mingo, linux-kernel, tglx,
	jeremy.fitzhardinge, linux-tip-commits

From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, Oct 20, 2010 at 12:53:43PM -0700

> (This is a constant problem, by the way: people seem to want to prefix,
> not suffix, their commentary; "cover letter" style.)  I suspect git
> needs to change, but it's hard to know exactly how to change it.

That's easy: remove the cover-letter style when inlining patches further
down the mail.

But yeah, I'm still trying to decide which mail attribution
format (set attribution="..." in mutt) is better/clearer:

From: foo
Date: bar

or

On <date>, <person> said:

Hmm...
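
(In muttrc terms the two candidates would be roughly:

  set attribution = "From: %a\nDate: %d\n"    # header style
  set attribution = "On %d, %n wrote:"        # close to the mutt default

with %a, %n and %d expanding to the author's address, name and date.)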

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 20:10             ` Borislav Petkov
@ 2010-10-20 20:13               ` H. Peter Anvin
  2010-10-20 22:11                 ` Borislav Petkov
  0 siblings, 1 reply; 68+ messages in thread
From: H. Peter Anvin @ 2010-10-20 20:13 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: H. Peter Anvin, mingo, linux-kernel, tglx, jeremy.fitzhardinge,
	linux-tip-commits

On 10/20/2010 01:10 PM, Borislav Petkov wrote:
> From: "H. Peter Anvin" <hpa@linux.intel.com>
> Date: Wed, Oct 20, 2010 at 12:53:43PM -0700
> 
>> (This is a constant problem, by the way: people seem to want to prefix,
>> not suffix, their commentary; "cover letter" style.)  I suspect git
>> needs to change, but it's hard to know exactly how to change it.
> 
> That's easy, remove the cover letter style when inlining patches further
> down the mail.

The problem is that right now, if there is text like this:

A
---
B

patch

A is retained, and B discarded.  When to discard A and when to discard B
is the hard part.

> But yeah, I'm still trying to decide for me which mail attribution
> format (set attribution="..." in mutt) is better/clearer:
> 
> From: foo
> Date: bar
> 
> or
> 
> On <date>, <person> said:
> 
> Hmm...
> 

Well, the latter can at least be edited out.

The former *really* gunks up git.

	-hpa


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 19:53           ` H. Peter Anvin
  2010-10-20 20:10             ` Borislav Petkov
@ 2010-10-20 21:26             ` Ben Pfaff
  1 sibling, 0 replies; 68+ messages in thread
From: Ben Pfaff @ 2010-10-20 21:26 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, H. Peter Anvin, mingo, linux-kernel, tglx,
	jeremy.fitzhardinge, linux-tip-commits

"H. Peter Anvin" <hpa@linux.intel.com> writes:

> ... and that confused the scripts.  The way git wants it, commentary
> information should go *after* the "---" line.
>
> (This is a constant problem, by the way: people seem to want to prefix,
> not suffix, their commentary; "cover letter" style.)  I suspect git
> needs to change, but it's hard to know exactly how to change it.

Recent Git has a "scissors" feature that can be used to insert a
cover letter above the commit message.
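
For example (a sketch; see the --scissors option in git-mailinfo(1)):

  Cover-letter commentary goes up here and is discarded.

  -- >8 --
  Subject: [PATCH] subsys: do something

  The real commit message starts below the scissors line.
  ---
  diff --git ...
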
-- 
Peter Seebach on managing engineers:
"It's like herding cats, only most of the engineers are already
 sick of laser pointers."

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all()
  2010-10-20 20:13               ` H. Peter Anvin
@ 2010-10-20 22:11                 ` Borislav Petkov
  0 siblings, 0 replies; 68+ messages in thread
From: Borislav Petkov @ 2010-10-20 22:11 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, H. Peter Anvin, mingo, linux-kernel, tglx,
	jeremy.fitzhardinge, linux-tip-commits

On Wed, Oct 20, 2010 at 04:13:44PM -0400, H. Peter Anvin wrote:
...

> > From: foo
> > Date: bar
> > 
> > or
> > 
> > On <date>, <person> said:
> > 
> > Hmm...
> > 
> 
> Well, the latter can at least be edited out.
> 
> The former *really* gunks up git.

This decides it then: fall back to the mutt default. It's not as if the
former is so much better that it's worth keeping it and disrupting git
workflows.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2010-10-14 20:56 ` Jeremy Fitzhardinge
@ 2010-10-21 21:06   ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2010-10-21 21:06 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich

 Ping?  Have you had any thoughts about possible x86-64 problems with this?

Thanks,
    J

On 10/14/2010 01:56 PM, Jeremy Fitzhardinge wrote:
>
> Take mm->page_table_lock while syncing the vmalloc region.  This prevents
> a race with the Xen pagetable pin/unpin code, which expects that the
> page_table_lock is already held.  If this race occurs, then Xen can see
> an inconsistent page type (a page can either be read/write or a pagetable
> page, and pin/unpin converts it between them), which will cause either
> the pin or the set_p[gm]d to fail; either will crash the kernel.
>
> vmalloc_sync_all() should be called rarely, so this extra use of
> page_table_lock should not interfere with its normal users.
>
> The mm pointer is stashed in the pgd page's index field, as that won't
> be otherwise used for pgd pages.
>
> Bug reported by Ian Campbell <ian.campbell@eu.citrix.com>
> Derived from a patch by Jan Beulich <jbeulich@novell.com>
>
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index a34c785..422b363 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
>  extern spinlock_t pgd_lock;
>  extern struct list_head pgd_list;
>  
> +extern struct mm_struct *pgd_page_get_mm(struct page *page);
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <asm/paravirt.h>
>  #else  /* !CONFIG_PARAVIRT */
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 4c4508e..b7f9ae1 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
>  
>  		spin_lock_irqsave(&pgd_lock, flags);
>  		list_for_each_entry(page, &pgd_list, lru) {
> -			if (!vmalloc_sync_one(page_address(page), address))
> +			spinlock_t *pgt_lock;
> +			int ret;
> +
> +			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
> +
> +			spin_lock(pgt_lock);
> +			ret = vmalloc_sync_one(page_address(page), address);
> +			spin_unlock(pgt_lock);
> +
> +			if (!ret)
>  				break;
>  		}
>  		spin_unlock_irqrestore(&pgd_lock, flags);
> @@ -341,11 +350,19 @@ void vmalloc_sync_all(void)
>  		spin_lock_irqsave(&pgd_lock, flags);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			pgd_t *pgd;
> +			spinlock_t *pgt_lock;
> +
>  			pgd = (pgd_t *)page_address(page) + pgd_index(address);
> +
> +			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
> +			spin_lock(pgt_lock);
> +
>  			if (pgd_none(*pgd))
>  				set_pgd(pgd, *pgd_ref);
>  			else
>  				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
> +
> +			spin_unlock(pgt_lock);
>  		}
>  		spin_unlock_irqrestore(&pgd_lock, flags);
>  	}
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> index 5c4ee42..c70e57d 100644
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
>  #define UNSHARED_PTRS_PER_PGD				\
>  	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
>  
> -static void pgd_ctor(pgd_t *pgd)
> +
> +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
> +{
> +	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
> +	virt_to_page(pgd)->index = (pgoff_t)mm;
> +}
> +
> +struct mm_struct *pgd_page_get_mm(struct page *page)
> +{
> +	return (struct mm_struct *)page->index;
> +}
> +
> +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
>  {
>  	/* If the pgd points to a shared pagetable level (either the
>  	   ptes in non-PAE, or shared PMD in PAE), then just copy the
> @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
>  	}
>  
>  	/* list required to sync kernel mapping updates */
> -	if (!SHARED_KERNEL_PMD)
> +	if (!SHARED_KERNEL_PMD) {
> +		pgd_set_mm(pgd, mm);
>  		pgd_list_add(pgd);
> +	}
>  }
>  
>  static void pgd_dtor(pgd_t *pgd)
> @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
>  	 */
>  	spin_lock_irqsave(&pgd_lock, flags);
>  
> -	pgd_ctor(pgd);
> +	pgd_ctor(mm, pgd);
>  	pgd_prepopulate_pmd(mm, pgd, pmds);
>  
>  	spin_unlock_irqrestore(&pgd_lock, flags);
>
>


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2010-10-21 21:06   ` Jeremy Fitzhardinge
@ 2010-10-21 21:26     ` H. Peter Anvin
  -1 siblings, 0 replies; 68+ messages in thread
From: H. Peter Anvin @ 2010-10-21 21:26 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich

On 10/21/2010 02:06 PM, Jeremy Fitzhardinge wrote:
>  Ping?  Have you had any thoughts about possible x86-64 problems with this?
> 
> Thanks,
>     J

I didn't find any, so I already committed your patch to tip:x86/mm two
days ago.

Now, it's worth noting that Linus has asked us to get rid of
vmalloc_sync_all() completely, which would be a good thing in the longer
term.

	-hpa

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2010-10-21 21:26     ` H. Peter Anvin
  (?)
@ 2010-10-21 21:34     ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2010-10-21 21:34 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich

 On 10/21/2010 02:26 PM, H. Peter Anvin wrote:
> On 10/21/2010 02:06 PM, Jeremy Fitzhardinge wrote:
>>  Ping?  Have you had any thoughts about possible x86-64 problems with this?
>>
>> Thanks,
>>     J
> I didn't find any, so I already committed your patch to tip:x86/mm two
> days ago.

Oh, thanks!  I didn't see the tip commit.

> Now, it's worth noting that Linus has asked us to get rid of
> vmalloc_sync_all() completely, which would be a good thing in the longer
> term.

Fine by me.

    J

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2010-10-21 21:06   ` Jeremy Fitzhardinge
  (?)
  (?)
@ 2011-02-03  2:48   ` Andrea Arcangeli
  2011-02-03 20:44       ` Jeremy Fitzhardinge
  2011-02-03 20:59     ` [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync Larry Woodman
  -1 siblings, 2 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-03  2:48 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman

Hello,

Larry (CC'ed) found a problem with the patch in the subject. When
USE_SPLIT_PTLOCKS is not defined (NR_CPUS == 2) it will deadlock in
ptep_clear_flush_notify in rmap.c, because it sends IPIs with the
page_table_lock already held while the other CPUs spin on the
page_table_lock with irqs disabled, so the IPI never runs. With
CONFIG_TRANSPARENT_HUGEPAGE=y this deadlock happens even with
USE_SPLIT_PTLOCKS defined, which made it visible, but it needs to be
fixed regardless (for NR_CPUS == 2).
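
Schematically (an illustrative interleaving; with USE_SPLIT_PTLOCKS
off, the rmap pte lock is mm->page_table_lock itself):

  CPU 0                              CPU 1
  -----                              -----
  spin_lock(&mm->page_table_lock)
                                     spin_lock_irqsave(&pgd_lock)
                                     spin_lock(&mm->page_table_lock)
                                       ... spins with irqs disabled
  ptep_clear_flush_notify()
    sends a TLB flush IPI to CPU 1,
    then waits forever: CPU 1 cannot
    service the IPI with irqs off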

I'd like to understand why the pgd_lock needs irqs disabled; it seems
too easy that I could just remove the _irqsave, doesn't it?

A pgd_free comment says it can run from irq. page_table_lock having to
be taken there is for Xen only, but other archs also use
spin_lock_irqsave(pgd_lock), so I guess it's either common code, or
it's superfluous and not another Xen special requirement.

If we could remove that _irqsave like below, it'd solve it... But
clearly something must be taking the pgd_lock from irq. (Using an
rwlock would also be possible as long as nobody takes it in write mode
during irq, but if it's pgd_free that really runs in irq, that would
need the write_lock, so it wouldn't be a solution.)

I'm trying this fix and the VM_BUG_ON never triggered yet.

In short: who takes the pgd_lock from an irq? (__mmdrop shouldn't but
maybe I'm overlooking some aio bit?)

======
Subject: fix pgd_lock deadlock

From: Andrea Arcangeli <aarcange@redhat.com>

It's forbidden to take the page_table_lock with the irq disabled or if there's
contention the IPIs (for tlb flushes) sent with the page_table_lock held will
never run leading to a deadlock.

Apparently nobody takes the pgd_lock from irq so the _irqsave can be removed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/mm/fault.c    |    7 ++++---
 arch/x86/mm/init_64.c  |    7 ++++---
 arch/x86/mm/pageattr.c |   21 +++++++++++----------
 arch/x86/mm/pgtable.c  |   10 ++++++----
 arch/x86/xen/mmu.c     |   10 ++++------
 5 files changed, 29 insertions(+), 26 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -230,14 +230,15 @@ void vmalloc_sync_all(void)
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
 
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		VM_BUG_ON(in_interrupt());
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
 			pmd_t *ret;
 
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
 			spin_lock(pgt_lock);
@@ -247,7 +248,7 @@ void vmalloc_sync_all(void)
 			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -105,18 +105,19 @@ void sync_global_pgds(unsigned long star
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		VM_BUG_ON(in_interrupt());
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
@@ -128,7 +129,7 @@ void sync_global_pgds(unsigned long star
 
 			spin_unlock(pgt_lock);
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,11 @@ static unsigned long direct_pages_count[
 
 void update_page_count(int level, unsigned long pages)
 {
-	unsigned long flags;
-
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 static void split_page_count(int level)
@@ -402,7 +401,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
@@ -411,7 +410,8 @@ try_preserve_large_page(pte_t *kpte, uns
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -506,14 +506,14 @@ try_preserve_large_page(pte_t *kpte, uns
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	unsigned long flags, pfn, pfninc = 1;
+	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
@@ -527,7 +527,8 @@ static int split_large_page(pte_t *kpte,
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -599,7 +600,7 @@ out_unlock:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return 0;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -126,9 +126,10 @@ static void pgd_dtor(pgd_t *pgd)
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -280,12 +281,13 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2011-02-03  2:48   ` Andrea Arcangeli
@ 2011-02-03 20:44       ` Jeremy Fitzhardinge
  2011-02-03 20:59     ` [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync Larry Woodman
  1 sibling, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2011-02-03 20:44 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman

On 02/02/2011 06:48 PM, Andrea Arcangeli wrote:
> Hello,
>
> Larry (CC'ed) found a problem with the patch in subject. When
> USE_SPLIT_PTLOCKS is not defined (NR_CPUS == 2) it will deadlock in
> ptep_clear_flush_notify in rmap.c because it's sending IPIs with the
> page_table_lock already held, and the other CPUs now spin on the
> page_table_lock with irq disabled, so the IPI never runs. With
> CONFIG_TRANSPARENT_HUGEPAGE=y this deadlock happens even with
> USE_SPLIT_PTLOCKS defined, so it becomes visible, but it needs to be
> fixed regardless (for NR_CPUS == 2).

What's "it" here?  Do you mean vmalloc_sync_all?  vmalloc_sync_one?
What's the callchain?

> I'd like to understand why the pgd_lock needs irq disabled, it sounds
> too easy that I can just remove the _irqsave, doesn't it?
>
> A pgd_free comment says it can run from irq. page_table_lock having to
> be taken there is for Xen only, but other archs also use
> spin_lock_irqsave(pgd_lock) so I guess it's either common code, or
> it's superfluous and not another Xen special requirement.

There's no special Xen requirement here.

> If we could remove that _irqsave like below it'd solve it... But
> clearly something must be taking the pgd_lock from irq. (using a
> rwlock would also be possible as long as nobody takes it in write mode
> during irq, but if it's pgd_free that really runs in irq, that would
> need the write_lock so it wouldn't be a solution).

mmdrop() can be called from interrupt context, but I don't know if it
will ever drop the last reference from interrupt, so maybe you can get
away with it.
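
For reference, only the final reference drop takes the slow path into
__mmdrop() and from there into pgd_free()/pgd_dtor() and pgd_lock; the
fast path, roughly as in include/linux/sched.h of that era, is just an
atomic decrement:

	static inline void mmdrop(struct mm_struct *mm)
	{
		/* only the last drop runs __mmdrop() and touches pgd_lock */
		if (unlikely(atomic_dec_and_test(&mm->mm_count)))
			__mmdrop(mm);
	}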

> I'm trying this fix and the VM_BUG_ON never triggered yet.
>
> In short: who takes the pgd_lock from an irq? (__mmdrop shouldn't but
> maybe I'm overlooking some aio bit?)
>
> ======
> Subject: fix pgd_lock deadlock
>
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for tlb flushes) sent with the page_table_lock held
> will never run, leading to a deadlock.
>
> Apparently nobody takes the pgd_lock from irq so the _irqsave can be removed.

I'm pretty sure this is OK from a Xen perspective.

> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> ---
>  arch/x86/mm/fault.c    |    7 ++++---
>  arch/x86/mm/init_64.c  |    7 ++++---
>  arch/x86/mm/pageattr.c |   21 +++++++++++----------
>  arch/x86/mm/pgtable.c  |   10 ++++++----
>  arch/x86/xen/mmu.c     |   10 ++++------
>  5 files changed, 29 insertions(+), 26 deletions(-)
>
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -230,14 +230,15 @@ void vmalloc_sync_all(void)
>  	     address >= TASK_SIZE && address < FIXADDR_TOP;
>  	     address += PMD_SIZE) {
>  
> -		unsigned long flags;
>  		struct page *page;
>  
> -		spin_lock_irqsave(&pgd_lock, flags);
> +		VM_BUG_ON(in_interrupt());
> +		spin_lock(&pgd_lock);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			spinlock_t *pgt_lock;
>  			pmd_t *ret;
>  
> +			/* the pgt_lock only for Xen */
>  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
>  
>  			spin_lock(pgt_lock);
> @@ -247,7 +248,7 @@ void vmalloc_sync_all(void)
>  			if (!ret)
>  				break;
>  		}
> -		spin_unlock_irqrestore(&pgd_lock, flags);
> +		spin_unlock(&pgd_lock, flags);

Urp.  Did this compile?

Thanks,
    J

>  	}
>  }
>  
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -105,18 +105,19 @@ void sync_global_pgds(unsigned long star
>  
>  	for (address = start; address <= end; address += PGDIR_SIZE) {
>  		const pgd_t *pgd_ref = pgd_offset_k(address);
> -		unsigned long flags;
>  		struct page *page;
>  
>  		if (pgd_none(*pgd_ref))
>  			continue;
>  
> -		spin_lock_irqsave(&pgd_lock, flags);
> +		VM_BUG_ON(in_interrupt());
> +		spin_lock(&pgd_lock);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			pgd_t *pgd;
>  			spinlock_t *pgt_lock;
>  
>  			pgd = (pgd_t *)page_address(page) + pgd_index(address);
> +			/* the pgt_lock only for Xen */
>  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
>  			spin_lock(pgt_lock);
>  
> @@ -128,7 +129,7 @@ void sync_global_pgds(unsigned long star
>  
>  			spin_unlock(pgt_lock);
>  		}
> -		spin_unlock_irqrestore(&pgd_lock, flags);
> +		spin_unlock(&pgd_lock);
>  	}
>  }
>  
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -57,12 +57,11 @@ static unsigned long direct_pages_count[
>  
>  void update_page_count(int level, unsigned long pages)
>  {
> -	unsigned long flags;
> -
>  	/* Protect against CPA */
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	VM_BUG_ON(in_interrupt());
> +	spin_lock(&pgd_lock);
>  	direct_pages_count[level] += pages;
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  }
>  
>  static void split_page_count(int level)
> @@ -402,7 +401,7 @@ static int
>  try_preserve_large_page(pte_t *kpte, unsigned long address,
>  			struct cpa_data *cpa)
>  {
> -	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
> +	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
>  	pte_t new_pte, old_pte, *tmp;
>  	pgprot_t old_prot, new_prot, req_prot;
>  	int i, do_split = 1;
> @@ -411,7 +410,8 @@ try_preserve_large_page(pte_t *kpte, uns
>  	if (cpa->force_split)
>  		return 1;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	VM_BUG_ON(in_interrupt());
> +	spin_lock(&pgd_lock);
>  	/*
>  	 * Check for races, another CPU might have split this page
>  	 * up already:
> @@ -506,14 +506,14 @@ try_preserve_large_page(pte_t *kpte, uns
>  	}
>  
>  out_unlock:
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  
>  	return do_split;
>  }
>  
>  static int split_large_page(pte_t *kpte, unsigned long address)
>  {
> -	unsigned long flags, pfn, pfninc = 1;
> +	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
>  	pte_t *pbase, *tmp;
>  	pgprot_t ref_prot;
> @@ -527,7 +527,8 @@ static int split_large_page(pte_t *kpte,
>  	if (!base)
>  		return -ENOMEM;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	VM_BUG_ON(in_interrupt());
> +	spin_lock(&pgd_lock);
>  	/*
>  	 * Check for races, another CPU might have split this page
>  	 * up for us already:
> @@ -599,7 +600,7 @@ out_unlock:
>  	 */
>  	if (base)
>  		__free_page(base);
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -126,9 +126,10 @@ static void pgd_dtor(pgd_t *pgd)
>  	if (SHARED_KERNEL_PMD)
>  		return;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	VM_BUG_ON(in_interrupt());
> +	spin_lock(&pgd_lock);
>  	pgd_list_del(pgd);
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  }
>  
>  /*
> @@ -280,12 +281,13 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
>  	 * respect to anything walking the pgd_list, so that they
>  	 * never see a partially populated pgd.
>  	 */
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	VM_BUG_ON(in_interrupt());
> +	spin_lock(&pgd_lock);
>  
>  	pgd_ctor(mm, pgd);
>  	pgd_prepopulate_pmd(mm, pgd, pmds);
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  
>  	return pgd;
>  
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
>   */
>  void xen_mm_pin_all(void)
>  {
> -	unsigned long flags;
>  	struct page *page;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	spin_lock(&pgd_lock);
>  
>  	list_for_each_entry(page, &pgd_list, lru) {
>  		if (!PagePinned(page)) {
> @@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
>  		}
>  	}
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  }
>  
>  /*
> @@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
>   */
>  void xen_mm_unpin_all(void)
>  {
> -	unsigned long flags;
>  	struct page *page;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	spin_lock(&pgd_lock);
>  
>  	list_for_each_entry(page, &pgd_list, lru) {
>  		if (PageSavePinned(page)) {
> @@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
>  		}
>  	}
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock(&pgd_lock);
>  }
>  
>  void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
>


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2011-02-03  2:48   ` Andrea Arcangeli
  2011-02-03 20:44       ` Jeremy Fitzhardinge
@ 2011-02-03 20:59     ` Larry Woodman
  1 sibling, 0 replies; 68+ messages in thread
From: Larry Woodman @ 2011-02-03 20:59 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich

On 02/02/2011 09:48 PM, Andrea Arcangeli wrote:
> Hello,
>
> Larry (CC'ed) found a problem with the patch in subject. When
> USE_SPLIT_PTLOCKS is not defined (NR_CPUS == 2) it will deadlock in
> ptep_clear_flush_notify in rmap.c because it's sending IPIs with the
> page_table_lock already held, and the other CPUs now spin on the
> page_table_lock with irq disabled, so the IPI never runs. With
> CONFIG_TRANSPARENT_HUGEPAGE=y this deadlock happens even with
> USE_SPLIT_PTLOCKS defined, so it becomes visible, but it needs to be
> fixed regardless (for NR_CPUS == 2).
>
> I'd like to understand why the pgd_lock needs irq disabled, it sounds
> too easy that I can just remove the _irqsave, doesn't it?
>
> A pgd_free comment says it can run from irq. page_table_lock having to
> be taken there is for Xen only, but other archs also use
> spin_lock_irqsave(pgd_lock) so I guess it's either common code, or
> it's superfluous and not another Xen special requirement.
>
> If we could remove that _irqsave like below it'd solve it... But
> clearly something must be taking the pgd_lock from irq. (using a
> rwlock would also be possible as long as nobody takes it in write mode
> during irq, but if it's pgd_free that really runs in irq, that would
> need the write_lock so it wouldn't be a solution).
>
> I'm trying this fix and the VM_BUG_ON never triggered yet.
>
> In short: who takes the pgd_lock from an irq? (__mmdrop shouldn't but
> maybe I'm overlooking some aio bit?)
>
>
These are the specifics:

The problem is with THP.  The page reclaim code calls page_referenced_one()
which takes the mm->page_table_lock on one CPU before sending an IPI to other
CPU(s):

On CPU1 we take the mm->page_table_lock, send IPIs and wait for a response:
page_referenced_one(...)
         if (unlikely(PageTransHuge(page))) {
                 pmd_t *pmd;

                 spin_lock(&mm->page_table_lock);
                 pmd = page_check_address_pmd(page, mm, address,
                                              PAGE_CHECK_ADDRESS_PMD_FLAG);
                 if (pmd && !pmd_trans_splitting(*pmd) &&
                     pmdp_clear_flush_young_notify(vma, address, pmd))
                         referenced++;
                 spin_unlock(&mm->page_table_lock);
         } else {


CPU2 can race in vmalloc_sync_all() because it disables interrupts (preventing
a response to the IPI from CPU1) and takes the pgd_lock, then spins on the
mm->page_table_lock, which is already held on CPU1.

                 spin_lock_irqsave(&pgd_lock, flags);
                 list_for_each_entry(page, &pgd_list, lru) {
                         pgd_t *pgd;
                         spinlock_t *pgt_lock;

                         pgd = (pgd_t *)page_address(page) + pgd_index(address);

                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
                         spin_lock(pgt_lock);


At this point the system is deadlocked.  The pmdp_clear_flush_young_notify
needs to do its PGD business with the page_table_lock held, then release that
lock before sending the IPIs to the other CPUs.
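
Side by side, the interleaving looks roughly like this (timeline for
illustration only):

	/*
	 * CPU1 (page reclaim)              CPU2 (vmalloc_sync_all)
	 *
	 * spin_lock(&mm->page_table_lock);
	 *                                  spin_lock_irqsave(&pgd_lock, flags);
	 * pmdp_clear_flush_young_notify()
	 *   sends the tlb-flush IPI and
	 *   waits for the ack ...          spin_lock(&mm->page_table_lock);
	 *                                    spins: CPU1 holds it
	 *   spins forever: CPU2 has irqs
	 *   disabled and never acks
	 */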



^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2011-02-03 20:44       ` Jeremy Fitzhardinge
  (?)
@ 2011-02-04  1:21       ` Andrea Arcangeli
  2011-02-04 21:27         ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-04  1:21 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman

On Thu, Feb 03, 2011 at 12:44:02PM -0800, Jeremy Fitzhardinge wrote:
> On 02/02/2011 06:48 PM, Andrea Arcangeli wrote:
> > Hello,
> >
> > Larry (CC'ed) found a problem with the patch in subject. When
> > USE_SPLIT_PTLOCKS is not defined (NR_CPUS == 2) it will deadlock in
> > ptep_clear_flush_notify in rmap.c because it's sending IPIs with the
> > page_table_lock already held, and the other CPUs now spin on the
> > page_table_lock with irq disabled, so the IPI never runs. With
> > CONFIG_TRANSPARENT_HUGEPAGE=y this deadlock happens even with
> > USE_SPLIT_PTLOCKS defined, so it becomes visible, but it needs to be
> > fixed regardless (for NR_CPUS == 2).
> 
> What's "it" here?  Do you mean vmalloc_sync_all?  vmalloc_sync_one?
> What's the callchain?

Larry just answered that. If something is unclear let me know. I
never reproduced it, but it can also happen without THP enabled; you
just need to set NR_CPUS to 2 during "make menuconfig".

> > spin_lock_irqsave(pgd_lock) so I guess it's either common code, or
> > it's superfluous and not another Xen special requirement.
> 
> There's no special Xen requirement here.

That was my thought too considering the other archs...

> mmdrop() can be called from interrupt context, but I don't know if it
> will ever drop the last reference from interrupt, so maybe you can get
> away with it.

Yes the issue is __mmdrop, so it'd be nice to figure out whether
__mmdrop can also run from irq (or only the mmdrop fast path, which
would be safe even without _irqsave).

Is this a Xen only thing? Or is mmdrop called from regular
Linux? Considering other archs also use _irqsave, I assume it's common
code calling mmdrop (otherwise it means they cut-and-pasted a Xen
dependency). This comment doesn't really tell me much.

static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	if (SHARED_KERNEL_PMD)
		return;

	VM_BUG_ON(in_interrupt());
	spin_lock(&pgd_lock);

This comment suggests that __mmdrop itself can be called from irq
context, not just mmdrop. But I haven't found where yet... Can you tell me?

> > @@ -247,7 +248,7 @@ void vmalloc_sync_all(void)
> >  			if (!ret)
> >  				break;
> >  		}
> > -		spin_unlock_irqrestore(&pgd_lock, flags);
> > +		spin_unlock(&pgd_lock, flags);
> 
> Urp.  Did this compile?

Yes it builds, and it also still runs fine (I've left it running since
I posted the email with no problems yet, but this may not be
reproducible, and we really need to know who calls __mmdrop from irq
context to tell). The above hunk is under CONFIG_X86_32 and I did a
64-bit build ;).
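
(Roughly, arch/x86/mm/fault.c carries two vmalloc_sync_all()
implementations selected at build time, and the broken hunk sits in the
32-bit one, which is why a 64-bit build never compiles it:)

	#ifdef CONFIG_X86_32
	void vmalloc_sync_all(void)	/* the hunk with the stray ",flags" */
	{
		...
	}
	#else /* CONFIG_X86_64 */
	void vmalloc_sync_all(void)	/* what a 64-bit build compiles */
	{
		...
	}
	#endif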

I'm not reposting a version that builds for 32bit x86 too until we
figure out the mmdrop thing...

Thanks,
Andrea

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2011-02-04  1:21       ` Andrea Arcangeli
@ 2011-02-04 21:27         ` Jeremy Fitzhardinge
  2011-02-07 23:20           ` Andrea Arcangeli
  0 siblings, 1 reply; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2011-02-04 21:27 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman

On 02/03/2011 05:21 PM, Andrea Arcangeli wrote:
> On Thu, Feb 03, 2011 at 12:44:02PM -0800, Jeremy Fitzhardinge wrote:
>> On 02/02/2011 06:48 PM, Andrea Arcangeli wrote:
>>> Hello,
>>>
>>> Larry (CC'ed) found a problem with the patch in subject. When
>>> USE_SPLIT_PTLOCKS is not defined (NR_CPUS == 2) it will deadlock in
>>> ptep_clear_flush_notify in rmap.c because it's sending IPIs with the
>>> page_table_lock already held, and the other CPUs now spin on the
>>> page_table_lock with irq disabled, so the IPI never runs. With
>>> CONFIG_TRANSPARENT_HUGEPAGE=y this deadlock happens even with
>>> USE_SPLIT_PTLOCKS defined, so it becomes visible, but it needs to be
>>> fixed regardless (for NR_CPUS == 2).
>> What's "it" here?  Do you mean vmalloc_sync_all?  vmalloc_sync_one?
>> What's the callchain?
> Larry just answered that. If something is unclear let me know. I
> never reproduced it, but it can also happen without THP enabled; you
> just need to set NR_CPUS to 2 during "make menuconfig".
>
>>> spin_lock_irqsave(pgd_lock) so I guess it's either common code, or
>>> it's superfluous and not another Xen special requirement.
>> There's no special Xen requirement here.
> That was my thought too considering the other archs...
>
>> mmdrop() can be called from interrupt context, but I don't know if it
>> will ever drop the last reference from interrupt, so maybe you can get
>> away with it.
> Yes the issue is __mmdrop, so it'd be nice to figure out whether
> __mmdrop can also run from irq (or only the mmdrop fast path, which
> would be safe even without _irqsave).
>
> Is this a Xen only thing? Or is mmdrop called from regular
> Linux? Considering other archs also use _irqsave, I assume it's common
> code calling mmdrop (otherwise it means they cut-and-pasted a Xen
> dependency). This comment doesn't really tell me much.

No, I don't think there's any xen-specific code which calls mmdrop (at
all, let alone in interrupt context).  Erm, but I'm not sure where it
does.  I had a thinko that "schedule" would be one of those places, but
calling that from interrupt context would cause much bigger problems :/
> static void pgd_dtor(pgd_t *pgd)
> {
> 	unsigned long flags; /* can be called from interrupt context */
>
> 	if (SHARED_KERNEL_PMD)
> 		return;
>
> 	VM_BUG_ON(in_interrupt());
> 	spin_lock(&pgd_lock);
>
> This comment suggests that __mmdrop itself can be called from irq
> context, not just mmdrop. But I haven't found where yet... Can you tell me?

No.  I don't think I wrote that comment.  It's possibly just some ancient
lore that could have been correct at one point, or perhaps it was never true.

>>> @@ -247,7 +248,7 @@ void vmalloc_sync_all(void)
>>>  			if (!ret)
>>>  				break;
>>>  		}
>>> -		spin_unlock_irqrestore(&pgd_lock, flags);
>>> +		spin_unlock(&pgd_lock, flags);
>> Urp.  Did this compile?
> Yes it builds

(spin_unlock() shouldn't take a "flags" arg.)


> I'm not reposting a version that builds for 32bit x86 too until we
> figure out the mmdrop thing...

Stick it in next and look for explosion reports?

    J


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
  2011-02-04 21:27         ` Jeremy Fitzhardinge
@ 2011-02-07 23:20           ` Andrea Arcangeli
  2011-02-15 19:07             ` [PATCH] fix pgd_lock deadlock Andrea Arcangeli
  0 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-07 23:20 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman

On Fri, Feb 04, 2011 at 01:27:33PM -0800, Jeremy Fitzhardinge wrote:
> No, I don't think there's any xen-specific code which calls mmdrop (at
> all, let alone in interrupt context).  Erm, but I'm not sure where it
> does.  I had a thinko that "schedule" would be one of those places, but
> calling that from interrupt context would cause much bigger problems :/
> > static void pgd_dtor(pgd_t *pgd)

I checked again and I don't see exactly where mmdrop or __mmdrop are
called from irq context.

> No.  I don't think I wrote that comment.  It's possibly just some ancient
> lore that could have been correct at one point, or perhaps it was never true.

I agree with that. But it'd be nice if more people could look into
that so we can at least remove the misleading comment.

Where else can the pgd_lock be taken from irq context? Can we fix the
deadlock with NR_CPUS < 4 with my patch (with the ",flags" removed from below)?


> 
> >>> @@ -247,7 +248,7 @@ void vmalloc_sync_all(void)
> >>>  			if (!ret)
> >>>  				break;
> >>>  		}
> >>> -		spin_unlock_irqrestore(&pgd_lock, flags);
> >>> +		spin_unlock(&pgd_lock, flags);
> >> Urp.  Did this compile?
> > Yes it builds
> 
> (spin_unlock() shouldn't take a "flags" arg.)
> 
> 
> > I'm not reposting a version that builds for 32bit x86 too until we
> > figure out the mmdrop thing...
> 
> Stick it in next and look for explosion reports?

I intended to correct that of course; I just meant it's no problem for
64-bit builds, which is why I didn't notice the build failure before
posting the patch. Clearly a 32-bit build would have failed ;).

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH] fix pgd_lock deadlock
  2011-02-07 23:20           ` Andrea Arcangeli
@ 2011-02-15 19:07             ` Andrea Arcangeli
  2011-02-15 19:26               ` Thomas Gleixner
  0 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-15 19:07 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

Hello,

Without this patch we can deadlock in the page_table_lock with NR_CPUS
< 4 or THP on; with this patch we hopefully won't deadlock in the
pgd_lock (if taken from irq). I can't see anything taking it from irq
(maybe aio? to check, I also tried the libaio testsuite with no apparent
VM_BUG_ON triggering), so unless somebody sees it, I think we should
apply it. I've been running for a while with this patch applied
without apparent problems. Other archs may follow suit if it's proven
that there's nothing taking the pgd_lock from irq.
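
For the record, in_interrupt() (include/linux/hardirq.h) is true in
hardirq, softirq and NMI context; roughly:

	#define irq_count()	(preempt_count() & \
				 (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))
	#define in_interrupt()	(irq_count())

so the VM_BUG_ON below should catch any irq-context taker, though not a
process-context caller that merely runs with irqs disabled.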

===
Subject: fix pgd_lock deadlock

From: Andrea Arcangeli <aarcange@redhat.com>

It's forbidden to take the page_table_lock with irqs disabled: if there's
contention, the IPIs (for tlb flushes) sent with the page_table_lock held
will never run, leading to a deadlock.

Apparently nobody takes the pgd_lock from irq so the _irqsave can be removed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/mm/fault.c    |    8 ++++----
 arch/x86/mm/init_64.c  |    7 ++++---
 arch/x86/mm/pageattr.c |   21 +++++++++++----------
 arch/x86/mm/pgtable.c  |   13 ++++++-------
 arch/x86/xen/mmu.c     |   10 ++++------
 5 files changed, 29 insertions(+), 30 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,15 +229,15 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		VM_BUG_ON(in_interrupt());
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
 			pmd_t *ret;
 
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
 			spin_lock(pgt_lock);
@@ -247,7 +247,7 @@ void vmalloc_sync_all(void)
 			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -105,18 +105,19 @@ void sync_global_pgds(unsigned long star
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		VM_BUG_ON(in_interrupt());
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
@@ -128,7 +129,7 @@ void sync_global_pgds(unsigned long star
 
 			spin_unlock(pgt_lock);
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,11 @@ static unsigned long direct_pages_count[
 
 void update_page_count(int level, unsigned long pages)
 {
-	unsigned long flags;
-
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 static void split_page_count(int level)
@@ -394,7 +393,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
@@ -403,7 +402,8 @@ try_preserve_large_page(pte_t *kpte, uns
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -498,14 +498,14 @@ try_preserve_large_page(pte_t *kpte, uns
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	unsigned long flags, pfn, pfninc = 1;
+	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
@@ -519,7 +519,8 @@ static int split_large_page(pte_t *kpte,
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -591,7 +592,7 @@ out_unlock:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return 0;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,13 @@ static void pgd_ctor(struct mm_struct *m
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -260,7 +259,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -280,12 +278,13 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	VM_BUG_ON(in_interrupt());
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 19:07             ` [PATCH] fix pgd_lock deadlock Andrea Arcangeli
@ 2011-02-15 19:26               ` Thomas Gleixner
  2011-02-15 19:31                 ` Larry Woodman
  2011-02-15 19:54                   ` Andrea Arcangeli
  0 siblings, 2 replies; 68+ messages in thread
From: Thomas Gleixner @ 2011-02-15 19:26 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, 15 Feb 2011, Andrea Arcangeli wrote:

> Hello,
> 
> Without this patch we can deadlock in the page_table_lock with NR_CPUS
> < 4 or THP on; with this patch we hopefully won't deadlock in the
> pgd_lock (if taken from irq). I can't see anything taking it from irq
> (maybe aio? to check, I also tried the libaio testsuite with no apparent
> VM_BUG_ON triggering), so unless somebody sees it, I think we should
> apply it. I've been running for a while with this patch applied
> without apparent problems. Other archs may follow suit if it's proven
> that there's nothing taking the pgd_lock from irq.
> 
> ===
> Subject: fix pgd_lock deadlock
> 
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for tlb flushes) sent with the page_table_lock held
> will never run, leading to a deadlock.

I really read this thing 5 times and still cannot make any sense of it.

You talk about page_table_lock and then fiddle with pgd_lock.

-ENOSENSE

	tglx

 

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 19:26               ` Thomas Gleixner
@ 2011-02-15 19:31                 ` Larry Woodman
  2011-02-15 19:54                   ` Andrea Arcangeli
  1 sibling, 0 replies; 68+ messages in thread
From: Larry Woodman @ 2011-02-15 19:31 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andrea Arcangeli, Jeremy Fitzhardinge, Xen-devel, Ian Campbell,
	the arch/x86 maintainers, Hugh Dickins,
	Linux Kernel Mailing List, Jan Beulich, Andi Kleen,
	H. Peter Anvin, Andrew Morton, Johannes Weiner


[-- Attachment #1.1: Type: text/plain, Size: 2840 bytes --]

On 02/15/2011 02:26 PM, Thomas Gleixner wrote:
> On Tue, 15 Feb 2011, Andrea Arcangeli wrote:
>
>> Hello,
>>
>> Without this patch we can deadlock in the page_table_lock with NR_CPUS
>> < 4 or THP on; with this patch we hopefully won't deadlock in the
>> pgd_lock (if taken from irq). I can't see anything taking it from irq
>> (maybe aio? to check, I also tried the libaio testsuite with no apparent
>> VM_BUG_ON triggering), so unless somebody sees it, I think we should
>> apply it. I've been running for a while with this patch applied
>> without apparent problems. Other archs may follow suit if it's proven
>> that there's nothing taking the pgd_lock from irq.
>>
>> ===
>> Subject: fix pgd_lock deadlock
>>
>> From: Andrea Arcangeli <aarcange@redhat.com>
>>
>> It's forbidden to take the page_table_lock with irqs disabled: if there's
>> contention, the IPIs (for tlb flushes) sent with the page_table_lock held
>> will never run, leading to a deadlock.
> I really read this thing 5 times and still cannot make any sense of it.
>
> You talk about page_table_lock and then fiddle with pgd_lock.
>
> -ENOSENSE
>
> 	tglx
>
>
I put this explanation in the Red Hat BZ; it says it all:


Larry Woodman <lwoodman@redhat.com> 2011-01-21 15:54:58 EST

The problem is with THP.  The page reclaim code calls page_referenced_one()
which takes the mm->page_table_lock on one CPU before sending an IPI to other
CPU(s):

On CPU1 we take the mm->page_table_lock, send IPIs and wait for a response:
page_referenced_one(...)
         if (unlikely(PageTransHuge(page))) {
                 pmd_t *pmd;

                 spin_lock(&mm->page_table_lock);
                 pmd = page_check_address_pmd(page, mm, address,
                                              PAGE_CHECK_ADDRESS_PMD_FLAG);
                 if (pmd && !pmd_trans_splitting(*pmd) &&
                     pmdp_clear_flush_young_notify(vma, address, pmd))
                         referenced++;
                 spin_unlock(&mm->page_table_lock);
         } else {


CPU2 can race in vmalloc_sync_all() because it disables interrupts (preventing
a response to the IPI from CPU1) and takes the pgd_lock, then spins on the
mm->page_table_lock, which is already held on CPU1.

                 spin_lock_irqsave(&pgd_lock, flags);
                 list_for_each_entry(page, &pgd_list, lru) {
                         pgd_t *pgd;
                         spinlock_t *pgt_lock;

                         pgd = (pgd_t *)page_address(page) + pgd_index(address);

                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
                         spin_lock(pgt_lock);


At this point the system is deadlocked.  The pmdp_clear_flush_young_notify
needs to do its PGD business with the page_table_lock held, then release that
lock before sending the IPIs to the other CPUs.


Larry



^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 19:26               ` Thomas Gleixner
@ 2011-02-15 19:54                   ` Andrea Arcangeli
  2011-02-15 19:54                   ` Andrea Arcangeli
  1 sibling, 0 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-15 19:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, Feb 15, 2011 at 08:26:51PM +0100, Thomas Gleixner wrote:
> On Tue, 15 Feb 2011, Andrea Arcangeli wrote:
> 
> > Hello,
> > 
> > Without this patch we can deadlock in the page_table_lock with NR_CPUS
> > < 4 or THP on; with this patch we hopefully won't deadlock in the
> > pgd_lock (if taken from irq). I can't see anything taking it from irq
> > (maybe aio? to check, I also tried the libaio testsuite with no apparent
> > VM_BUG_ON triggering), so unless somebody sees it, I think we should
> > apply it. I've been running for a while with this patch applied
> > without apparent problems. Other archs may follow suit if it's proven
> > that there's nothing taking the pgd_lock from irq.
> > 
> > ===
> > Subject: fix pgd_lock deadlock
> > 
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > It's forbidden to take the page_table_lock with irqs disabled: if there's
> > contention, the IPIs (for tlb flushes) sent with the page_table_lock held
> > will never run, leading to a deadlock.
> 
> I really read this thing 5 times and still cannot make any sense of it.
> 
> You talk about page_table_lock and then fiddle with pgd_lock.
> 
> -ENOSENSE

With NR_CPUS < 4, or with THP enabled, rmap.c will do
spin_lock(&mm->page_table_lock) (or pte_offset_map_lock, where the lock
is still mm->page_table_lock and not the per-page PT lock). Then it will
send IPIs to flush the tlb of the other CPUs.

But the other CPU is running vmalloc_sync_all, and it is trying to
take the page_table_lock with irq disabled. It will never take the
lock because the CPU waiting for the IPI delivery holds it. And it will
never run the IPI because it has irqs disabled.

Now the big question is whether anything takes the pgd_lock from
irqs. Normal testing could never reveal it, as even if it happens it
has only a slim chance of happening while the pgd_lock is already held
by normal kernel context. But the VM_BUG_ON(in_interrupt()) should
hopefully have revealed it already if it ever happened.

Clearly we could try to fix it in other ways, but still, if there's no
reason to do the _irqsave, applying my fix sounds like a good idea
anyway.
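
The NR_CPUS dependency comes from the pte_lockptr() fallback in
include/linux/mm.h, roughly:

	#if USE_SPLIT_PTLOCKS	/* NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS (4) */
	#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
	#else	/* everything collapses to the single per-mm lock */
	#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
	#endif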

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 19:54                   ` Andrea Arcangeli
@ 2011-02-15 20:05                     ` Thomas Gleixner
  -1 siblings, 0 replies; 68+ messages in thread
From: Thomas Gleixner @ 2011-02-15 20:05 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, 15 Feb 2011, Andrea Arcangeli wrote:
> On Tue, Feb 15, 2011 at 08:26:51PM +0100, Thomas Gleixner wrote:
> 
> With NR_CPUS < 4, or with THP enabled, rmap.c will do
> spin_lock(&mm->page_table_lock) (or pte_offset_map_lock, where the lock
> is still mm->page_table_lock and not the per-page PT lock). Then it will
> send IPIs to flush the tlb of the other CPUs.
> 
> But the other CPU is running vmalloc_sync_all, and it is trying to
> take the page_table_lock with irq disabled. It will never take the
> lock because the CPU waiting for the IPI delivery holds it. And it will
> never run the IPI because it has irqs disabled.

Ok, that makes sense :)
 
> Now the big question is whether anything takes the pgd_lock from
> irqs. Normal testing could never reveal it, as even if it happens it
> has only a slim chance of happening while the pgd_lock is already held
> by normal kernel context. But the VM_BUG_ON(in_interrupt()) should
> hopefully have revealed it already if it ever happened.
> 
> Clearly we could try to fix it in other ways, but still, if there's no
> reason to do the _irqsave, applying my fix sounds like a good idea
> anyway.

Did you try with DEBUG_PAGEALLOC, which calls into cpa quite a
lot?
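
With DEBUG_PAGEALLOC every page alloc/free is remapped through CPA, and
thus through pgd_lock; an abridged sketch of arch/x86/mm/pageattr.c from
that era:

	void kernel_map_pages(struct page *page, int numpages, int enable)
	{
		...
		if (enable)
			__set_pages_p(page, numpages);	/* -> change_page_attr_set_clr() */
		else
			__set_pages_np(page, numpages);	/* -> ... -> pgd_lock */
		__flush_tlb_all();
	}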

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 20:05                     ` Thomas Gleixner
@ 2011-02-15 20:26                       ` Thomas Gleixner
  -1 siblings, 0 replies; 68+ messages in thread
From: Thomas Gleixner @ 2011-02-15 20:26 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, 15 Feb 2011, Thomas Gleixner wrote:

> On Tue, 15 Feb 2011, Andrea Arcangeli wrote:
> > On Tue, Feb 15, 2011 at 08:26:51PM +0100, Thomas Gleixner wrote:
> > 
> > With NR_CPUS < 4, or with THP enabled, rmap.c will do
> > spin_lock(&mm->page_table_lock) (or pte_offset_map_lock, where the lock
> > is still mm->page_table_lock and not the per-page PT lock). Then it will
> > send IPIs to flush the tlb of the other CPUs.
> > 
> > But the other CPU is running vmalloc_sync_all, and it is trying to
> > take the page_table_lock with irq disabled. It will never take the
> > lock because the CPU waiting for the IPI delivery holds it. And it will
> > never run the IPI because it has irqs disabled.
> 
> Ok, that makes sense :)
>  
> > Now the big question is whether anything takes the pgd_lock from
> > irqs. Normal testing could never reveal it, as even if it happens it
> > has only a slim chance of happening while the pgd_lock is already held
> > by normal kernel context. But the VM_BUG_ON(in_interrupt()) should
> > hopefully have revealed it already if it ever happened.
> > 
> > Clearly we could try to fix it in other ways, but still, if there's no
> > reason to do the _irqsave, applying my fix sounds like a good idea
> > anyway.
> 
> Did you try with DEBUG_PAGEALLOC, which calls into cpa quite a
> lot?

Another thing. You check for in_interrupt(), but what makes sure that
the code which takes pgd_lock never runs with interrupts disabled,
except during early boot?
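
Concretely, a hypothetical caller like this would sail past the new
check while still being deadlock-prone:

	local_irq_disable();		/* process context, not irq context */
	VM_BUG_ON(in_interrupt());	/* passes: no hardirq/softirq count */
	spin_lock(&pgd_lock);
	...				/* an IPI sent to this CPU is never acked */
	spin_unlock(&pgd_lock);
	local_irq_enable();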

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 20:26                       ` Thomas Gleixner
  (?)
@ 2011-02-15 22:52                       ` Andrea Arcangeli
  2011-02-15 23:03                           ` Thomas Gleixner
  -1 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-15 22:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, Feb 15, 2011 at 09:26:35PM +0100, Thomas Gleixner wrote:
> Another thing. You check for in_interrupt(), but what makes sure that
> pgd_lock is never taken with interrupts disabled, except during early
> boot ?

It's perfectly fine to take pgd_lock with irqs disabled, as long as
you don't then try to take the page_table_lock as well. So that's not
a concern.

I removed _irqsave from all pgd_lock users, and I doubt there's any
code protected by pgd_lock that runs with irqs disabled, but if there
is, it's still ok, and it especially shouldn't have used _irqsave.

The only real issue to sort out here is whether pgd_lock is ever taken
from irq context. To me it looks like the in_interrupt() check should
trigger if it ever is, so it won't go unnoticed for long if this isn't
ok.
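
To make the deadlock concrete, here is a rough comment-only sketch of
the interleaving (a hypothetical two-CPU schedule, not literal kernel
code):

	/*
	 *  CPU 0 (rmap/THP path)           CPU 1 (vmalloc_sync_all)
	 *  ---------------------           ------------------------
	 *  spin_lock(&mm->page_table_lock);
	 *                                  spin_lock_irqsave(&pgd_lock, f);
	 *                                  spin_lock(&mm->page_table_lock);
	 *                                    ... spins with irqs off ...
	 *  flush_tlb_others(...);
	 *    ... waits for CPU 1 to ack the IPI ...
	 *
	 * CPU 1 never acks the IPI because its irqs are disabled, and it
	 * never gets the page_table_lock because CPU 0 holds it while
	 * waiting for the ack, so both CPUs hang.
	 */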

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 22:52                       ` Andrea Arcangeli
@ 2011-02-15 23:03                           ` Thomas Gleixner
  0 siblings, 0 replies; 68+ messages in thread
From: Thomas Gleixner @ 2011-02-15 23:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, 15 Feb 2011, Andrea Arcangeli wrote:

> On Tue, Feb 15, 2011 at 09:26:35PM +0100, Thomas Gleixner wrote:
> > Another thing. You check for in_interrupt(), but what makes sure that
> > pgd_lock is never taken with interrupts disabled, except during early
> > boot ?
> 
> It's perfectly fine to take pgd_lock with irqs disabled, as long as
> you don't then try to take the page_table_lock as well. So that's not
> a concern.
> 
> I removed _irqsave from all pgd_lock users, and I doubt there's any
> code protected by pgd_lock that runs with irqs disabled, but if there
> is, it's still ok, and it especially shouldn't have used _irqsave.
> 
> The only real issue to sort out here is whether pgd_lock is ever taken
> from irq context. To me it looks like the in_interrupt() check should
> trigger if it ever is, so it won't go unnoticed for long if this isn't
> ok.

I assume you run it with a lockdep enabled kernel as well, right ?

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 23:03                           ` Thomas Gleixner
  (?)
@ 2011-02-15 23:17                           ` Andrea Arcangeli
  2011-02-16  9:58                             ` Peter Zijlstra
  -1 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-15 23:17 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Wed, Feb 16, 2011 at 12:03:30AM +0100, Thomas Gleixner wrote:
> I assume you run it with a lockdep enabled kernel as well, right ?

Yes, I always run with lockdep and PROVE_LOCKING enabled on my test
box. I'm not sure how it's meant to trigger more bugs in this case;
the debug check that should be relevant here is DEBUG_VM, and that is
enabled too, of course. I didn't try DEBUG_PAGEALLOC yet.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 23:17                           ` Andrea Arcangeli
@ 2011-02-16  9:58                             ` Peter Zijlstra
  2011-02-16 10:15                                 ` Andrea Arcangeli
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2011-02-16  9:58 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Johannes Weiner, Hugh Dickins, Rik van Riel

On Wed, 2011-02-16 at 00:17 +0100, Andrea Arcangeli wrote:
> On Wed, Feb 16, 2011 at 12:03:30AM +0100, Thomas Gleixner wrote:
> > I assume you run it with a lockdep enabled kernel as well, right ?
> 
> Yes, I always run with lockdep and PROVE_LOCKING enabled on my test
> box. I'm not sure how it's meant to trigger more bugs in this case;
> the debug check that should be relevant here is DEBUG_VM, and that is
> enabled too, of course. I didn't try DEBUG_PAGEALLOC yet.

I think what Thomas tried to tell you is that your
VM_BUG_ON(in_interrupt()) is fully redundant if you have lockdep
enabled.

Lockdep will warn you if a !irqsave lock is taken from IRQ context,
since that is a clear inversion problem.
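
For reference, a minimal sketch of the pattern lockdep complains about
(illustrative only; demo_lock, demo_process_path() and
demo_irq_handler() are made-up names):

	#include <linux/spinlock.h>
	#include <linux/interrupt.h>

	static DEFINE_SPINLOCK(demo_lock);

	/* process context: takes the lock without disabling irqs */
	static void demo_process_path(void)
	{
		spin_lock(&demo_lock);
		/* ... critical section ... */
		spin_unlock(&demo_lock);
	}

	/* hardirq context: takes the same lock; once lockdep has seen
	 * both usages it reports an irq-safety inversion, because the
	 * irq could have interrupted the process-context holder */
	static irqreturn_t demo_irq_handler(int irq, void *dev_id)
	{
		spin_lock(&demo_lock);
		spin_unlock(&demo_lock);
		return IRQ_HANDLED;
	}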



^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16  9:58                             ` Peter Zijlstra
@ 2011-02-16 10:15                                 ` Andrea Arcangeli
  0 siblings, 0 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-16 10:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Johannes Weiner, Hugh Dickins, Rik van Riel

On Wed, Feb 16, 2011 at 10:58:14AM +0100, Peter Zijlstra wrote:
> On Wed, 2011-02-16 at 00:17 +0100, Andrea Arcangeli wrote:
> > On Wed, Feb 16, 2011 at 12:03:30AM +0100, Thomas Gleixner wrote:
> > > I assume you run it with a lockdep enabled kernel as well, right ?
> > 
> > Yes, I always run with lockdep and prove locking enabled on my test
> > box, not sure how it's meant to trigger more bugs in this case, the
> > debug check that should be relevant for this is DEBUG_VM and that is
> > enabled too of course. I didn't try DEBUG_PAGEALLOC yet.
> 
> I think what Thomas tried to tell you is that your
> VM_BUG_ON(in_interrupt()) is fully redundant if you have lockdep
> enabled.
> 
> Lockdep will warn you if a !irqsave lock is taken from IRQ context,
> since that is a clear inversion problem.

Ah, I get it now, but I prefer to have it in all my builds, and I
don't keep lockdep on for all builds (I do keep DEBUG_VM on). It's
still only debug code that no production system will ever deal with,
so it should be good to exercise it in more than one debug .config.
Considering it's very low overhead (pgd_lock is never taken in fast
paths), it's suitable for a VM_BUG_ON.
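
The trade-off works because VM_BUG_ON() compiles away entirely when
CONFIG_DEBUG_VM is off; paraphrasing its definition (a sketch, not the
exact upstream source):

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON(cond) BUG_ON(cond)
	#else
	#define VM_BUG_ON(cond) do { } while (0)
	#endif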

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 10:15                                 ` Andrea Arcangeli
@ 2011-02-16 10:28                                   ` Ingo Molnar
  -1 siblings, 0 replies; 68+ messages in thread
From: Ingo Molnar @ 2011-02-16 10:28 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Peter Zijlstra, Thomas Gleixner, Jeremy Fitzhardinge,
	H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel


* Andrea Arcangeli <aarcange@redhat.com> wrote:

> On Wed, Feb 16, 2011 at 10:58:14AM +0100, Peter Zijlstra wrote:
> > On Wed, 2011-02-16 at 00:17 +0100, Andrea Arcangeli wrote:
> > > On Wed, Feb 16, 2011 at 12:03:30AM +0100, Thomas Gleixner wrote:
> > > > I assume you run it with a lockdep enabled kernel as well, right ?
> > > 
> > > Yes, I always run with lockdep and PROVE_LOCKING enabled on my test
> > > box. I'm not sure how it's meant to trigger more bugs in this case;
> > > the debug check that should be relevant here is DEBUG_VM, and that is
> > > enabled too, of course. I didn't try DEBUG_PAGEALLOC yet.
> > 
> > I think what Thomas tried to tell you is that your
> > VM_BUG_ON(in_interrupt()) is fully redundant if you have lockdep
> > enabled.
> > 
> > Lockdep will warn you if a !irqsave lock is taken from IRQ context,
> > since that is a clear inversion problem.
> 
> Ah, I get it now, but I prefer to have it in all my builds, and I
> don't keep lockdep on for all builds (I do keep DEBUG_VM on). It's
> still only debug code that no production system will ever deal with,
> so it should be good to exercise it in more than one debug .config.
> Considering it's very low overhead (pgd_lock is never taken in fast
> paths), it's suitable for a VM_BUG_ON.

The point both Thomas and Peter tried to make is that adding 7 instances
of redundant debug checks:

+		VM_BUG_ON(in_interrupt());
+		VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_interrupt());

to arch/x86/ is not acceptable, for the reasons stated. Please remove them
from the patch.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 10:28                                   ` Ingo Molnar
  (?)
@ 2011-02-16 14:49                                   ` Andrea Arcangeli
  2011-02-16 16:26                                       ` Rik van Riel
                                                       ` (2 more replies)
  -1 siblings, 3 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-16 14:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Thomas Gleixner, Jeremy Fitzhardinge,
	H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Wed, Feb 16, 2011 at 11:28:01AM +0100, Ingo Molnar wrote:
> The point both Thomas and Peter tried to point out to you, that adding 7 instances 
> of redundant debug checks:
> 
> +		VM_BUG_ON(in_interrupt());
> +		VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_interrupt());
> 
> to arch/x86/ is not acceptable, for the reasons stated. Please remove it from the 
> patch.

Ok, sorry, but whenever I'm asked if I tested my patch with lockdep, I
read it as a question about whether it was tested for other bugs, not
as a request to remove debug code that overlaps with a lockdep feature.

===
Subject: fix pgd_lock deadlock

From: Andrea Arcangeli <aarcange@redhat.com>

It's forbidden to take the page_table_lock with irqs disabled: if there's
contention, the IPIs (for TLB flushes) sent while the page_table_lock is held
will never run, leading to a deadlock.

Apparently nobody takes the pgd_lock from irq so the _irqsave can be removed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/mm/fault.c    |    7 +++----
 arch/x86/mm/init_64.c  |    6 +++---
 arch/x86/mm/pageattr.c |   18 ++++++++----------
 arch/x86/mm/pgtable.c  |   11 ++++-------
 arch/x86/xen/mmu.c     |   10 ++++------
 5 files changed, 22 insertions(+), 30 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,15 +229,14 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
 			pmd_t *ret;
 
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
 			spin_lock(pgt_lock);
@@ -247,7 +246,7 @@ void vmalloc_sync_all(void)
 			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -105,18 +105,18 @@ void sync_global_pgds(unsigned long star
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
@@ -128,7 +128,7 @@ void sync_global_pgds(unsigned long star
 
 			spin_unlock(pgt_lock);
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,10 @@ static unsigned long direct_pages_count[
 
 void update_page_count(int level, unsigned long pages)
 {
-	unsigned long flags;
-
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 static void split_page_count(int level)
@@ -394,7 +392,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
@@ -403,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, uns
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -498,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, uns
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	unsigned long flags, pfn, pfninc = 1;
+	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
@@ -519,7 +517,7 @@ static int split_large_page(pte_t *kpte,
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -591,7 +589,7 @@ out_unlock:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return 0;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *m
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -260,7 +258,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -280,12 +277,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 14:49                                   ` Andrea Arcangeli
@ 2011-02-16 16:26                                       ` Rik van Riel
  2011-02-16 20:15                                     ` Ingo Molnar
  2012-04-23  9:07                                     ` [2.6.32.y][PATCH] " Philipp Hahn
  2 siblings, 0 replies; 68+ messages in thread
From: Rik van Riel @ 2011-02-16 16:26 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner,
	Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins

On 02/16/2011 09:49 AM, Andrea Arcangeli wrote:

> Subject: fix pgd_lock deadlock
>
> From: Andrea Arcangeli<aarcange@redhat.com>
>
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for TLB flushes) sent while the page_table_lock is held
> will never run, leading to a deadlock.
>
> Apparently nobody takes the pgd_lock from irq so the _irqsave can be removed.
>
> Signed-off-by: Andrea Arcangeli<aarcange@redhat.com>


Acked-by: Rik van Riel <riel@redhat.com>

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-15 20:05                     ` Thomas Gleixner
  (?)
  (?)
@ 2011-02-16 18:33                     ` Andrea Arcangeli
  2011-02-16 21:34                       ` Konrad Rzeszutek Wilk
  2011-02-17 10:19                       ` Johannes Weiner
  -1 siblings, 2 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-16 18:33 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jeremy Fitzhardinge, H. Peter Anvin, the arch/x86 maintainers,
	Xen-devel, Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel

On Tue, Feb 15, 2011 at 09:05:20PM +0100, Thomas Gleixner wrote:
> Did you try with DEBUG_PAGEALLOC, which is calling into cpa quite a
> lot?

I tried DEBUG_PAGEALLOC and it seems to work fine (in addition to
lockdep); it doesn't trigger any debug check.

In addition to testing it (both the previous patch and the one below),
I looked into the code, and the free_pages calls into
pageattr->split_large_page apparently all happen at boot time.

One doubt remains: do we need change_page_attr to run from irqs (not
from DEBUG_PAGEALLOC though)? Is change_page_attr really sane to run
from irqs? I thought __flush_tlb_all was delivering IPIs (in that case
it also wouldn't have been safe in the first place to run with irqs
disabled), but of course the "__" version is local, so maybe it's safe
to run from interrupts too after all (I'd be amazed if anybody is
calling it from irq, if not even DEBUG_PAGEALLOC does). But with the
below patch it will remain safe from irq as far as the pgd_lock is
concerned.
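
To spell out the distinction being relied on (a sketch of my
understanding; the point is local flush versus IPI broadcast):

	__flush_tlb_all();	/* flushes only the local CPU's TLB: no
				 * IPIs, so it can run with irqs disabled */
	flush_tlb_all();	/* runs the flush on every CPU via
				 * on_each_cpu() -> IPIs, so it must not
				 * be called with irqs disabled */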

I think the previous patch was safe too, though; avoiding VM
manipulations from interrupts makes everything simpler. Normally only
GART drivers should call it, at init time, to avoid prefetching of
cachelines in the next 2M page with different (writeback) cache
attributes for pages physically aliased in the GART and mapped with a
different cache attribute; that init stuff happening from an interrupt
sounds weird. Anyway, I post the below patch too as an alternative
that still allows pageattr from irq.

With both patches the big dependency remains on __mmdrop not running
from irq. The alternative approach is to remove the page_table_lock
from vmalloc_sync_all (which is only needed by Xen paravirt guests
AFAIK) and solve that problem in a different way, but I don't even
know why they need it exactly; I tried not to impact that.
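
Before the patch itself, a rough sketch of the reader/writer rules the
rwlock conversion aims for (illustrative only; reader_example() and
writer_example() are made-up wrappers around the same calls the patch
uses):

	/* sketch: reader side, may run in irq context, irqs need not
	 * be disabled */
	static void reader_example(void)
	{
		struct page *page;

		read_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru)
			; /* walk the pgd_list */
		read_unlock(&pgd_lock);
	}

	/* sketch: writer side, regular kernel context only, must
	 * disable irqs before taking the lock */
	static void writer_example(pgd_t *pgd)
	{
		unsigned long flags;

		write_lock_irqsave(&pgd_lock, flags);
		pgd_list_add(pgd);
		write_unlock_irqrestore(&pgd_lock, flags);
	}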

===
Subject: fix pgd_lock deadlock

From: Andrea Arcangeli <aarcange@redhat.com>

It's forbidden to take the page_table_lock with irqs disabled: if there's
contention, the IPIs (for TLB flushes) sent while the page_table_lock is held
will never run, leading to a deadlock.

pageattr also seems to take advantage of the pgd_lock to serialize
split_large_page, which isn't necessary, so split that off to a separate
spinlock (a read_lock wouldn't be enough to serialize split_large_page). Then
we can use a read-write lock to allow pageattr to keep taking it from
interrupts, but without having to always clear interrupts for readers. Writers
are only safe from regular kernel context and must clear irqs before taking
it.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/include/asm/pgtable.h |    2 +-
 arch/x86/mm/fault.c            |   23 ++++++++++++++++++-----
 arch/x86/mm/init_64.c          |    6 +++---
 arch/x86/mm/pageattr.c         |   17 ++++++++++-------
 arch/x86/mm/pgtable.c          |   10 +++++-----
 arch/x86/xen/mmu.c             |   10 ++++------
 6 files changed, 41 insertions(+), 27 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -179,7 +179,21 @@ force_sig_info_fault(int si_signo, int s
 	force_sig_info(si_signo, &info, tsk);
 }
 
-DEFINE_SPINLOCK(pgd_lock);
+/*
+ * The pgd_lock only protects the pgd_list. It can be taken from
+ * interrupt context but only in read mode (there's no need to clear
+ * interrupts before taking it in read mode). Write mode can only be
+ * taken from regular kernel context (not from irqs) and it must
+ * disable irqs before taking it. This way it can still be taken in
+ * read mode from interrupt context (in case
+ * pageattr->split_large_page is ever called from interrupt context),
+ * but without forcing the pgd_list readers to clear interrupts before
+ * taking it (like vmalloc_sync_all() that then wants to take the
+ * page_table_lock while holding the pgd_lock, which can only be taken
+ * while irqs are left enabled to avoid deadlocking against the IPI
+ * delivery of an SMP TLB flush run with the page_table_lock held).
+ */
+DEFINE_RWLOCK(pgd_lock);
 LIST_HEAD(pgd_list);
 
 #ifdef CONFIG_X86_32
@@ -229,15 +243,14 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		read_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
 			pmd_t *ret;
 
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
 			spin_lock(pgt_lock);
@@ -247,7 +260,7 @@ void vmalloc_sync_all(void)
 			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		read_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -105,18 +105,18 @@ void sync_global_pgds(unsigned long star
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		read_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
@@ -128,7 +128,7 @@ void sync_global_pgds(unsigned long star
 
 			spin_unlock(pgt_lock);
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		read_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -47,6 +47,7 @@ struct cpa_data {
  * splitting a large page entry along with changing the attribute.
  */
 static DEFINE_SPINLOCK(cpa_lock);
+static DEFINE_SPINLOCK(pgattr_lock);
 
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
@@ -60,9 +61,9 @@ void update_page_count(int level, unsign
 	unsigned long flags;
 
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock_irqsave(&pgattr_lock, flags);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock_irqrestore(&pgattr_lock, flags);
 }
 
 static void split_page_count(int level)
@@ -376,6 +377,7 @@ static void __set_pmd_pte(pte_t *kpte, u
 	if (!SHARED_KERNEL_PMD) {
 		struct page *page;
 
+		read_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			pud_t *pud;
@@ -386,6 +388,7 @@ static void __set_pmd_pte(pte_t *kpte, u
 			pmd = pmd_offset(pud, address);
 			set_pte_atomic((pte_t *)pmd, pte);
 		}
+		read_unlock(&pgd_lock);
 	}
 #endif
 }
@@ -403,7 +406,7 @@ try_preserve_large_page(pte_t *kpte, uns
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock_irqsave(&pgattr_lock, flags);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -498,7 +501,7 @@ try_preserve_large_page(pte_t *kpte, uns
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock_irqrestore(&pgattr_lock, flags);
 
 	return do_split;
 }
@@ -519,7 +522,7 @@ static int split_large_page(pte_t *kpte,
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock_irqsave(&pgattr_lock, flags);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -587,11 +590,11 @@ static int split_large_page(pte_t *kpte,
 out_unlock:
 	/*
 	 * If we dropped out via the lookup_address check under
-	 * pgd_lock then stick the page back into the pool:
+	 * pgattr_lock then stick the page back into the pool:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock_irqrestore(&pgattr_lock, flags);
 
 	return 0;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,14 @@ static void pgd_ctor(struct mm_struct *m
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
+	unsigned long flags;
 
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	write_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	write_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /*
@@ -280,12 +280,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	write_lock_irqsave(&pgd_lock, flags);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	write_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	read_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	read_unlock(&pgd_lock);
 }
 
 /*
@@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	read_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	read_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -25,7 +25,7 @@
 extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
-extern spinlock_t pgd_lock;
+extern rwlock_t pgd_lock;
 extern struct list_head pgd_list;
 
 extern struct mm_struct *pgd_page_get_mm(struct page *page);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 14:49                                   ` Andrea Arcangeli
  2011-02-16 16:26                                       ` Rik van Riel
@ 2011-02-16 20:15                                     ` Ingo Molnar
  2012-04-23  9:07                                     ` [2.6.32.y][PATCH] " Philipp Hahn
  2 siblings, 0 replies; 68+ messages in thread
From: Ingo Molnar @ 2011-02-16 20:15 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Peter Zijlstra, Thomas Gleixner, Jeremy Fitzhardinge,
	H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Johannes Weiner,
	Hugh Dickins, Rik van Riel


* Andrea Arcangeli <aarcange@redhat.com> wrote:

> Ok, sorry, but whenever I'm asked if I tested my patch with lockdep, I read it
> as a question about whether it was tested for other bugs, not as a request to
> remove debug code that overlaps with a lockdep feature.

ah, ok. The thing is, if it was only a single VM_BUG_ON() then it wouldn't be a big
deal. Seven seems excessive - especially since we have that very check via lockdep.

Thanks for removing them!

	Ingo

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 18:33                     ` [PATCH] " Andrea Arcangeli
@ 2011-02-16 21:34                       ` Konrad Rzeszutek Wilk
  2011-02-17 10:19                       ` Johannes Weiner
  1 sibling, 0 replies; 68+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-02-16 21:34 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Johannes Weiner, Hugh Dickins, Rik van Riel

On Wed, Feb 16, 2011 at 07:33:04PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 15, 2011 at 09:05:20PM +0100, Thomas Gleixner wrote:
> > Did you try with DEBUG_PAGEALLOC, which is calling into cpa quite a
> > lot?
> 
> I tried DEBUG_PAGEALLOC and it seems to work fine (in addition to
> lockdep); it doesn't trigger any debug check.
> 
> In addition to testing it (both the previous patch and the one below),
I checked this under Xen running as PV and Dom0 and had no trouble.

You can stick:
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

> I looked into the code, and the free_pages calls into
> pageattr->split_large_page apparently all happen at boot time.
> 
> One doubt remains: do we need change_page_attr to run from irqs (not
> from DEBUG_PAGEALLOC though)? Is change_page_attr really sane to run
> from irqs? I thought __flush_tlb_all was delivering IPIs (in that case
> it also wouldn't have been safe in the first place to run with irqs
> disabled), but of course the "__" version is local, so maybe it's safe
> to run from interrupts too after all (I'd be amazed if anybody is
> calling it from irq, if not even DEBUG_PAGEALLOC does). But with the
> below patch it will remain safe from irq as far as the pgd_lock is
> concerned.
> 
> I think the previous patch was safe too, though; avoiding VM
> manipulations from interrupts makes everything simpler. Normally only
> GART drivers should call it, at init time, to avoid prefetching of
> cachelines in the next 2M page with different (writeback) cache
> attributes for pages physically aliased in the GART and mapped with a
> different cache attribute; that init stuff happening from an interrupt
> sounds weird. Anyway, I post the below patch too as an alternative
> that still allows pageattr from irq.
> 
> With both patches the big dependency remains on __mmdrop not running
> from irq. The alternative approach is to remove the page_table_lock
> from vmalloc_sync_all (which is only needed by Xen paravirt guests
> AFAIK) and solve that problem in a different way, but I don't even
> know why they need it exactly; I tried not to impact that.
> 
> ===
> Subject: fix pgd_lock deadlock
> 
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for TLB flushes) sent while the page_table_lock is held
> will never run, leading to a deadlock.
> 
> pageattr also seems to take advantage of the pgd_lock to serialize
> split_large_page, which isn't necessary, so split that off to a separate
> spinlock (a read_lock wouldn't be enough to serialize split_large_page). Then
> we can use a read-write lock to allow pageattr to keep taking it from
> interrupts, but without having to always clear interrupts for readers. Writers
> are only safe from regular kernel context and must clear irqs before taking
> it.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> ---
>  arch/x86/include/asm/pgtable.h |    2 +-
>  arch/x86/mm/fault.c            |   23 ++++++++++++++++++-----
>  arch/x86/mm/init_64.c          |    6 +++---
>  arch/x86/mm/pageattr.c         |   17 ++++++++++-------
>  arch/x86/mm/pgtable.c          |   10 +++++-----
>  arch/x86/xen/mmu.c             |   10 ++++------
>  6 files changed, 41 insertions(+), 27 deletions(-)
> 
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -179,7 +179,21 @@ force_sig_info_fault(int si_signo, int s
>  	force_sig_info(si_signo, &info, tsk);
>  }
>  
> -DEFINE_SPINLOCK(pgd_lock);
> +/*
> + * The pgd_lock only protects the pgd_list. It can be taken from
> + * interrupt context but only in read mode (there's no need to clear
> + * interrupts before taking it in read mode). Write mode can only be
> + * taken from regular kernel context (not from irqs) and it must
> + * disable irqs before taking it. This way it can still be taken in
> + * read mode from interrupt context (in case
> + * pageattr->split_large_page is ever called from interrupt context),
> + * but without forcing the pgd_list readers to clear interrupts before
> + * taking it (like vmalloc_sync_all() that then wants to take the
> + * page_table_lock while holding the pgd_lock, which can only be taken
> + * while irqs are left enabled to avoid deadlocking against the IPI
> + * delivery of an SMP TLB flush run with the page_table_lock held).
> + */
> +DEFINE_RWLOCK(pgd_lock);
>  LIST_HEAD(pgd_list);
>  
>  #ifdef CONFIG_X86_32
> @@ -229,15 +243,14 @@ void vmalloc_sync_all(void)
>  	for (address = VMALLOC_START & PMD_MASK;
>  	     address >= TASK_SIZE && address < FIXADDR_TOP;
>  	     address += PMD_SIZE) {
> -
> -		unsigned long flags;
>  		struct page *page;
>  
> -		spin_lock_irqsave(&pgd_lock, flags);
> +		read_lock(&pgd_lock);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			spinlock_t *pgt_lock;
>  			pmd_t *ret;
>  
> +			/* the pgt_lock only for Xen */
>  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
>  
>  			spin_lock(pgt_lock);
> @@ -247,7 +260,7 @@ void vmalloc_sync_all(void)
>  			if (!ret)
>  				break;
>  		}
> -		spin_unlock_irqrestore(&pgd_lock, flags);
> +		read_unlock(&pgd_lock);
>  	}
>  }
>  
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -105,18 +105,18 @@ void sync_global_pgds(unsigned long star
>  
>  	for (address = start; address <= end; address += PGDIR_SIZE) {
>  		const pgd_t *pgd_ref = pgd_offset_k(address);
> -		unsigned long flags;
>  		struct page *page;
>  
>  		if (pgd_none(*pgd_ref))
>  			continue;
>  
> -		spin_lock_irqsave(&pgd_lock, flags);
> +		read_lock(&pgd_lock);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			pgd_t *pgd;
>  			spinlock_t *pgt_lock;
>  
>  			pgd = (pgd_t *)page_address(page) + pgd_index(address);
> +			/* the pgt_lock only for Xen */
>  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
>  			spin_lock(pgt_lock);
>  
> @@ -128,7 +128,7 @@ void sync_global_pgds(unsigned long star
>  
>  			spin_unlock(pgt_lock);
>  		}
> -		spin_unlock_irqrestore(&pgd_lock, flags);
> +		read_unlock(&pgd_lock);
>  	}
>  }
>  
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -47,6 +47,7 @@ struct cpa_data {
>   * splitting a large page entry along with changing the attribute.
>   */
>  static DEFINE_SPINLOCK(cpa_lock);
> +static DEFINE_SPINLOCK(pgattr_lock);
>  
>  #define CPA_FLUSHTLB 1
>  #define CPA_ARRAY 2
> @@ -60,9 +61,9 @@ void update_page_count(int level, unsign
>  	unsigned long flags;
>  
>  	/* Protect against CPA */
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	spin_lock_irqsave(&pgattr_lock, flags);
>  	direct_pages_count[level] += pages;
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock_irqrestore(&pgattr_lock, flags);
>  }
>  
>  static void split_page_count(int level)
> @@ -376,6 +377,7 @@ static void __set_pmd_pte(pte_t *kpte, u
>  	if (!SHARED_KERNEL_PMD) {
>  		struct page *page;
>  
> +		read_lock(&pgd_lock);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			pgd_t *pgd;
>  			pud_t *pud;
> @@ -386,6 +388,7 @@ static void __set_pmd_pte(pte_t *kpte, u
>  			pmd = pmd_offset(pud, address);
>  			set_pte_atomic((pte_t *)pmd, pte);
>  		}
> +		read_unlock(&pgd_lock);
>  	}
>  #endif
>  }
> @@ -403,7 +406,7 @@ try_preserve_large_page(pte_t *kpte, uns
>  	if (cpa->force_split)
>  		return 1;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	spin_lock_irqsave(&pgattr_lock, flags);
>  	/*
>  	 * Check for races, another CPU might have split this page
>  	 * up already:
> @@ -498,7 +501,7 @@ try_preserve_large_page(pte_t *kpte, uns
>  	}
>  
>  out_unlock:
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock_irqrestore(&pgattr_lock, flags);
>  
>  	return do_split;
>  }
> @@ -519,7 +522,7 @@ static int split_large_page(pte_t *kpte,
>  	if (!base)
>  		return -ENOMEM;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	spin_lock_irqsave(&pgattr_lock, flags);
>  	/*
>  	 * Check for races, another CPU might have split this page
>  	 * up for us already:
> @@ -587,11 +590,11 @@ static int split_large_page(pte_t *kpte,
>  out_unlock:
>  	/*
>  	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> +	 * pgattr_lock then stick the page back into the pool:
>  	 */
>  	if (base)
>  		__free_page(base);
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	spin_unlock_irqrestore(&pgattr_lock, flags);
>  
>  	return 0;
>  }
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -121,14 +121,14 @@ static void pgd_ctor(struct mm_struct *m
>  
>  static void pgd_dtor(pgd_t *pgd)
>  {
> -	unsigned long flags; /* can be called from interrupt context */
> +	unsigned long flags;
>  
>  	if (SHARED_KERNEL_PMD)
>  		return;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	write_lock_irqsave(&pgd_lock, flags);
>  	pgd_list_del(pgd);
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	write_unlock_irqrestore(&pgd_lock, flags);
>  }
>  
>  /*
> @@ -280,12 +280,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
>  	 * respect to anything walking the pgd_list, so that they
>  	 * never see a partially populated pgd.
>  	 */
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	write_lock_irqsave(&pgd_lock, flags);
>  
>  	pgd_ctor(mm, pgd);
>  	pgd_prepopulate_pmd(mm, pgd, pmds);
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	write_unlock_irqrestore(&pgd_lock, flags);
>  
>  	return pgd;
>  
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -986,10 +986,9 @@ static void xen_pgd_pin(struct mm_struct
>   */
>  void xen_mm_pin_all(void)
>  {
> -	unsigned long flags;
>  	struct page *page;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	read_lock(&pgd_lock);
>  
>  	list_for_each_entry(page, &pgd_list, lru) {
>  		if (!PagePinned(page)) {
> @@ -998,7 +997,7 @@ void xen_mm_pin_all(void)
>  		}
>  	}
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	read_unlock(&pgd_lock);
>  }
>  
>  /*
> @@ -1099,10 +1098,9 @@ static void xen_pgd_unpin(struct mm_stru
>   */
>  void xen_mm_unpin_all(void)
>  {
> -	unsigned long flags;
>  	struct page *page;
>  
> -	spin_lock_irqsave(&pgd_lock, flags);
> +	read_lock(&pgd_lock);
>  
>  	list_for_each_entry(page, &pgd_list, lru) {
>  		if (PageSavePinned(page)) {
> @@ -1112,7 +1110,7 @@ void xen_mm_unpin_all(void)
>  		}
>  	}
>  
> -	spin_unlock_irqrestore(&pgd_lock, flags);
> +	read_unlock(&pgd_lock);
>  }
>  
>  void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -25,7 +25,7 @@
>  extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
>  #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
>  
> -extern spinlock_t pgd_lock;
> +extern rwlock_t pgd_lock;
>  extern struct list_head pgd_list;
>  
>  extern struct mm_struct *pgd_page_get_mm(struct page *page);

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-16 18:33                     ` [PATCH] " Andrea Arcangeli
  2011-02-16 21:34                       ` Konrad Rzeszutek Wilk
@ 2011-02-17 10:19                       ` Johannes Weiner
  2011-02-21 14:30                         ` Andrea Arcangeli
  2011-02-21 17:40                         ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 68+ messages in thread
From: Johannes Weiner @ 2011-02-17 10:19 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Hugh Dickins, Rik van Riel

On Wed, Feb 16, 2011 at 07:33:04PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 15, 2011 at 09:05:20PM +0100, Thomas Gleixner wrote:
> > Did you try with DEBUG_PAGEALLOC, which is calling into cpa quite a
> > lot?
> 
> I tried DEBUG_PAGEALLOC and it seems to work fine (in addition to
> lockdep); it doesn't trigger any debug check.
> 
> In addition to testing it (both the previous patch and the one below),
> I looked into the code, and the free_pages calls into
> pageattr->split_large_page apparently all happen at boot time.
> 
> One doubt remains: do we need change_page_attr to run from irqs (not
> from DEBUG_PAGEALLOC though)? Is change_page_attr really sane to run
> from irqs? I thought __flush_tlb_all was delivering IPIs (in that case
> it also wouldn't have been safe in the first place to run with irqs
> disabled), but of course the "__" version is local, so maybe it's safe
> to run from interrupts too after all (I'd be amazed if anybody is
> calling it from irq, if not even DEBUG_PAGEALLOC does). But with the
> below patch it will remain safe from irq as far as the pgd_lock is
> concerned.
> 
> I think the previous patch was safe too, though; avoiding VM
> manipulations from interrupts makes everything simpler. Normally only
> GART drivers should call it, at init time, to avoid prefetching of
> cachelines in the next 2M page with different (writeback) cache
> attributes for pages physically aliased in the GART and mapped with a
> different cache attribute; that init stuff happening from an interrupt
> sounds weird. Anyway, I post the below patch too as an alternative
> that still allows pageattr from irq.
> 
> With both patches the big dependency remains on __mmdrop not running
> from irq. The alternative approach is to remove the page_table_lock
> from vmalloc_sync_all (which is only needed by Xen paravirt guests
> AFAIK) and solve that problem in a different way, but I don't even
> know why they need it exactly; I tried not to impact that.

So Xen needs all page tables protected when pinning/unpinning and
extended page_table_lock to cover kernel range, which it does nowhere
else AFAICS.  But the places it extended are also taking the pgd_lock,
so I wonder if Xen could just take the pgd_lock itself in these paths
and we could revert page_table_lock back to cover user va only?
Jeremy, could this work?  Untested.

	Hannes
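
A comment-only sketch of the resulting lock nesting, as I read the
proposal (my summary, not part of the patch):

	/*
	 * pgd_lock (irqsave, outer) now covers:
	 *   - pgd_alloc()/pgd_dtor() pgd_list manipulation
	 *   - vmalloc_sync_all()/sync_global_pgds() walking pgd_list
	 *   - xen_pgd_pin()/xen_pgd_unpin() (added here)
	 *
	 * mm->page_table_lock goes back to covering user va only, so no
	 * irqs-disabled section nests it any more and the IPI deadlock
	 * cannot occur.
	 */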

---
 arch/x86/include/asm/pgtable.h |    2 --
 arch/x86/mm/fault.c            |   14 ++------------
 arch/x86/mm/init_64.c          |    6 ------
 arch/x86/mm/pgtable.c          |   20 +++-----------------
 arch/x86/xen/mmu.c             |    8 ++++++++
 5 files changed, 13 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c8..8c0335a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,8 +28,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
-extern struct mm_struct *pgd_page_get_mm(struct page *page);
-
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb..5da4155 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -234,19 +234,9 @@ void vmalloc_sync_all(void)
 		struct page *page;
 
 		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			spinlock_t *pgt_lock;
-			pmd_t *ret;
-
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
-			spin_lock(pgt_lock);
-			ret = vmalloc_sync_one(page_address(page), address);
-			spin_unlock(pgt_lock);
-
-			if (!ret)
+		list_for_each_entry(page, &pgd_list, lru)
+			if (!vmalloc_sync_one(page_address(page), address))
 				break;
-		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a5929..9332f21 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -114,19 +114,13 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
-			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-			spin_lock(pgt_lock);
-
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
-
-			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 500242d..72107ab 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,19 +87,7 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-
-static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
-{
-	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
-	virt_to_page(pgd)->index = (pgoff_t)mm;
-}
-
-struct mm_struct *pgd_page_get_mm(struct page *page)
-{
-	return (struct mm_struct *)page->index;
-}
-
-static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_ctor(pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -113,10 +101,8 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD) {
-		pgd_set_mm(pgd, mm);
+	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -282,7 +268,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(mm, pgd);
+	pgd_ctor(pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e22810..97fbfce 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1021,7 +1021,11 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 
 static void xen_pgd_pin(struct mm_struct *mm)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&pgd_lock, flags);
 	__xen_pgd_pin(mm, mm->pgd);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /*
@@ -1140,7 +1144,11 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 
 static void xen_pgd_unpin(struct mm_struct *mm)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&pgd_lock, flags);
 	__xen_pgd_unpin(mm, mm->pgd);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-17 10:19                       ` Johannes Weiner
@ 2011-02-21 14:30                         ` Andrea Arcangeli
  2011-02-21 14:53                           ` Johannes Weiner
  2011-02-21 17:40                         ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-21 14:30 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Hugh Dickins, Rik van Riel

On Thu, Feb 17, 2011 at 11:19:41AM +0100, Johannes Weiner wrote:
> So Xen needs all page tables protected when pinning/unpinning and
> extended page_table_lock to cover kernel range, which it does nowhere
> else AFAICS.  But the places it extended are also taking the pgd_lock,
> so I wonder if Xen could just take the pgd_lock itself in these paths
> and we could revert page_table_lock back to cover user va only?
> Jeremy, could this work?  Untested.

If this works for Xen, I definitely prefer this.

Still, there's no point insisting on _irqsave if nothing takes the
pgd_lock from irq, so my patch should probably be applied anyway;
otherwise it's confusing, and there's even a comment saying pgd_dtor is
called from irq, which should be corrected if that's not the case. But
then it becomes a cleanup, not a fix.
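
In diff form the cleanup would be roughly (a sketch only, repeated at
each pgd_lock site, with the then-unused 'flags' locals dropped):

	-	spin_lock_irqsave(&pgd_lock, flags);
	+	spin_lock(&pgd_lock);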

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-21 14:30                         ` Andrea Arcangeli
@ 2011-02-21 14:53                           ` Johannes Weiner
  2011-02-22  7:48                             ` Jan Beulich
  0 siblings, 1 reply; 68+ messages in thread
From: Johannes Weiner @ 2011-02-21 14:53 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Thomas Gleixner, Jeremy Fitzhardinge, Konrad Rzeszutek Wilk,
	H. Peter Anvin, the arch/x86 maintainers, Xen-devel,
	Linux Kernel Mailing List, Ian Campbell, Jan Beulich,
	Larry Woodman, Andrew Morton, Andi Kleen, Hugh Dickins,
	Rik van Riel

On Mon, Feb 21, 2011 at 03:30:23PM +0100, Andrea Arcangeli wrote:
> On Thu, Feb 17, 2011 at 11:19:41AM +0100, Johannes Weiner wrote:
> > So Xen needs all page tables protected when pinning/unpinning and
> > extended page_table_lock to cover kernel range, which it does nowhere
> > else AFAICS.  But the places it extended are also taking the pgd_lock,
> > so I wonder if Xen could just take the pgd_lock itself in these paths
> > and we could revert page_table_lock back to cover user va only?
> > Jeremy, could this work?  Untested.
> 
> If this works for Xen, I definitely prefer this.

Below is the real submission, with changelog and sign-off and all
(except testing on Xen itself, sorry).  I moved the pgd_lock
acquisition in this version to make the lock ordering perfectly clear.
Xen people, could you have a look at this?

> Still, there's no point insisting on _irqsave if nothing takes the
> pgd_lock from irq, so my patch should probably be applied anyway;
> otherwise it's confusing, and there's even a comment saying pgd_dtor is
> called from irq, which should be corrected if that's not the case. But
> then it becomes a cleanup, not a fix.

Absolutely agreed, I like your clean-up but would prefer it not being
a requirement for this fix.

---
From: Johannes Weiner <hannes@cmpxchg.org>
Subject: [patch] x86, mm: fix mm->page_table_lock deadlock

'617d34d x86, mm: Hold mm->page_table_lock while doing vmalloc_sync'
made two paths grab mm->page_table_lock while having IRQs disabled.
This is not safe as rmap waits for IPI responses with this lock held
when clearing young bits and flushing the TLB.

What 617d34d wanted was to exclude any changes to the page tables,
including demand kernel page table propagation from init_mm, so that
Xen could pin and unpin page tables atomically.

Kernel page table propagation takes the pgd_lock to protect the list
of page directories, however, so instead of adding mm->page_table_lock
to this side, this patch has Xen exclude it by taking pgd_lock itself.

The pgd_lock will then nest within mm->page_table_lock to exclude rmap
before disabling IRQs, thus fixing the deadlock.
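
The resulting nesting, schematically (an illustrative sketch; it
mirrors the xen_activate_mm() hunk below):

	spin_lock(&next->page_table_lock);	/* outer, taken with irqs on */
	spin_lock_irqsave(&pgd_lock, flags);	/* inner, irqs off only here */
	xen_pgd_pin(next);
	spin_unlock_irqrestore(&pgd_lock, flags);
	spin_unlock(&next->page_table_lock);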

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 arch/x86/include/asm/pgtable.h |    2 --
 arch/x86/mm/fault.c            |   14 ++------------
 arch/x86/mm/init_64.c          |    6 ------
 arch/x86/mm/pgtable.c          |   20 +++-----------------
 arch/x86/xen/mmu.c             |   12 ++++++++++++
 5 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c8..8c0335a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,8 +28,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
-extern struct mm_struct *pgd_page_get_mm(struct page *page);
-
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb..5da4155 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -234,19 +234,9 @@ void vmalloc_sync_all(void)
 		struct page *page;
 
 		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			spinlock_t *pgt_lock;
-			pmd_t *ret;
-
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
-			spin_lock(pgt_lock);
-			ret = vmalloc_sync_one(page_address(page), address);
-			spin_unlock(pgt_lock);
-
-			if (!ret)
+		list_for_each_entry(page, &pgd_list, lru)
+			if (!vmalloc_sync_one(page_address(page), address))
 				break;
-		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a5929..9332f21 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -114,19 +114,13 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
-			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-			spin_lock(pgt_lock);
-
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
 			else
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
-
-			spin_unlock(pgt_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 500242d..72107ab 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,19 +87,7 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-
-static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
-{
-	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
-	virt_to_page(pgd)->index = (pgoff_t)mm;
-}
-
-struct mm_struct *pgd_page_get_mm(struct page *page)
-{
-	return (struct mm_struct *)page->index;
-}
-
-static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_ctor(pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -113,10 +101,8 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD) {
-		pgd_set_mm(pgd, mm);
+	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -282,7 +268,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(mm, pgd);
+	pgd_ctor(pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61..498e6ae 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1117,15 +1117,23 @@ void xen_mm_unpin_all(void)
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
+	unsigned long flags;
+
 	spin_lock(&next->page_table_lock);
+	spin_lock_irqsave(&pgd_lock, flags);
 	xen_pgd_pin(next);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 	spin_unlock(&next->page_table_lock);
 }
 
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+	unsigned long flags;
+
 	spin_lock(&mm->page_table_lock);
+	spin_lock_irqsave(&pgd_lock, flags);
 	xen_pgd_pin(mm);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 	spin_unlock(&mm->page_table_lock);
 }
 
@@ -1211,16 +1219,20 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
  */
 void xen_exit_mmap(struct mm_struct *mm)
 {
+	unsigned long flags;
+
 	get_cpu();		/* make sure we don't move around */
 	xen_drop_mm_ref(mm);
 	put_cpu();
 
 	spin_lock(&mm->page_table_lock);
+	spin_lock_irqsave(&pgd_lock, flags);
 
 	/* pgd may not be pinned in the error exit path of execve */
 	if (xen_page_pinned(mm->pgd))
 		xen_pgd_unpin(mm);
 
+	spin_unlock_irqrestore(&pgd_lock, flags);
 	spin_unlock(&mm->page_table_lock);
 }
 
-- 
1.7.4


^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-17 10:19                       ` Johannes Weiner
  2011-02-21 14:30                         ` Andrea Arcangeli
@ 2011-02-21 17:40                         ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2011-02-21 17:40 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrea Arcangeli, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Xen-devel, Linux Kernel Mailing List,
	Ian Campbell, Jan Beulich, Larry Woodman, Andrew Morton,
	Andi Kleen, Hugh Dickins, Rik van Riel

On 02/17/2011 02:19 AM, Johannes Weiner wrote:
> So Xen needs all page tables protected when pinning/unpinning and
> extended page_table_lock to cover kernel range, which it does nowhere
> else AFAICS.  But the places it extended are also taking the pgd_lock,
> so I wonder if Xen could just take the pgd_lock itself in these paths
> and we could revert page_table_lock back to cover user va only?
> Jeremy, could this work?  Untested.

Yes, this looks pretty plausible, but I need to go back and check what
the original bug was to make sure.  Oh, and test it I guess.

But xen_pgd_pin/unpin only operate on the usermode parts of the address
space (since the kernel part is shared and always pinned), so there
shouldn't be any contention there.

Hm, and I don't see why pin/unpin really care about pgd_lock either. 
They're called at well-defined places (fork/exec/exit) on a single pgd. 
pin/unpin_all are a different matter - since they walk the pgd list -
but they were taking the lock anyway.

Will need to think about this a bit.

    J

> 	Hannes
>
> ---
>  arch/x86/include/asm/pgtable.h |    2 --
>  arch/x86/mm/fault.c            |   14 ++------------
>  arch/x86/mm/init_64.c          |    6 ------
>  arch/x86/mm/pgtable.c          |   20 +++-----------------
>  arch/x86/xen/mmu.c             |    8 ++++++++
>  5 files changed, 13 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 18601c8..8c0335a 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -28,8 +28,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
>  extern spinlock_t pgd_lock;
>  extern struct list_head pgd_list;
>  
> -extern struct mm_struct *pgd_page_get_mm(struct page *page);
> -
>  #ifdef CONFIG_PARAVIRT
>  #include <asm/paravirt.h>
>  #else  /* !CONFIG_PARAVIRT */
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 7d90ceb..5da4155 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -234,19 +234,9 @@ void vmalloc_sync_all(void)
>  		struct page *page;
>  
>  		spin_lock_irqsave(&pgd_lock, flags);
> -		list_for_each_entry(page, &pgd_list, lru) {
> -			spinlock_t *pgt_lock;
> -			pmd_t *ret;
> -
> -			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
> -
> -			spin_lock(pgt_lock);
> -			ret = vmalloc_sync_one(page_address(page), address);
> -			spin_unlock(pgt_lock);
> -
> -			if (!ret)
> +		list_for_each_entry(page, &pgd_list, lru)
> +			if (!vmalloc_sync_one(page_address(page), address))
>  				break;
> -		}
>  		spin_unlock_irqrestore(&pgd_lock, flags);
>  	}
>  }
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 71a5929..9332f21 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -114,19 +114,13 @@ void sync_global_pgds(unsigned long start, unsigned long end)
>  		spin_lock_irqsave(&pgd_lock, flags);
>  		list_for_each_entry(page, &pgd_list, lru) {
>  			pgd_t *pgd;
> -			spinlock_t *pgt_lock;
>  
>  			pgd = (pgd_t *)page_address(page) + pgd_index(address);
> -			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
> -			spin_lock(pgt_lock);
> -
>  			if (pgd_none(*pgd))
>  				set_pgd(pgd, *pgd_ref);
>  			else
>  				BUG_ON(pgd_page_vaddr(*pgd)
>  				       != pgd_page_vaddr(*pgd_ref));
> -
> -			spin_unlock(pgt_lock);
>  		}
>  		spin_unlock_irqrestore(&pgd_lock, flags);
>  	}
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> index 500242d..72107ab 100644
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -87,19 +87,7 @@ static inline void pgd_list_del(pgd_t *pgd)
>  #define UNSHARED_PTRS_PER_PGD				\
>  	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
>  
> -
> -static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
> -{
> -	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
> -	virt_to_page(pgd)->index = (pgoff_t)mm;
> -}
> -
> -struct mm_struct *pgd_page_get_mm(struct page *page)
> -{
> -	return (struct mm_struct *)page->index;
> -}
> -
> -static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
> +static void pgd_ctor(pgd_t *pgd)
>  {
>  	/* If the pgd points to a shared pagetable level (either the
>  	   ptes in non-PAE, or shared PMD in PAE), then just copy the
> @@ -113,10 +101,8 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
>  	}
>  
>  	/* list required to sync kernel mapping updates */
> -	if (!SHARED_KERNEL_PMD) {
> -		pgd_set_mm(pgd, mm);
> +	if (!SHARED_KERNEL_PMD)
>  		pgd_list_add(pgd);
> -	}
>  }
>  
>  static void pgd_dtor(pgd_t *pgd)
> @@ -282,7 +268,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
>  	 */
>  	spin_lock_irqsave(&pgd_lock, flags);
>  
> -	pgd_ctor(mm, pgd);
> +	pgd_ctor(pgd);
>  	pgd_prepopulate_pmd(mm, pgd, pmds);
>  
>  	spin_unlock_irqrestore(&pgd_lock, flags);
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 5e22810..97fbfce 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1021,7 +1021,11 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
>  
>  static void xen_pgd_pin(struct mm_struct *mm)
>  {
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pgd_lock, flags);
>  	__xen_pgd_pin(mm, mm->pgd);
> +	spin_unlock_irqrestore(&pgd_lock, flags);
>  }
>  
>  /*
> @@ -1140,7 +1144,11 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
>  
>  static void xen_pgd_unpin(struct mm_struct *mm)
>  {
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pgd_lock, flags);
>  	__xen_pgd_unpin(mm, mm->pgd);
> +	spin_unlock_irqrestore(&pgd_lock, flags);
>  }
>  
>  /*
>


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-21 14:53                           ` Johannes Weiner
@ 2011-02-22  7:48                             ` Jan Beulich
  2011-02-22 13:49                               ` Andrea Arcangeli
  0 siblings, 1 reply; 68+ messages in thread
From: Jan Beulich @ 2011-02-22  7:48 UTC (permalink / raw)
  To: Andrea Arcangeli, Johannes Weiner
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Larry Woodman, Rik van Riel,
	Linux Kernel Mailing List, H. Peter Anvin

>>> On 21.02.11 at 15:53, Johannes Weiner <jweiner@redhat.com> wrote:
> On Mon, Feb 21, 2011 at 03:30:23PM +0100, Andrea Arcangeli wrote:
>> On Thu, Feb 17, 2011 at 11:19:41AM +0100, Johannes Weiner wrote:
>> > So Xen needs all page tables protected when pinning/unpinning and
>> > extended page_table_lock to cover kernel range, which it does nowhere
>> > else AFAICS.  But the places it extended are also taking the pgd_lock,
>> > so I wonder if Xen could just take the pgd_lock itself in these paths
>> > and we could revert page_table_lock back to cover user va only?
>> > Jeremy, could this work?  Untested.
>> 
>> If this works for Xen, I definitely prefer this.
> 
> Below is the real submission, with changelog and sign-off and all
> (except testing on Xen itself, sorry).  I moved the pgd_lock
> acquisition in this version to make the lock ordering perfectly clear.
> Xen people, could you have a look at this?

While I think that it would be correct, it doesn't look like a
reasonable fix to me: It effectively serializes process (address
space) construction and destruction.

A possible alternative would be to acquire the page table lock
in vmalloc_sync_all() only in the Xen case (perhaps by storing
NULL into page->index in pgd_set_mm() when not running on
Xen). This is utilizing the fact that there aren't (supposed to
be - for non-pvops this is definitely the case) any TLB flush IPIs
under Xen, and hence the race you're trying to fix doesn't
exist there (while non-Xen doesn't need the extra locking).
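
A sketch of what that could look like (assuming xen_pv_domain() from
<xen/xen.h>; a concrete version is posted later in the thread):

	static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
	{
		/* record the backlink only where the extra locking is needed */
		virt_to_page(pgd)->index = xen_pv_domain() ? (pgoff_t)mm : 0;
	}

with the sync paths then taking mm->page_table_lock only when
pgd_page_get_mm(page) returns non-NULL.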

Jan


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22  7:48                             ` Jan Beulich
@ 2011-02-22 13:49                               ` Andrea Arcangeli
  2011-02-22 14:22                                   ` Jan Beulich
  0 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-22 13:49 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Johannes Weiner, Ian Campbell, Andi Kleen, Hugh Dickins,
	Jeremy Fitzhardinge, the arch/x86 maintainers, Thomas Gleixner,
	Andrew Morton, Xen-devel, Konrad Rzeszutek Wilk, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
> A possible alternative would be to acquire the page table lock
> in vmalloc_sync_all() only in the Xen case (perhaps by storing
> NULL into page->index in pgd_set_mm() when not running on
> Xen). This is utilizing the fact that there aren't (supposed to
> be - for non-pvops this is definitely the case) any TLB flush IPIs
> under Xen, and hence the race you're trying to fix doesn't
> exist there (while non-Xen doesn't need the extra locking).

That's sure ok with me. Can we use a global runtime to check if the
guest is running under Xen paravirt, instead of passing that info
through page->something?

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22 13:49                               ` Andrea Arcangeli
@ 2011-02-22 14:22                                   ` Jan Beulich
  0 siblings, 0 replies; 68+ messages in thread
From: Jan Beulich @ 2011-02-22 14:22 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

>>> On 22.02.11 at 14:49, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
>> A possible alternative would be to acquire the page table lock
>> in vmalloc_sync_all() only in the Xen case (perhaps by storing
>> NULL into page->index in pgd_set_mm() when not running on
>> Xen). This is utilizing the fact that there aren't (supposed to
>> be - for non-pvops this is definitely the case) any TLB flush IPIs
>> under Xen, and hence the race you're trying to fix doesn't
>> exist there (while non-Xen doesn't need the extra locking).
> 
> That's sure ok with me. Can we use a global runtime to check if the
> guest is running under Xen paravirt, instead of passing that info
> through page->something?

If everyone's okay with putting a couple of "if (xen_pv_domain())"
into mm/fault.c - sure. I would have thought that this wouldn't be
liked, hence the suggestion to make this depend on seeing the
backlink be non-NULL.

Jan


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22 14:22                                   ` Jan Beulich
  (?)
@ 2011-02-22 14:34                                   ` Andrea Arcangeli
  2011-02-22 17:08                                       ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-22 14:34 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
> If everyone's okay with putting a couple of "if (xen_pv_domain())"
> into mm/fault.c - sure. I would have thought that this wouldn't be
> liked, hence the suggestion to make this depend on seeing the
> backlink be non-NULL.

I prefer the if (xen_pv_domain) so it also gets optimized away
at build time when CONFIG_XEN=n. I think it's also more self
explanatory to read.
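
With CONFIG_XEN=n, xen_pv_domain() evaluates to a constant 0 in
<xen/xen.h>, so a branch of the shape sketched here folds away at
compile time:

	if (xen_pv_domain())	/* constant 0 when CONFIG_XEN=n */
		spin_lock(&pgd_page_get_mm(page)->page_table_lock);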

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22 14:34                                   ` Andrea Arcangeli
@ 2011-02-22 17:08                                       ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 68+ messages in thread
From: Jeremy Fitzhardinge @ 2011-02-22 17:08 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jan Beulich, Ian Campbell, Andi Kleen, Hugh Dickins,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On 02/22/2011 06:34 AM, Andrea Arcangeli wrote:
> On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
>> If everyone's okay with putting a couple of "if (xen_pv_domain())"
>> into mm/fault.c - sure. I would have thought that this wouldn't be
>> liked, hence the suggestion to make this depend on seeing the
>> backlink be non-NULL.
> I prefer the if (xen_pv_domain) so it also gets optimized away
> at build time when CONFIG_XEN=n. I think it's also more self
> explanatory to read.

Eh, very not keen about that.  I'd only consider it as a last resort.

In practice CONFIG_XEN is always enabled in distros, so the conditional
would always be done at runtime.

    J


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22 17:08                                       ` Jeremy Fitzhardinge
@ 2011-02-22 17:13                                         ` Andrea Arcangeli
  -1 siblings, 0 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-22 17:13 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Jan Beulich, Ian Campbell, Andi Kleen, Hugh Dickins,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On Tue, Feb 22, 2011 at 09:08:54AM -0800, Jeremy Fitzhardinge wrote:
> On 02/22/2011 06:34 AM, Andrea Arcangeli wrote:
> > On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
> >> If everyone's okay with putting a couple of "if (xen_pv_domain())"
> >> into mm/fault.c - sure. I would have thought that this wouldn't be
> >> liked, hence the suggestion to make this depend on seeing the
> >> backlink be non-NULL.
> > I prefer the if (xen_pv_domain) so it also gets optimized away
> > at build time when CONFIG_XEN=n. I think it's also more self
> > explanatory to read.
> 
> Eh, very not keen about that.  I'd only consider it as a last resort.
> 
> In practice CONFIG_XEN is always enabled in distros, so the conditional
> would always be done at runtime.

That's ok, it's still better than setting a page field IMHO, as the
field would be set conditional on xen_pv_domain() anyway. I wouldn't
like to make things more complicated than they already are, and this
will make it more explicit why that spinlock is needed too. Unless we
foresee other hypervisors needing a generic functionality, we don't
need one.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-22 14:22                                   ` Jan Beulich
  (?)
  (?)
@ 2011-02-24  4:22                                   ` Andrea Arcangeli
  2011-02-24  8:23                                     ` Jan Beulich
  -1 siblings, 1 reply; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-24  4:22 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
> >>> On 22.02.11 at 14:49, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
> >> A possible alternative would be to acquire the page table lock
> >> in vmalloc_sync_all() only in the Xen case (perhaps by storing
> >> NULL into page->index in pgd_set_mm() when not running on
> >> Xen). This is utilizing the fact that there aren't (supposed to
> >> be - for non-pvops this is definitely the case) any TLB flush IPIs
> >> under Xen, and hence the race you're trying to fix doesn't
> >> exist there (while non-Xen doesn't need the extra locking).
> > 
> > That's sure ok with me. Can we use a global runtime to check if the
> > guest is running under Xen paravirt, instead of passing that info
> > through page->something?
> 
> If everyone's okay with putting a couple of "if (xen_pv_domain())"
> into mm/fault.c - sure. I would have thought that this wouldn't be
> liked, hence the suggestion to make this depend on seeing the
> backlink be non-NULL.

What about this? The page->private logic gets optimized away at
compile time with XEN=n.

The removal of _irqsave from pgd_lock, I'll delay it as it's no bugfix
anymore.

===
Subject: xen: stop taking the page_table_lock with irq disabled

From: Andrea Arcangeli <aarcange@redhat.com>

It's forbidden to take the page_table_lock with irqs disabled: if there's
contention, the IPIs (for tlb flushes) sent while the page_table_lock is held
will never run, leading to a deadlock.

Only Xen needs the page_table_lock, and Xen doesn't need IPI TLB flushes, so
the deadlock doesn't exist for Xen.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 arch/x86/include/asm/pgtable.h |    5 +++--
 arch/x86/mm/fault.c            |   10 ++++++----
 arch/x86/mm/init_64.c          |   10 ++++++----
 arch/x86/mm/pgtable.c          |    9 +++------
 4 files changed, 18 insertions(+), 16 deletions(-)

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -235,14 +235,16 @@ void vmalloc_sync_all(void)
 
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
-			spinlock_t *pgt_lock;
+			struct mm_struct *mm;
 			pmd_t *ret;
 
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			mm = pgd_page_get_mm(page);
 
-			spin_lock(pgt_lock);
+			if (mm)
+				spin_lock(&mm->page_table_lock);
 			ret = vmalloc_sync_one(page_address(page), address);
-			spin_unlock(pgt_lock);
+			if (mm)
+				spin_unlock(&mm->page_table_lock);
 
 			if (!ret)
 				break;
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -114,11 +114,12 @@ void sync_global_pgds(unsigned long star
 		spin_lock_irqsave(&pgd_lock, flags);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
-			spinlock_t *pgt_lock;
+			struct mm_struct *mm;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-			spin_lock(pgt_lock);
+			mm = pgd_page_get_mm(page);
+			if (mm)
+				spin_lock(&mm->page_table_lock);
 
 			if (pgd_none(*pgd))
 				set_pgd(pgd, *pgd_ref);
@@ -126,7 +127,8 @@ void sync_global_pgds(unsigned long star
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
 
-			spin_unlock(pgt_lock);
+			if (mm)
+				spin_unlock(&mm->page_table_lock);
 		}
 		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
+#include <xen/xen.h>
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
@@ -91,12 +92,8 @@ static inline void pgd_list_del(pgd_t *p
 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 {
 	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
-	virt_to_page(pgd)->index = (pgoff_t)mm;
-}
-
-struct mm_struct *pgd_page_get_mm(struct page *page)
-{
-	return (struct mm_struct *)page->index;
+	if (xen_pv_domain())
+		virt_to_page(pgd)->index = (pgoff_t)mm;
 }
 
 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,8 +28,6 @@ extern unsigned long empty_zero_page[PAG
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
-extern struct mm_struct *pgd_page_get_mm(struct page *page);
-
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -83,6 +81,9 @@ extern struct mm_struct *pgd_page_get_mm
 
 #endif	/* CONFIG_PARAVIRT */
 
+#define pgd_page_get_mm(__page) \
+	((struct mm_struct *)(xen_pv_domain() ? (__page)->index : 0))
+
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-24  4:22                                   ` Andrea Arcangeli
@ 2011-02-24  8:23                                     ` Jan Beulich
  2011-02-24 14:11                                       ` Andrea Arcangeli
  0 siblings, 1 reply; 68+ messages in thread
From: Jan Beulich @ 2011-02-24  8:23 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

>>> On 24.02.11 at 05:22, Andrea Arcangeli <aarcange@redhat.com> wrote:
> On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
>> >>> On 22.02.11 at 14:49, Andrea Arcangeli <aarcange@redhat.com> wrote:
>> > On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
>> >> A possible alternative would be to acquire the page table lock
>> >> in vmalloc_sync_all() only in the Xen case (perhaps by storing
>> >> NULL into page->index in pgd_set_mm() when not running on
>> >> Xen). This is utilizing the fact that there aren't (supposed to
>> >> be - for non-pvops this is definitely the case) any TLB flush IPIs
>> >> under Xen, and hence the race you're trying to fix doesn't
>> >> exist there (while non-Xen doesn't need the extra locking).
>> > 
>> > That's sure ok with me. Can we use a global runtime to check if the
>> > guest is running under Xen paravirt, instead of passing that info
>> > through page->something?
>> 
>> If everyone's okay with putting a couple of "if (xen_pv_domain())"
>> into mm/fault.c - sure. I would have thought that this wouldn't be
>> liked, hence the suggestion to make this depend on seeing the
>> backlink be non-NULL.
> 
> What about this? The page->private logic gets optimized away at
> compile time with XEN=n.
> 
> The removal of _irqsave from pgd_lock, I'll delay it as it's no bugfix
> anymore.
> 
> ===
> Subject: xen: stop taking the page_table_lock with irq disabled
> 
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for tlb flushes) sent while the page_table_lock is held
> will never run, leading to a deadlock.
> 
> Only Xen needs the page_table_lock, and Xen doesn't need IPI TLB flushes, so
> the deadlock doesn't exist for Xen.

Looks reasonable to me, except that the implementation no longer
matches the subject and description (the lock still gets taken with
IRQs disabled; it's just that, as far as we can tell so far, this
doesn't matter for Xen).

With the conditional on the reader side I also wonder whether
the conditional on the writer side is really a good thing to have,
considering that generally distro kernels are likely to have XEN
enabled.

Jan


^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] fix pgd_lock deadlock
  2011-02-24  8:23                                     ` Jan Beulich
@ 2011-02-24 14:11                                       ` Andrea Arcangeli
  0 siblings, 0 replies; 68+ messages in thread
From: Andrea Arcangeli @ 2011-02-24 14:11 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Ian Campbell, Andi Kleen, Hugh Dickins, Jeremy Fitzhardinge,
	the arch/x86 maintainers, Thomas Gleixner, Andrew Morton,
	Xen-devel, Konrad Rzeszutek Wilk, Johannes Weiner, Larry Woodman,
	Rik van Riel, Linux Kernel Mailing List, H. Peter Anvin

On Thu, Feb 24, 2011 at 08:23:37AM +0000, Jan Beulich wrote:
> >>> On 24.02.11 at 05:22, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > On Tue, Feb 22, 2011 at 02:22:53PM +0000, Jan Beulich wrote:
> >> >>> On 22.02.11 at 14:49, Andrea Arcangeli <aarcange@redhat.com> wrote:
> >> > On Tue, Feb 22, 2011 at 07:48:54AM +0000, Jan Beulich wrote:
> >> >> A possible alternative would be to acquire the page table lock
> >> >> in vmalloc_sync_all() only in the Xen case (perhaps by storing
> >> >> NULL into page->index in pgd_set_mm() when not running on
> >> >> Xen). This is utilizing the fact that there aren't (supposed to
> >> >> be - for non-pvops this is definitely the case) any TLB flush IPIs
> >> >> under Xen, and hence the race you're trying to fix doesn't
> >> >> exist there (while non-Xen doesn't need the extra locking).
> >> > 
> >> > That's sure ok with me. Can we use a global runtime to check if the
> >> > guest is running under Xen paravirt, instead of passing that info
> >> > through page->something?
> >> 
> >> If everyone's okay with putting a couple of "if (xen_pv_domain())"
> >> into mm/fault.c - sure. I would have thought that this wouldn't be
> >> liked, hence the suggestion to make this depend on seeing the
> >> backlink be non-NULL.
> > 
> > What about this? The page->private logic gets optimized away at
> > compile time with XEN=n.
> > 
> > The removal of _irqsave from pgd_lock, I'll delay it as it's no bugfix
> > anymore.
> > 
> > ===
> > Subject: xen: stop taking the page_table_lock with irq disabled
> > 
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > It's forbidden to take the page_table_lock with irqs disabled: if there's
> > contention, the IPIs (for tlb flushes) sent while the page_table_lock is
> > held will never run, leading to a deadlock.
> > 
> > Only Xen needs the page_table_lock, and Xen doesn't need IPI TLB flushes,
> > so the deadlock doesn't exist for Xen.
> 
> Looks reasonable to me, except for the implementation no longer
> matching subject and description (the lock still gets taken with
> IRQs disabled, just that - as far as we can tell so far - doesn't
> matter for Xen).
> 
> With the conditional on the reader side I also wonder whether
> the conditional on the writer side is really a good thing to have,
> considering that generally distro kernels are likely to have XEN
> enabled.

Well, there is no point in keeping the writer side functional if
nothing can read it. There aren't only distro kernels out there, and I
really like features to go away completely at build time when they're
not needed. Not because it's Xen (I recently did the same thing for
THP, for example, making sure every sign of it goes away with a =n
setting, with the exception perhaps of the put_page/get_page compound
logic, but at least the compound_lock goes away). It simply makes no
sense to do page->index = mm if nobody could possibly read it, so I
prefer this. Otherwise I wouldn't have needed a macro for the reader
side to work around the fact that xen.h isn't available in pgtable.h,
and I could have left it in pgtable.c (I didn't want to add it to
pgtable.h). It seems to build on i386, but it's better to re-verify,
because on older kernels I had to add a xen/xen.h include to
x86/mm/fault.c as well as on x86_64 (upstream fault.c seems not to
need it). I'll re-run some builds with XEN on and off, x86 32 and 64,
to be really sure all the includes are ok.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [2.6.32.y][PATCH] fix pgd_lock deadlock
  2011-02-16 14:49                                   ` Andrea Arcangeli
  2011-02-16 16:26                                       ` Rik van Riel
  2011-02-16 20:15                                     ` Ingo Molnar
@ 2012-04-23  9:07                                     ` Philipp Hahn
  2012-04-23 19:09                                       ` Willy Tarreau
  2 siblings, 1 reply; 68+ messages in thread
From: Philipp Hahn @ 2012-04-23  9:07 UTC (permalink / raw)
  To: stable
  Cc: Andrea Arcangeli, Ingo Molnar, Jeremy Fitzhardinge,
	Peter Zijlstra, the arch/x86 maintainers, Hugh Dickins,
	Linux Kernel Mailing List, Jan Beulich, Andi Kleen,
	Andrew Morton, Johannes Weiner, H. Peter Anvin, Thomas Gleixner,
	Larry Woodman, Rik van Riel, Konrad Rzeszutek Wilk,
	Linus Torvalds, 669335


[-- Attachment #1.1: Type: text/plain, Size: 2790 bytes --]

Hello,

On Wednesday 16 February 2011 15:49:47 Andrea Arcangeli wrote:
> Subject: fix pgd_lock deadlock
>
> From: Andrea Arcangeli <aarcange@redhat.com>
>
> It's forbidden to take the page_table_lock with irqs disabled: if there's
> contention, the IPIs (for tlb flushes) sent while the page_table_lock is
> held will never run, leading to a deadlock.
>
> Apparently nobody takes the pgd_lock from irq so the _irqsave can be
> removed.
>
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>

This patch (original commit ID for 2.6.38:
a79e53d85683c6dd9f99c90511028adc2043031f) needs to be back-ported to 2.6.32.x
as well.
I observed a deadlock problem when running a PAE-enabled Debian 2.6.32.46+
kernel with 6 VCPUs as a KVM guest on a (2.6.32, 3.2, 3.3) host kernel, which
showed the following behaviour:

1 VCPU is stuck in
  pgd_alloc() → pgd_prepopulate_pmd() → ... → flush_tlb_others_ipi()
while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
    cpu_relax();
(gdb) print f->flush_cpumask
$5 = {1}

while all other VCPUs are stuck in
  pgd_alloc() → spin_lock_irqsave(pgd_lock)
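
Schematically, the lockup looks like this (a hypothetical two-VCPU
trace, reconstructed from the stacks above):

	VCPU 0					VCPU 1
	spin_lock_irqsave(&pgd_lock, flags)
						spin_lock_irqsave(&pgd_lock, flags)
						  /* spins with irqs disabled */
	flush_tlb_others_ipi()
	  /* waits for VCPU 1 to service the
	     flush IPI; it never will, because
	     VCPU 1 spins with irqs off */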

I tracked it down to the commit
 2.6.39-rc1: 4981d01eada5354d81c8929d5b2836829ba3df7b
 2.6.32.34: ba456fd7ec1bdc31a4ad4a6bd02802dcaa730a33
 x86: Flush TLB if PGD entry is changed in i386 PAE mode
which when reverted made the bug disappear.

Comparing 3.2 to 2.6.32.34 showed that the 'pgd-deadlock' patch went into
2.6.38, that is, before the 'PAE correctness' patch, so the problem was
probably never observed in the main development branch.
But for 2.6.32 the 'pgd-deadlock' patch is still missing, so the 'PAE
correctness' patch made the problem worse with 2.6.32.

The patch was also back-ported to the openSUSE kernel
<http://kernel.opensuse.org/cgit/kernel-source/commit/?id=ac27c01aa880c65d17043ab87249c613ac4c3635>.
Since the patch didn't apply cleanly on the current Debian kernel, I had to
backport it for us and Debian. The patch is also available from our (German)
Bugzilla <https://forge.univention.org/bugzilla/show_bug.cgi?id=26661> or
from the Debian BTS at
<http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=669335>.

I have no easy test case, but running multiple parallel builds inside the VM 
normally triggers the bug within seconds to minutes. With the patch applied 
the VM survived a night building packages without any problem.

Signed-off-by: Philipp Hahn <hahn@univention.de>

Sincerely
Philipp
-- 
Philipp Hahn           Open Source Software Engineer      hahn@univention.de
Univention GmbH        be open.                       fon: +49 421 22 232- 0
Mary-Somerville-Str.1  D-28359 Bremen                 fax: +49 421 22 232-99
                                                   http://www.univention.de/

[-- Attachment #1.2: x86-mm-Fix-pgd_lock-deadlock.patch --]
[-- Type: text/x-diff, Size: 5882 bytes --]

It's forbidden to take the page_table_lock with the irq disabled
or if there's contention the IPIs (for tlb flushes) sent with
the page_table_lock held will never run leading to a deadlock.

Nobody takes the pgd_lock from irq context so the _irqsave can be
removed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <201102162345.p1GNjMjm021738@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Git-commit: a79e53d85683c6dd9f99c90511028adc2043031f
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -223,15 +223,14 @@ void vmalloc_sync_all(void)
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
 
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			if (!vmalloc_sync_one(page_address(page), address))
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
@@ -331,13 +330,12 @@ void vmalloc_sync_all(void)
 	     address += PGDIR_SIZE) {
 
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
@@ -346,7 +344,7 @@ void vmalloc_sync_all(void)
 			else
 				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -56,12 +56,10 @@ static unsigned long direct_pages_count[
 
 void update_page_count(int level, unsigned long pages)
 {
-	unsigned long flags;
-
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 static void split_page_count(int level)
@@ -354,7 +352,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot;
 	int i, do_split = 1;
@@ -363,7 +361,7 @@ try_preserve_large_page(pte_t *kpte, uns
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -458,14 +456,14 @@ try_preserve_large_page(pte_t *kpte, uns
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	unsigned long flags, pfn, pfninc = 1;
+	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
@@ -479,7 +477,7 @@ static int split_large_page(pte_t *kpte,
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -551,7 +549,7 @@ out_unlock:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return 0;
 }
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -110,14 +110,12 @@ static void pgd_ctor(pgd_t *pgd)
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -248,7 +246,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -268,12 +265,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -988,10 +988,9 @@ static void xen_pgd_pin(struct mm_struct
  */
 void xen_mm_pin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (!PagePinned(page)) {
@@ -1000,7 +999,7 @@ void xen_mm_pin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -1101,10 +1100,9 @@ static void xen_pgd_unpin(struct mm_stru
  */
 void xen_mm_unpin_all(void)
 {
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
@@ -1114,7 +1112,7 @@ void xen_mm_unpin_all(void)
 		}
 	}
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [2.6.32.y][PATCH] fix pgd_lock deadlock
  2012-04-23  9:07                                     ` [2.6.32.y][PATCH] " Philipp Hahn
@ 2012-04-23 19:09                                       ` Willy Tarreau
  0 siblings, 0 replies; 68+ messages in thread
From: Willy Tarreau @ 2012-04-23 19:09 UTC (permalink / raw)
  To: Philipp Hahn
  Cc: stable, Andrea Arcangeli, Ingo Molnar, Jeremy Fitzhardinge,
	Peter Zijlstra, the arch/x86 maintainers, Hugh Dickins,
	Linux Kernel Mailing List, Jan Beulich, Andi Kleen,
	Andrew Morton, Johannes Weiner, H. Peter Anvin, Thomas Gleixner,
	Larry Woodman, Rik van Riel, Konrad Rzeszutek Wilk,
	Linus Torvalds, 669335

Hello Philipp,

On Mon, Apr 23, 2012 at 11:07:53AM +0200, Philipp Hahn wrote:
> Hello,
> 
> On Wednesday 16 February 2011 15:49:47 Andrea Arcangeli wrote:
> > Subject: fix pgd_lock deadlock
> >
> > From: Andrea Arcangeli <aarcange@redhat.com>
> >
> > It's forbidden to take the page_table_lock with irqs disabled: if there's
> > contention, the IPIs (for tlb flushes) sent while the page_table_lock is
> > held will never run, leading to a deadlock.
> >
> > Apparently nobody takes the pgd_lock from irq so the _irqsave can be
> > removed.
> >
> > Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> 
> This patch (original commit ID for 2.6.38:
> a79e53d85683c6dd9f99c90511028adc2043031f) needs to be back-ported to 2.6.32.x
> as well.
> I observed a deadlock problem when running a PAE-enabled Debian 2.6.32.46+
> kernel with 6 VCPUs as a KVM guest on a (2.6.32, 3.2, 3.3) host kernel, which
> showed the following behaviour:
> 
> 1 VCPU is stuck in
>   pgd_alloc() → pgd_prepopulate_pmd() → ... → flush_tlb_others_ipi()
> while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
>     cpu_relax();
> (gdb) print f->flush_cpumask
> $5 = {1}
> 
> while all other VCPUs are stuck in
>   pgd_alloc() → spin_lock_irqsave(pgd_lock)
> 
> I tracked it down to the commit
>  2.6.39-rc1: 4981d01eada5354d81c8929d5b2836829ba3df7b
>  2.6.32.34: ba456fd7ec1bdc31a4ad4a6bd02802dcaa730a33
>  x86: Flush TLB if PGD entry is changed in i386 PAE mode
> which when reverted made the bug disappear.
> 
> Comparing 3.2 to 2.6.32.34 showed that the 'pgd-deadlock' patch went into
> 2.6.38, that is, before the 'PAE correctness' patch, so the problem was
> probably never observed in the main development branch.
> But for 2.6.32 the 'pgd-deadlock' patch is still missing, so the 'PAE
> correctness' patch made the problem worse with 2.6.32.
> 
> The patch was also back-ported to the openSUSE kernel
> <http://kernel.opensuse.org/cgit/kernel-source/commit/?id=ac27c01aa880c65d17043ab87249c613ac4c3635>.
> Since the patch didn't apply cleanly on the current Debian kernel, I had to
> backport it for us and Debian. The patch is also available from our (German)
> Bugzilla <https://forge.univention.org/bugzilla/show_bug.cgi?id=26661> or
> from the Debian BTS at
> <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=669335>.
> 
> I have no easy test case, but running multiple parallel builds inside the VM 
> normally triggers the bug within seconds to minutes. With the patch applied 
> the VM survived a night building packages without any problem.
> 
> Signed-off-by: Philipp Hahn <hahn@univention.de>
> 
> Sincerely
> Philipp

Thank you, I'm queuing it for next 32-stable.

Regards,
Willy


^ permalink raw reply	[flat|nested] 68+ messages in thread

end of thread, other threads:[~2012-04-23 19:10 UTC | newest]

Thread overview: 68+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-14 20:56 [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync Jeremy Fitzhardinge
2010-10-14 20:56 ` Jeremy Fitzhardinge
2010-10-15 17:07 ` [Xen-devel] " Jeremy Fitzhardinge
2010-10-15 17:07   ` Jeremy Fitzhardinge
2010-10-19 22:17   ` [tip:x86/mm] x86, mm: Hold " tip-bot for Jeremy Fitzhardinge
2010-10-20 10:36     ` Borislav Petkov
2010-10-20 19:31       ` [tip:x86/mm] x86, mm: Fix incorrect data type in vmalloc_sync_all() tip-bot for tip-bot for Jeremy Fitzhardinge
2010-10-20 19:50         ` Borislav Petkov
2010-10-20 19:53           ` H. Peter Anvin
2010-10-20 20:10             ` Borislav Petkov
2010-10-20 20:13               ` H. Peter Anvin
2010-10-20 22:11                 ` Borislav Petkov
2010-10-20 21:26             ` Ben Pfaff
2010-10-20 19:58       ` tip-bot for Borislav Petkov
2010-10-21 21:06 ` [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync Jeremy Fitzhardinge
2010-10-21 21:06   ` Jeremy Fitzhardinge
2010-10-21 21:26   ` H. Peter Anvin
2010-10-21 21:26     ` H. Peter Anvin
2010-10-21 21:34     ` Jeremy Fitzhardinge
2011-02-03  2:48   ` Andrea Arcangeli
2011-02-03 20:44     ` Jeremy Fitzhardinge
2011-02-03 20:44       ` Jeremy Fitzhardinge
2011-02-04  1:21       ` Andrea Arcangeli
2011-02-04 21:27         ` Jeremy Fitzhardinge
2011-02-07 23:20           ` Andrea Arcangeli
2011-02-15 19:07             ` [PATCH] fix pgd_lock deadlock Andrea Arcangeli
2011-02-15 19:26               ` Thomas Gleixner
2011-02-15 19:31                 ` Larry Woodman
2011-02-15 19:54                 ` Andrea Arcangeli
2011-02-15 19:54                   ` Andrea Arcangeli
2011-02-15 20:05                   ` Thomas Gleixner
2011-02-15 20:05                     ` Thomas Gleixner
2011-02-15 20:26                     ` Thomas Gleixner
2011-02-15 20:26                       ` Thomas Gleixner
2011-02-15 22:52                       ` Andrea Arcangeli
2011-02-15 23:03                         ` Thomas Gleixner
2011-02-15 23:03                           ` Thomas Gleixner
2011-02-15 23:17                           ` Andrea Arcangeli
2011-02-16  9:58                             ` Peter Zijlstra
2011-02-16 10:15                               ` Andrea Arcangeli
2011-02-16 10:15                                 ` Andrea Arcangeli
2011-02-16 10:28                                 ` Ingo Molnar
2011-02-16 10:28                                   ` Ingo Molnar
2011-02-16 14:49                                   ` Andrea Arcangeli
2011-02-16 16:26                                     ` Rik van Riel
2011-02-16 16:26                                       ` Rik van Riel
2011-02-16 20:15                                     ` Ingo Molnar
2012-04-23  9:07                                     ` [2.6.32.y][PATCH] " Philipp Hahn
2012-04-23 19:09                                       ` Willy Tarreau
2011-02-16 18:33                     ` [PATCH] " Andrea Arcangeli
2011-02-16 21:34                       ` Konrad Rzeszutek Wilk
2011-02-17 10:19                       ` Johannes Weiner
2011-02-21 14:30                         ` Andrea Arcangeli
2011-02-21 14:53                           ` Johannes Weiner
2011-02-22  7:48                             ` Jan Beulich
2011-02-22 13:49                               ` Andrea Arcangeli
2011-02-22 14:22                                 ` Jan Beulich
2011-02-22 14:22                                   ` Jan Beulich
2011-02-22 14:34                                   ` Andrea Arcangeli
2011-02-22 17:08                                     ` Jeremy Fitzhardinge
2011-02-22 17:08                                       ` Jeremy Fitzhardinge
2011-02-22 17:13                                       ` Andrea Arcangeli
2011-02-22 17:13                                         ` Andrea Arcangeli
2011-02-24  4:22                                   ` Andrea Arcangeli
2011-02-24  8:23                                     ` Jan Beulich
2011-02-24 14:11                                       ` Andrea Arcangeli
2011-02-21 17:40                         ` Jeremy Fitzhardinge
2011-02-03 20:59     ` [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync Larry Woodman
