* [PATCH] parisc: use per-pagetable spinlock
@ 2019-04-06 19:36 Mikulas Patocka
  2019-04-06 19:49 ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-06 19:36 UTC (permalink / raw)
  To: James E.J. Bottomley, Helge Deller, John David Anglin; +Cc: linux-parisc

Parisc uses a global spinlock to protect pagetable updates in the TLB
fault handlers. When multiple cores are taking TLB faults simultaneously,
the cache line containing the spinlock becomes a bottleneck.

This patch embeds the spinlock in the top level page directory, so that
every process has its own lock. It improves performance by 30% when doing
parallel compilations.

(please test it on 32-bit kernels - I don't have a machine for that)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 arch/parisc/include/asm/pgalloc.h  |    1 +
 arch/parisc/include/asm/pgtable.h  |   35 +++++++++++++++++++++++------------
 arch/parisc/include/asm/tlbflush.h |    6 +++---
 arch/parisc/kernel/cache.c         |    2 +-
 arch/parisc/kernel/entry.S         |    8 ++------
 5 files changed, 30 insertions(+), 22 deletions(-)

Index: linux-5.1-rc3/arch/parisc/include/asm/pgtable.h
===================================================================
--- linux-5.1-rc3.orig/arch/parisc/include/asm/pgtable.h	2019-04-06 11:12:03.000000000 +0200
+++ linux-5.1-rc3/arch/parisc/include/asm/pgtable.h	2019-04-06 11:12:22.000000000 +0200
@@ -17,7 +17,7 @@
 #include <asm/processor.h>
 #include <asm/cache.h>
 
-extern spinlock_t pa_tlb_lock;
+static inline spinlock_t *pgd_spinlock(pgd_t *);
 
 /*
  * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
@@ -59,11 +59,11 @@ static inline void purge_tlb_entries(str
 	do {							\
 		pte_t old_pte;					\
 		unsigned long flags;				\
-		spin_lock_irqsave(&pa_tlb_lock, flags);		\
+		spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\
 		old_pte = *ptep;				\
 		set_pte(ptep, pteval);				\
 		purge_tlb_entries(mm, addr);			\
-		spin_unlock_irqrestore(&pa_tlb_lock, flags);	\
+		spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\
 	} while (0)
 
 #endif /* !__ASSEMBLY__ */
@@ -88,10 +88,10 @@ static inline void purge_tlb_entries(str
 #if CONFIG_PGTABLE_LEVELS == 3
 #define PGD_ORDER	1 /* Number of pages per pgd */
 #define PMD_ORDER	1 /* Number of pages per pmd */
-#define PGD_ALLOC_ORDER	2 /* first pgd contains pmd */
+#define PGD_ALLOC_ORDER	(2 + 1) /* first pgd contains pmd */
 #else
 #define PGD_ORDER	1 /* Number of pages per pgd */
-#define PGD_ALLOC_ORDER	PGD_ORDER
+#define PGD_ALLOC_ORDER	(PGD_ORDER + 1)
 #endif
 
 /* Definitions for 3rd level (we use PLD here for Page Lower directory
@@ -459,6 +459,17 @@ extern void update_mmu_cache(struct vm_a
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
+
+static inline spinlock_t *pgd_spinlock(pgd_t *pgd)
+{
+	extern spinlock_t pa_tlb_flush_lock;
+
+	if (unlikely(pgd == swapper_pg_dir))
+		return &pa_tlb_flush_lock;
+	return (spinlock_t *)((char *)pgd + (PAGE_SIZE << (PGD_ALLOC_ORDER - 1)));
+}
+
+
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte;
@@ -467,15 +478,15 @@ static inline int ptep_test_and_clear_yo
 	if (!pte_young(*ptep))
 		return 0;
 
-	spin_lock_irqsave(&pa_tlb_lock, flags);
+	spin_lock_irqsave(pgd_spinlock(vma->vm_mm->pgd), flags);
 	pte = *ptep;
 	if (!pte_young(pte)) {
-		spin_unlock_irqrestore(&pa_tlb_lock, flags);
+		spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags);
 		return 0;
 	}
 	set_pte(ptep, pte_mkold(pte));
 	purge_tlb_entries(vma->vm_mm, addr);
-	spin_unlock_irqrestore(&pa_tlb_lock, flags);
+	spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags);
 	return 1;
 }
 
@@ -485,11 +496,11 @@ static inline pte_t ptep_get_and_clear(s
 	pte_t old_pte;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pa_tlb_lock, flags);
+	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
 	old_pte = *ptep;
 	set_pte(ptep, __pte(0));
 	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(&pa_tlb_lock, flags);
+	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
 
 	return old_pte;
 }
@@ -497,10 +508,10 @@ static inline pte_t ptep_get_and_clear(s
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	unsigned long flags;
-	spin_lock_irqsave(&pa_tlb_lock, flags);
+	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
 	set_pte(ptep, pte_wrprotect(*ptep));
 	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(&pa_tlb_lock, flags);
+	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
 }
 
 #define pte_same(A,B)	(pte_val(A) == pte_val(B))
Index: linux-5.1-rc3/arch/parisc/kernel/cache.c
===================================================================
--- linux-5.1-rc3.orig/arch/parisc/kernel/cache.c	2019-04-06 11:12:03.000000000 +0200
+++ linux-5.1-rc3/arch/parisc/kernel/cache.c	2019-04-06 11:12:03.000000000 +0200
@@ -45,7 +45,7 @@ void flush_icache_page_asm(unsigned long
  * by software.  We put a spinlock around all TLB flushes  to
  * ensure this.
  */
-DEFINE_SPINLOCK(pa_tlb_lock);
+DEFINE_SPINLOCK(pa_tlb_flush_lock);
 
 struct pdc_cache_info cache_info __read_mostly;
 #ifndef CONFIG_PA20
Index: linux-5.1-rc3/arch/parisc/include/asm/tlbflush.h
===================================================================
--- linux-5.1-rc3.orig/arch/parisc/include/asm/tlbflush.h	2019-04-06 11:12:03.000000000 +0200
+++ linux-5.1-rc3/arch/parisc/include/asm/tlbflush.h	2019-04-06 11:12:03.000000000 +0200
@@ -18,10 +18,10 @@
  * It is also used to ensure PTE updates are atomic and consistent
  * with the TLB.
  */
-extern spinlock_t pa_tlb_lock;
+extern spinlock_t pa_tlb_flush_lock;
 
-#define purge_tlb_start(flags)	spin_lock_irqsave(&pa_tlb_lock, flags)
-#define purge_tlb_end(flags)	spin_unlock_irqrestore(&pa_tlb_lock, flags)
+#define purge_tlb_start(flags)	spin_lock_irqsave(&pa_tlb_flush_lock, flags)
+#define purge_tlb_end(flags)	spin_unlock_irqrestore(&pa_tlb_flush_lock, flags)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_all_local(void *);
Index: linux-5.1-rc3/arch/parisc/kernel/entry.S
===================================================================
--- linux-5.1-rc3.orig/arch/parisc/kernel/entry.S	2019-04-06 11:12:03.000000000 +0200
+++ linux-5.1-rc3/arch/parisc/kernel/entry.S	2019-04-06 11:12:22.000000000 +0200
@@ -50,12 +50,8 @@
 
 	.import		pa_tlb_lock,data
 	.macro  load_pa_tlb_lock reg
-#if __PA_LDCW_ALIGNMENT > 4
-	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
-	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg
-#else
-	load32	PA(pa_tlb_lock), \reg
-#endif
+	mfctl		%cr25,\reg
+	addil		L%(PAGE_SIZE << (PGD_ALLOC_ORDER - 1)),\reg
 	.endm
 
 	/* space_to_prot macro creates a prot id from a space id */
Index: linux-5.1-rc3/arch/parisc/include/asm/pgalloc.h
===================================================================
--- linux-5.1-rc3.orig/arch/parisc/include/asm/pgalloc.h	2019-04-06 11:12:03.000000000 +0200
+++ linux-5.1-rc3/arch/parisc/include/asm/pgalloc.h	2019-04-06 11:12:03.000000000 +0200
@@ -41,6 +41,7 @@ static inline pgd_t *pgd_alloc(struct mm
 		__pgd_val_set(*pgd, PxD_FLAG_ATTACHED);
 #endif
 	}
+	spin_lock_init(pgd_spinlock(actual_pgd));
 	return actual_pgd;
 }
 


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 19:36 [PATCH] parisc: use per-pagetable spinlock Mikulas Patocka
@ 2019-04-06 19:49 ` James Bottomley
  2019-04-06 20:13   ` Mikulas Patocka
                     ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: James Bottomley @ 2019-04-06 19:49 UTC (permalink / raw)
  To: Mikulas Patocka, Helge Deller, John David Anglin; +Cc: linux-parisc

On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> Parisc uses a global spinlock to protect pagetable updates in the TLB
> fault handlers. When multiple cores are taking TLB faults
> simultaneously, the cache line containing the spinlock becomes a
> bottleneck.

You can't do this.  As the comment in cache.c says: the lock is to
protect the merced bus, which runs between the CPUs on some systems. 
That means it must be a single, global lock.  Of course, on systems
without a merced bus, we don't need the lock at all, so runtime
patching might be usable to fix that case.

James



* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 19:49 ` James Bottomley
@ 2019-04-06 20:13   ` Mikulas Patocka
  2019-04-06 20:32     ` James Bottomley
  2019-04-06 20:15   ` Helge Deller
  2019-04-10 19:10   ` John David Anglin
  2 siblings, 1 reply; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-06 20:13 UTC (permalink / raw)
  To: James Bottomley; +Cc: Helge Deller, John David Anglin, linux-parisc



On Sat, 6 Apr 2019, James Bottomley wrote:

> On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> > Parisc uses a global spinlock to protect pagetable updates in the TLB
> > fault handlers. When multiple cores are taking TLB faults
> > simultaneously, the cache line containing the spinlock becomes a
> > bottleneck.
> 
> You can't do this.  As the comment in cache.c says: the lock is to
> protect the merced bus, which runs between the CPUs on some systems. 
> That means it must be a single, global lock.

So - how could we detect if the Merced bus is present?

> Of course, on systems without a merced bus, we don't need the lock at 
> all, so runtime patching might be usable to fix that case.
> 
> James

The lock is still needed to synchronize TLB fault handlers with the code 
that modifies the pagetables - but we could have per-process lock for this 
purpose.

Mikulas


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 19:49 ` James Bottomley
  2019-04-06 20:13   ` Mikulas Patocka
@ 2019-04-06 20:15   ` Helge Deller
  2019-04-07  2:48     ` James Bottomley
  2019-04-10 19:10   ` John David Anglin
  2 siblings, 1 reply; 18+ messages in thread
From: Helge Deller @ 2019-04-06 20:15 UTC (permalink / raw)
  To: James Bottomley, Mikulas Patocka, John David Anglin; +Cc: linux-parisc

On 06.04.19 21:49, James Bottomley wrote:
> On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
>> Parisc uses a global spinlock to protect pagetable updates in the TLB
>> fault handlers. When multiple cores are taking TLB faults
>> simultaneously, the cache line containing the spinlock becomes a
>> bottleneck.
>
> You can't do this.  As the comment in cache.c says: the lock is to
> protect the merced bus, which runs between the CPUs on some systems.
> That means it must be a single, global lock.  Of course, on systems
> without a merced bus, we don't need the lock at all, so runtime
> patching might be usable to fix that case.

Is there a way to detect if a system has the Merced bus?

See arch/parisc/include/asm/tlbflush.h too:
/* This is for the serialisation of PxTLB broadcasts.  At least on the
 * N class systems, only one PxTLB inter processor broadcast can be
 * active at any one time on the Merced bus.  This tlb purge
 * synchronisation is fairly lightweight and harmless so we activate
 * it on all systems not just the N class.

The 30% speed improvement from Mikulas' patches doesn't seem lightweight...

Helge


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 20:13   ` Mikulas Patocka
@ 2019-04-06 20:32     ` James Bottomley
  2019-04-06 20:40       ` Mikulas Patocka
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2019-04-06 20:32 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Helge Deller, John David Anglin, linux-parisc

On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
> 
> On Sat, 6 Apr 2019, James Bottomley wrote:
> 
> > On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> > > Parisc uses a global spinlock to protect pagetable updates in the
> > > TLB fault handlers. When multiple cores are taking TLB faults
> > > simultaneously, the cache line containing the spinlock becomes a
> > > bottleneck.
> > 
> > You can't do this.  As the comment in cache.c says: the lock is to
> > protect the merced bus, which runs between the CPUs on some
> > systems.  That means it must be a single, global lock.
> 
> So - how could we detect if the Merced bus is present?

My best recollection is that it's only N class systems.

> > Of course, on systems without a merced bus, we don't need the lock
> at 
> > all, so runtime patching might be usable to fix that case.
> > 
> > James
> 
> The lock is still needed to synchronize TLB fault handlers with the
> code that modifies the pagetables - but we could have per-process
> lock for this purpose.

It is?  I don't think we need any per-arch sync for that.  The purge
should happen after all modifications are done so the next page fault
inserts the new TLB entry ... so if there is a place where the purge
lock matters to the page table updates, we're doing something wrong.

James



* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 20:32     ` James Bottomley
@ 2019-04-06 20:40       ` Mikulas Patocka
  2019-04-06 21:28         ` James Bottomley
  2019-04-10 15:05         ` John David Anglin
  0 siblings, 2 replies; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-06 20:40 UTC (permalink / raw)
  To: James Bottomley; +Cc: Helge Deller, John David Anglin, linux-parisc



On Sat, 6 Apr 2019, James Bottomley wrote:

> On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
> > 
> > > Of course, on systems without a merced bus, we don't need the lock
> > > at all, so runtime patching might be usable to fix that case.
> > > 
> > > James
> > 
> > The lock is still needed to synchronize TLB fault handlers with the
> > code that modifies the pagetables - but we could have per-process
> > lock for this purpose.
> 
> It is?  I don't think we need any per-arch sync for that.  The purge
> should happen after all modifications are done so the next page fault
> inserts the new TLB entry ... so if there is a place where the purge
> lock matters to the page table updates, we're doing something wrong.
> 
> James

Suppose that this sequence happens:

CPU1:
(inside the TLB miss handler)
read the value XXX from the pagetables to the register

CPU2:
modify the value in the pagetables to YYY
broadcast a TLB purge

CPU1:
receives the TLB purge broadcast and flushes the TLB
... continues executing the TLB handler and inserts the value XXX from the register into the TLB

And now, CPU1 is running with stale entry in the TLB. We need the lock to 
prevent this situation.
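
Roughly, the per-pagetable lock closes this window because both sides
serialize on it. As a C-level sketch only (the real miss handler lives in
entry.S, and insert_tlb_entry() below is just a hypothetical stand-in for
the idtlbt insert):

	/* CPU1: TLB miss handler, simplified sketch
	 * (mm, addr, ptep, flags as in the scenario above) */
	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
	pte = *ptep;				/* reads XXX */
	insert_tlb_entry(addr, pte);		/* hypothetical idtlbt stand-in, under the lock */
	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);

	/* CPU2: page table update, as in set_pte_at() / ptep_set_wrprotect() */
	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
	set_pte(ptep, __pte(YYY));		/* write the new value */
	purge_tlb_entries(mm, addr);		/* broadcast purge, still under the lock */
	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);

With the lock held across the read and the insert on CPU1, CPU2's purge
cannot land between them.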

Mikulas


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 20:40       ` Mikulas Patocka
@ 2019-04-06 21:28         ` James Bottomley
  2019-04-07 17:07           ` Mikulas Patocka
  2019-04-10 15:05         ` John David Anglin
  1 sibling, 1 reply; 18+ messages in thread
From: James Bottomley @ 2019-04-06 21:28 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Helge Deller, John David Anglin, linux-parisc

On Sat, 2019-04-06 at 16:40 -0400, Mikulas Patocka wrote:
> 
> On Sat, 6 Apr 2019, James Bottomley wrote:
> 
> > On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
> > > 
> > > > Of course, on systems without a merced bus, we don't need the lock
> > > > at all, so runtime patching might be usable to fix that case.
> > > > 
> > > > James
> > > 
> > > The lock is still needed to synchronize TLB fault handlers with the
> > > code that modifies the pagetables - but we could have per-process
> > > lock for this purpose.
> > 
> > It is?  I don't think we need any per-arch sync for that.  The purge
> > should happen after all modifications are done so the next page fault
> > inserts the new TLB entry ... so if there is a place where the purge
> > lock matters to the page table updates, we're doing something wrong.
> > 
> > James
> 
> Suppose that this sequence happens:
> 
> CPU1:
> (inside the TLB miss handler)
> read the value XXX from the pagetables to the register
> 
> CPU2:
> modify the value in the pagetables to YYY
> broadcast a TLB purge
> 
> CPU1:
> receives the TLB purge broadcast and flushes the TLB
> ... continues executing the TLB handler and inserts the value XXX
> from the register into the TLB
> 
> And now, CPU1 is running with stale entry in the TLB. We need the
> lock to prevent this situation.

Heh, this is the dreaded appendix F.  In general, if we're executing a
high priority interruption for a TLB miss, the address is termed relied
upon, so a purge for the same address won't be acted upon and
acknowledged by the CPU until we leave the high priority handler (and
have thus either inserted a TLB entry or dropped into full fault
handling).  This effectively makes the purge and the insertion atomic
with respect to each other as they would be if the CPU had a hardware
TLB miss handler.  In the worst case, the CPU refaults on the same
address because it got inserted then purged and the new translation
then gets inserted second time around.

James



* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 20:15   ` Helge Deller
@ 2019-04-07  2:48     ` James Bottomley
  2019-04-07 17:23       ` Mikulas Patocka
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2019-04-07  2:48 UTC (permalink / raw)
  To: Helge Deller, Mikulas Patocka, John David Anglin; +Cc: linux-parisc

On Sat, 2019-04-06 at 22:15 +0200, Helge Deller wrote:
> On 06.04.19 21:49, James Bottomley wrote:
> > On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> > > Parisc uses a global spinlock to protect pagetable updates in the
> > > TLB
> > > fault handlers. When multiple cores are taking TLB faults
> > > simultaneously, the cache line containing the spinlock becomes a
> > > bottleneck.
> > 
> > You can't do this.  As the comment in cache.c says: the lock is to
> > protect the merced bus, which runs between the CPUs on some
> > systems.
> > That means it must be a single, global lock.  Of course, on systems
> > without a merced bus, we don't need the lock at all, so runtime
> > patching might be usable to fix that case.
> 
> Is there a way to detect if a system has the Merced bus?
> 
> See arch/parisc/include/asm/tlbflush.h too:
> /* This is for the serialisation of PxTLB broadcasts.  At least on
> the
>  * N class systems, only one PxTLB inter processor broadcast can be
>  * active at any one time on the Merced bus.  This tlb purge
>  * synchronisation is fairly lightweight and harmless so we activate
>  * it on all systems not just the N class.
> 
> The 30% speed improvement from Mikulas' patches doesn't seem lightweight...

Well, that's because when it was originally conceived the patch was
only about purging.  It never actually involved the TLB insertion hot
path.  It turns out the entanglement occurred here:

commit 01ab60570427caa24b9debc369e452e86cd9beb4
Author: John David Anglin <dave.anglin@bell.net>
Date:   Wed Jul 1 17:18:37 2015 -0400

    parisc: Fix some PTE/TLB race conditions and optimize
__flush_tlb_range based on timing results
 

Which is when the dbit lock got replaced by the tlb purge lock.  I have
some vague memories about why we needed the dbit lock which I'll try to
make more coherent.

James



* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 21:28         ` James Bottomley
@ 2019-04-07 17:07           ` Mikulas Patocka
  0 siblings, 0 replies; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-07 17:07 UTC (permalink / raw)
  To: James Bottomley; +Cc: Helge Deller, John David Anglin, linux-parisc



On Sat, 6 Apr 2019, James Bottomley wrote:

> On Sat, 2019-04-06 at 16:40 -0400, Mikulas Patocka wrote:
> > 
> > On Sat, 6 Apr 2019, James Bottomley wrote:
> > 
> > > On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
> > > > 
> > > > > Of course, on systems without a merced bus, we don't need the lock
> > > > > at all, so runtime patching might be usable to fix that case.
> > > > > 
> > > > > James
> > > > 
> > > > The lock is still needed to synchronize TLB fault handlers with the
> > > > code that modifies the pagetables - but we could have per-process
> > > > lock for this purpose.
> > > 
> > > It is?  I don't think we need any per-arch sync for that.  The purge
> > > should happen after all modifications are done so the next page fault
> > > inserts the new TLB entry ... so if there is a place where the purge
> > > lock matters to the page table updates, we're doing something wrong.
> > > 
> > > James
> > 
> > Suppose that this sequence happens:
> > 
> > CPU1:
> > (inside the TLB miss handler)
> > read the value XXX from the pagetables to the register
> > 
> > CPU2:
> > modify the value in the pagetables to YYY
> > broadcast a TLB purge
> > 
> > CPU1:
> > receives the TLB purge broadcast and flushes the TLB
> > ... continues executing the TLB handler and inserts the value XXX
> > from the register into the TLB
> > 
> > And now, CPU1 is running with stale entry in the TLB. We need the
> > lock to prevent this situation.
> 
> Heh, this is the dreaded appendix F.  In general, if we're executing a
> high priority interruption for a TLB miss, the address is termed relied
> upon, so a purge for the same address won't be acted upon and
> acknowledged by the CPU until we leave the high priority handler (and
> have thus either inserted a TLB entry or dropped into full fault
> handling).  This effectively makes the purge and the insertion atomic
> with respect to each other as they would be if the CPU had a hardware
> TLB miss handler.  In the worst case, the CPU refaults on the same
> address because it got inserted then purged and the new translation
> then gets inserted second time around.
> 
> James

I see.

But the other remaining reason why we need the lock in the TLB handler is 
updating the accessed and dirty bits.

If we changed the pte layout so that there would be dirty and accessed 
bytes instead of bits, we could update them with the stb instruction 
without any locking.
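
As a rough illustration of the idea (a hypothetical layout, not a
worked-out patch):

	/*
	 * Hypothetical pte low word where the accessed and dirty flags each
	 * occupy a full byte.  The fault path could then set them with a
	 * plain byte store (stb on parisc) - no read-modify-write of the
	 * neighbouring bits, so no lock is needed against a concurrent
	 * update of the other flag.
	 */
	union hyp_pte_low {
		unsigned int word;
		struct {
			unsigned char flags;	/* remaining bit flags */
			unsigned char accessed;	/* whole byte, set with stb */
			unsigned char dirty;	/* whole byte, set with stb */
			unsigned char unused;
		};
	};

	static inline void hyp_pte_mkdirty(union hyp_pte_low *p)
	{
		WRITE_ONCE(p->dirty, 1);	/* a single byte store */
	}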

Mikulas


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-07  2:48     ` James Bottomley
@ 2019-04-07 17:23       ` Mikulas Patocka
  2019-04-07 17:42         ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-07 17:23 UTC (permalink / raw)
  To: James Bottomley; +Cc: Helge Deller, John David Anglin, linux-parisc



On Sat, 6 Apr 2019, James Bottomley wrote:

> On Sat, 2019-04-06 at 22:15 +0200, Helge Deller wrote:
> > On 06.04.19 21:49, James Bottomley wrote:
> > > On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> > > > Parisc uses a global spinlock to protect pagetable updates in the
> > > > TLB
> > > > fault handlers. When multiple cores are taking TLB faults
> > > > simultaneously, the cache line containing the spinlock becomes a
> > > > bottleneck.
> > > 
> > > You can't do this.  As the comment in cache.c says: the lock is to
> > > protect the merced bus, which runs between the CPUs on some
> > > systems.
> > > That means it must be a single, global lock.  Of course, on systems
> > > without a merced bus, we don't need the lock at all, so runtime
> > > patching might be usable to fix that case.
> > 
> > Is there a way to detect if a system has the Merced bus?
> > 
> > See arch/parisc/include/asm/tlbflush.h too:
> > /* This is for the serialisation of PxTLB broadcasts.  At least on
> > the
> >  * N class systems, only one PxTLB inter processor broadcast can be
> >  * active at any one time on the Merced bus.  This tlb purge
> >  * synchronisation is fairly lightweight and harmless so we activate
> >  * it on all systems not just the N class.
> > 
> > The 30% speed improvement from Mikulas' patches doesn't seem lightweight...
> 
> Well, that's because when it was originally conceived the patch was
> only about purging.  It never actually involved the TLB insertion hot
> path.  It turns out the entanglement occurred here:
> 
> commit 01ab60570427caa24b9debc369e452e86cd9beb4
> Author: John David Anglin <dave.anglin@bell.net>
> Date:   Wed Jul 1 17:18:37 2015 -0400
> 
>     parisc: Fix some PTE/TLB race conditions and optimize
> __flush_tlb_range based on timing results
>  
> 
> Which is when the dbit lock got replaced by the tlb purge lock.  I have
> some vague memories about why we needed the dbit lock which I'll try to
> make more coherent.
> 
> James

Before this patch, it used pa_dbit_lock for modifying pagetables and 
pa_tlb_lock for flushing.

So it still suffered the performance penalty with shared pa_dbit_lock.

Perhaps the proper thing would be to use global pa_tlb_lock for flushing 
and per-process tlb lock for pagetable updates.

Mikulas


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-07 17:23       ` Mikulas Patocka
@ 2019-04-07 17:42         ` James Bottomley
  0 siblings, 0 replies; 18+ messages in thread
From: James Bottomley @ 2019-04-07 17:42 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Helge Deller, John David Anglin, linux-parisc

On Sun, 2019-04-07 at 13:23 -0400, Mikulas Patocka wrote:
> 
> On Sat, 6 Apr 2019, James Bottomley wrote:
> 
> > On Sat, 2019-04-06 at 22:15 +0200, Helge Deller wrote:
> > > On 06.04.19 21:49, James Bottomley wrote:
> > > > On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
> > > > > Parisc uses a global spinlock to protect pagetable updates in the
> > > > > TLB fault handlers. When multiple cores are taking TLB faults
> > > > > simultaneously, the cache line containing the spinlock becomes a
> > > > > bottleneck.
> > > > 
> > > > You can't do this.  As the comment in cache.c says: the lock is to
> > > > protect the merced bus, which runs between the CPUs on some
> > > > systems.
> > > > That means it must be a single, global lock.  Of course, on systems
> > > > without a merced bus, we don't need the lock at all, so runtime
> > > > patching might be usable to fix that case.
> > > 
> > > Is there a way to detect if a system has the Merced bus?
> > > 
> > > See arch/parisc/include/asm/tlbflush.h too:
> > > /* This is for the serialisation of PxTLB broadcasts.  At least on the
> > >  * N class systems, only one PxTLB inter processor broadcast can be
> > >  * active at any one time on the Merced bus.  This tlb purge
> > >  * synchronisation is fairly lightweight and harmless so we activate
> > >  * it on all systems not just the N class.
> > > 
> > > The 30% speed improvement from Mikulas' patches doesn't seem lightweight...
> > 
> > Well, that's because when it was originally conceived the patch was
> > only about purging.  It never actually involved the TLB insertion hot
> > path.  It turns out the entanglement occurred here:
> > 
> > commit 01ab60570427caa24b9debc369e452e86cd9beb4
> > Author: John David Anglin <dave.anglin@bell.net>
> > Date:   Wed Jul 1 17:18:37 2015 -0400
> > 
> >     parisc: Fix some PTE/TLB race conditions and optimize
> > __flush_tlb_range based on timing results
> >  
> > 
> > Which is when the dbit lock got replaced by the tlb purge lock.  I have
> > some vague memories about why we needed the dbit lock which I'll try to
> > make more coherent.
> > 
> > James
> 
> Before this patch, it used pa_dbit_lock for modifying pagetables and 
> pa_tlb_lock for flushing.
> 
> So it still suffered the performance penalty with shared
> pa_dbit_lock.
> 
> Perhaps the proper thing would be to use global pa_tlb_lock for
> flushing and per-process tlb lock for pagetable updates.

Actually, I'm not sure we need to go back to the dbit lock.  The design
goal of the updates is not to lose set bits.  It's slightly
inefficient, but not a problem, if we lose cleared bits (a page that
should be clean stays dirty and gets rewritten or a page that should be
made old stays young in the cache) and it's not actually a problem if
we lose setting the accessed bit ... an active page may just get lost
from the LRU list and have to be faulted on next access.  So if the
problem boils down to ensuring the D bit stays set, we can update it,
check and rewrite if it comes back clear.  We can ensure forward
progress because all clearing sequences don't repeat.
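
Something along these lines, perhaps (only a sketch built on the existing
pte helpers, not the final form):

	/*
	 * Sketch: make sure a dirty-bit update is not lost to a racing
	 * clear-and-purge.  Set the bit, then re-read and retry until it
	 * sticks; forward progress relies on clearing sequences not
	 * repeating indefinitely.
	 */
	static inline void pte_mkdirty_checked(pte_t *ptep)
	{
		pte_t pte;

		do {
			pte = *ptep;
			set_pte(ptep, pte_mkdirty(pte));
		} while (!pte_dirty(*ptep));	/* rewrite if it came back clear */
	}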

James



* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 20:40       ` Mikulas Patocka
  2019-04-06 21:28         ` James Bottomley
@ 2019-04-10 15:05         ` John David Anglin
  2019-04-10 16:09           ` Mikulas Patocka
  1 sibling, 1 reply; 18+ messages in thread
From: John David Anglin @ 2019-04-10 15:05 UTC (permalink / raw)
  To: Mikulas Patocka, James Bottomley; +Cc: Helge Deller, linux-parisc

On 2019-04-06 4:40 p.m., Mikulas Patocka wrote:
>
> On Sat, 6 Apr 2019, James Bottomley wrote:
>
>> On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
>>>> Of course, on systems without a merced bus, we don't need the lock
>>>> at all, so runtime patching might be usable to fix that case.
>>>>
>>>> James
>>> The lock is still needed to synchronize TLB fault handlers with the
>>> code that modifies the pagetables - but we could have per-process
>>> lock for this purpose.
>> It is?  I don't think we need any per-arch sync for that.  The purge
>> should happen after all modifications are done so the next page fault
>> inserts the new TLB entry ... so if there is a place where the purge
>> lock matters to the page table updates, we're doing something wrong.
>>
>> James
> Suppose that this sequence happens:
>
> CPU1:
> (inside the TLB miss handler)
> read the value XXX from the pagetables to the register
The value is read holding TLB lock and the value is updated after insert.  Then, lock is released.
Only concern is whether TLB inserts are strongly ordered.

>
> CPU2:
> modify the value in the pagetables to YYY
> broadcast a TLB purge
CPU2 needs to acquire TLB lock before it can modify value and broadcast TLB purge (see set_pte_at).  Lock is then released.
TLB purges are strongly ordered.

So, I don't think this scenario can happen.
>
> CPU1:
> receives the TLB purge broadcast and flushes the TLB
> ... continues executing the TLB handler and inserts the value XXX from the register into the TLB
>
> And now, CPU1 is running with stale entry in the TLB. We need the lock to 
> prevent this situation.
>
> Mikulas
>

Dave

-- 
John David Anglin  dave.anglin@bell.net




* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-10 15:05         ` John David Anglin
@ 2019-04-10 16:09           ` Mikulas Patocka
       [not found]             ` <e81cc4d8-0da9-454b-6102-c89bb5cdd0b0@bell.net>
  0 siblings, 1 reply; 18+ messages in thread
From: Mikulas Patocka @ 2019-04-10 16:09 UTC (permalink / raw)
  To: John David Anglin; +Cc: James Bottomley, Helge Deller, linux-parisc




On Wed, 10 Apr 2019, John David Anglin wrote:

> On 2019-04-06 4:40 p.m., Mikulas Patocka wrote:
> >
> > On Sat, 6 Apr 2019, James Bottomley wrote:
> >
> >> On Sat, 2019-04-06 at 16:13 -0400, Mikulas Patocka wrote:
> >>>> Of course, on systems without a merced bus, we don't need the lock
> >>>> at all, so runtime patching might be usable to fix that case.
> >>>>
> >>>> James
> >>> The lock is still needed to synchronize TLB fault handlers with the
> >>> code that modifies the pagetables - but we could have per-process
> >>> lock for this purpose.
> >> It is?  I don't think we need any per-arch sync for that.  The purge
> >> should happen after all modifications are done so the next page fault
> >> inserts the new TLB entry ... so if there is a place where the purge
> >> lock matters to the page table updates, we're doing something wrong.
> >>
> >> James
> > Suppose that this sequence happens:
> >
> > CPU1:
> > (inside the TLB miss handler)
> > read the value XXX from the pagetables to the register
> The value is read holding TLB lock and the value is updated after insert.  Then, lock is released.
> Only concern is whether TLB inserts are strongly ordered.
> 
> >
> > CPU2:
> > modify the value in the pagetables to YYY
> > broadcast a TLB purge
> CPU2 needs to acquire TLB lock before it can modify value and broadcast TLB purge (see set_pte_at).  Lock is then released.
> TLB purges are strongly ordered.
> 
> So, I don't think this scenario can happen.

It can't happen in the current code. This is a hypothetical scenario that
could happen if we removed the TLB lock as suggested by James. But James
claims that it can't happen because the purge tlb instruction will wait 
until all the other CPUs finish executing a high-priority interruption.

What do you think? Could the TLB lock go away?

Having one lock shared by all four cores degrades performance 
significantly (30%).

Mikulas

> >
> > CPU1:
> > receives the TLB purge broadcast and flushes the TLB
> > ... continues executing the TLB handler and inserts the value XXX from the register into the TLB
> >
> > And now, CPU1 is running with stale entry in the TLB. We need the lock to 
> > prevent this situation.
> >
> > Mikulas
> >
> 
> Dave
> 
> -- 
> John David Anglin  dave.anglin@bell.net
> 
> 


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-06 19:49 ` James Bottomley
  2019-04-06 20:13   ` Mikulas Patocka
  2019-04-06 20:15   ` Helge Deller
@ 2019-04-10 19:10   ` John David Anglin
  2019-04-10 19:27     ` John David Anglin
  2 siblings, 1 reply; 18+ messages in thread
From: John David Anglin @ 2019-04-10 19:10 UTC (permalink / raw)
  To: James Bottomley, Mikulas Patocka, Helge Deller; +Cc: linux-parisc

On 2019-04-06 3:49 p.m., James Bottomley wrote:
> On Sat, 2019-04-06 at 15:36 -0400, Mikulas Patocka wrote:
>> Parisc uses a global spinlock to protect pagetable updates in the TLB
>> fault handlers. When multiple cores are taking TLB faults
>> simultaneously, the cache line containing the spinlock becomes a
>> bottleneck.
> You can't do this.  As the comment in cache.c says: the lock is to
> protect the merced bus, which runs between the CPUs on some systems. 
> That means it must be a single, global lock.  Of course, on systems
> without a merced bus, we don't need the lock at all, so runtime
> patching might be usable to fix that case.
To me, this looks like a good improvement.  It appears easy to adapt pgd_spinlock()
and the other places where the lock address is loaded for the Merced bus case.  Is anyone
actually using N class?

There are a couple of uses of pa_tlb_lock in pacache.S that are not updated.  These
only affect PA 1.x.  Was merced bus used on PA 1.x?  If not, then the locks could be
removed.

Dave

-- 
John David Anglin  dave.anglin@bell.net




* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-10 19:10   ` John David Anglin
@ 2019-04-10 19:27     ` John David Anglin
  2019-04-13 16:23       ` Helge Deller
  0 siblings, 1 reply; 18+ messages in thread
From: John David Anglin @ 2019-04-10 19:27 UTC (permalink / raw)
  To: James Bottomley, Mikulas Patocka, Helge Deller; +Cc: linux-parisc

On 2019-04-10 3:10 p.m., John David Anglin wrote:
> Was merced bus used on PA 1.x?  If not, then the locks could be
> removed.
OpenPA states that the Merced bus was used in the L1500, L3000 and N4000.  All are PA 2.0,
so the locks are not needed.

Dave

-- 
John David Anglin  dave.anglin@bell.net



* Re: [PATCH] parisc: use per-pagetable spinlock
       [not found]             ` <e81cc4d8-0da9-454b-6102-c89bb5cdd0b0@bell.net>
@ 2019-04-11 13:18               ` John David Anglin
  0 siblings, 0 replies; 18+ messages in thread
From: John David Anglin @ 2019-04-11 13:18 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: James Bottomley, Helge Deller, linux-parisc

On 2019-04-10 2:32 p.m., John David Anglin wrote:
>> Having one lock shared by all four cores degrades performance 
>> significantly (30%).
> I can believe that.
I installed the patch on the c8000.  It reduced my v5.0.7 kernel build time from about
51 to 39 minutes using -j6.  So, the speed improvement is definitely there.

Dave

-- 
John David Anglin  dave.anglin@bell.net




* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-10 19:27     ` John David Anglin
@ 2019-04-13 16:23       ` Helge Deller
  2019-04-13 17:57         ` John David Anglin
  0 siblings, 1 reply; 18+ messages in thread
From: Helge Deller @ 2019-04-13 16:23 UTC (permalink / raw)
  To: John David Anglin, James Bottomley, Mikulas Patocka; +Cc: linux-parisc

On 10.04.19 21:27, John David Anglin wrote:
> On 2019-04-10 3:10 p.m., John David Anglin wrote:
>> Was merced bus used on PA 1.x?  If not, then the locks could be
>> removed.
> OpenPA states that the Merced bus was used in the L1500, L3000 and N4000.  All are PA 2.0,
> so the locks are not needed.

And according to https://www.openpa.net/pa-risc_chipsets_stretch.html
it seems that an IKE I/O controller is used on Stretch only.
So, detecting an IKE should be sufficient to check whether we need
to avoid multiple TLB flushes on the bus...

Helge

ftp://parisc.parisc-linux.org/dmesg/rp5470_3.10.dmesg


* Re: [PATCH] parisc: use per-pagetable spinlock
  2019-04-13 16:23       ` Helge Deller
@ 2019-04-13 17:57         ` John David Anglin
  0 siblings, 0 replies; 18+ messages in thread
From: John David Anglin @ 2019-04-13 17:57 UTC (permalink / raw)
  To: Helge Deller, James Bottomley, Mikulas Patocka; +Cc: linux-parisc

On 2019-04-13 12:23 p.m., Helge Deller wrote:
> On 10.04.19 21:27, John David Anglin wrote:
>> On 2019-04-10 3:10 p.m., John David Anglin wrote:
>>> Was merced bus used on PA 1.x?  If not, then the locks could be
>>> removed.
>> OpenPA states that the Merced bus was used in the L1500, L3000 and N4000.  All are PA 2.0,
>> so the locks are not needed.
> And according to https://www.openpa.net/pa-risc_chipsets_stretch.html
> it seems that an IKE I/O controller is used on Stretch only.
> So, detecting an IKE should be sufficient to check whether we need
> to avoid multiple TLB flushes on the bus...
>
In hardware.c, we have:

        {HPHW_BCPORT, 0x800, 0x0000C, 0x10, "DEW BC Merced Port"},
        {HPHW_BCPORT, 0x801, 0x0000C, 0x10, "SMC Bus Interface Merced Bus0"},
        {HPHW_BCPORT, 0x802, 0x0000C, 0x10, "SMC Bus INterface Merced Bus1"},
        {HPHW_BCPORT, 0x803, 0x0000C, 0x10, "IKE I/O BC Merced Port"},
        {HPHW_BCPORT, 0x781, 0x0000C, 0x00, "IKE I/O BC Ropes Port"},
        {HPHW_BCPORT, 0x804, 0x0000C, 0x10, "REO I/O BC Merced Port"},
        {HPHW_BCPORT, 0x782, 0x0000C, 0x00, "REO I/O BC Ropes Port"},
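
So a runtime check could simply look for one of the Merced ports, e.g.
(untested sketch; assumes the parisc_bus_type / to_parisc_device()
helpers from <asm/parisc-device.h>):

	static int __init merced_port_cb(struct device *dev, void *data)
	{
		struct parisc_device *padev = to_parisc_device(dev);

		/* hversion 0x800-0x804 are the Merced ports listed above */
		return padev->id.hw_type == HPHW_BCPORT &&
		       padev->id.hversion >= 0x800 && padev->id.hversion <= 0x804;
	}

	static bool __init machine_has_merced_bus(void)
	{
		return bus_for_each_dev(&parisc_bus_type, NULL, NULL,
					merced_port_cb) != 0;
	}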

Dave

-- 
John David Anglin  dave.anglin@bell.net


