All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter
@ 2014-02-06 11:30 Will Deacon
  2014-02-06 11:30 ` [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers Will Deacon
                   ` (5 more replies)
  0 siblings, 6 replies; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

The dsb instruction takes an option specifying both the target access
types and shareability domain.

This patch allows such an option to be passed to the dsb macro,
resulting in potentially more efficient code. Currently the option is
ignored until all callers are updated (unlike ARM, the option is
mandated by the assembler).

Signed-off-by: Will Deacon <will.deacon@arm.com>
---

Catalin -- I'd like to get this simple change in for 3.14, since it's a
           blocker for actually implementing the barrier options (we
	   need to move all of the callers over before making the switch).

 arch/arm64/include/asm/barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 78e20ba8806b..409ca370cfe2 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -25,7 +25,7 @@
 #define wfi()		asm volatile("wfi" : : : "memory")
 
 #define isb()		asm volatile("isb" : : : "memory")
-#define dsb()		asm volatile("dsb sy" : : : "memory")
+#define dsb(opt)	asm volatile("dsb sy" : : : "memory")
 
 #define mb()		dsb()
 #define rmb()		asm volatile("dsb ld" : : : "memory")
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
@ 2014-02-06 11:30 ` Will Deacon
  2014-02-06 11:41   ` Catalin Marinas
  2014-02-06 11:30 ` [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed Will Deacon
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

When calling our low-level barrier macros directly, we can often suffice
with more relaxed behaviour than the default "all accesses, full system"
option.

This patch updates the users of dsb() to specify the option whith they
actually require. The sev() macro is also updated so that a sevl can
be emitted if necessary.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/barrier.h    |  4 ++--
 arch/arm64/include/asm/cacheflush.h |  2 +-
 arch/arm64/include/asm/pgtable.h    |  4 ++--
 arch/arm64/include/asm/tlbflush.h   | 14 +++++++-------
 arch/arm64/kernel/process.c         |  2 +-
 arch/arm64/kvm/sys_regs.c           |  4 ++--
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 409ca370cfe2..09f4cb4b665e 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -20,14 +20,14 @@
 
 #ifndef __ASSEMBLY__
 
-#define sev()		asm volatile("sev" : : : "memory")
+#define sev(l)		asm volatile("sev" #l : : : "memory")
 #define wfe()		asm volatile("wfe" : : : "memory")
 #define wfi()		asm volatile("wfi" : : : "memory")
 
 #define isb()		asm volatile("isb" : : : "memory")
 #define dsb(opt)	asm volatile("dsb sy" : : : "memory")
 
-#define mb()		dsb()
+#define mb()		dsb(sy)
 #define rmb()		asm volatile("dsb ld" : : : "memory")
 #define wmb()		asm volatile("dsb st" : : : "memory")
 
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index fea9ee327206..b9495549cb08 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -142,7 +142,7 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end)
 	 * set_pte_at() called from vmap_pte_range() does not
 	 * have a DSB after cleaning the cache line.
 	 */
-	dsb();
+	dsb(ishst);
 }
 
 static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b524dcd17243..54aa68f61008 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -310,7 +310,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
 	*pmdp = pmd;
-	dsb();
+	dsb(ishst);
 }
 
 static inline void pmd_clear(pmd_t *pmdp)
@@ -340,7 +340,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
 	*pudp = pud;
-	dsb();
+	dsb(ishst);
 }
 
 static inline void pud_clear(pud_t *pudp)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 8b482035cfc2..3083a08f9622 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -72,9 +72,9 @@ extern struct cpu_tlb_fns cpu_tlb;
  */
 static inline void flush_tlb_all(void)
 {
-	dsb();
+	dsb(ishst);
 	asm("tlbi	vmalle1is");
-	dsb();
+	dsb(ish);
 	isb();
 }
 
@@ -82,9 +82,9 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned long asid = (unsigned long)ASID(mm) << 48;
 
-	dsb();
+	dsb(ishst);
 	asm("tlbi	aside1is, %0" : : "r" (asid));
-	dsb();
+	dsb(ish);
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -93,9 +93,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 	unsigned long addr = uaddr >> 12 |
 		((unsigned long)ASID(vma->vm_mm) << 48);
 
-	dsb();
+	dsb(ishst);
 	asm("tlbi	vae1is, %0" : : "r" (addr));
-	dsb();
+	dsb(ish);
 }
 
 /*
@@ -114,7 +114,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
 	 * set_pte() does not have a DSB, so make sure that the page table
 	 * write is visible.
 	 */
-	dsb();
+	dsb(ishst);
 }
 
 #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1c0a9be2ffa8..8e78cb238376 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -294,7 +294,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	 * Complete any pending TLB or cache maintenance on this CPU in case
 	 * the thread migrates to a different CPU.
 	 */
-	dsb();
+	dsb(ish);
 
 	/* the actual thread switch */
 	last = cpu_switch_to(prev, next);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 02e9d09e1d80..bbaaabe8e899 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -70,13 +70,13 @@ static u32 get_ccsidr(u32 csselr)
 static void do_dc_cisw(u32 val)
 {
 	asm volatile("dc cisw, %x0" : : "r" (val));
-	dsb();
+	dsb(ishst);
 }
 
 static void do_dc_csw(u32 val)
 {
 	asm volatile("dc csw, %x0" : : "r" (val));
-	dsb();
+	dsb(ishst);
 }
 
 /* See note@ARM ARM B1.14.4 */
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
  2014-02-06 11:30 ` [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers Will Deacon
@ 2014-02-06 11:30 ` Will Deacon
  2014-02-06 11:39   ` Marc Zyngier
  2014-02-06 11:45   ` Catalin Marinas
  2014-02-06 11:30 ` [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables Will Deacon
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

When sending an SGI to another CPU, we require a DSB to ensure that
any pending stores to normal memory are made visible to the recipient
before the interrupt arrives.

Rather than use a dsb() (which will soon cause an assembly error on
arm64) followed by a writel_relaxed, we can use a writel instead, which
will emit a dsb st prior to the str.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/irqchip/irq-gic.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 341c6016812d..03fe5ef3f2fe 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 	/*
 	 * Ensure that stores to Normal memory are visible to the
 	 * other CPUs before issuing the IPI.
+	 *
+	 * This always happens on GIC0.
 	 */
-	dsb();
-
-	/* this always happens on GIC0 */
-	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
 
 	raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
 }
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
  2014-02-06 11:30 ` [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers Will Deacon
  2014-02-06 11:30 ` [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed Will Deacon
@ 2014-02-06 11:30 ` Will Deacon
  2014-02-06 11:51   ` Catalin Marinas
  2014-02-06 11:30 ` [PATCH 5/6] arm64: barriers: wire up new barrier options Will Deacon
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

On coherent systems, publishing new page tables to the SMMU walker is
achieved with a dsb instruction. In fact, this can be a dsb(ishst) which
also provides the mandatory barrier option for arm64.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/iommu/arm-smmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 8911850c9444..8f6fee54f3b1 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1417,7 +1417,7 @@ out_unlock:
 
 	/* Ensure new page tables are visible to the hardware walker */
 	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
-		dsb();
+		dsb(ishst);
 
 	return ret;
 }
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 5/6] arm64: barriers: wire up new barrier options
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
                   ` (2 preceding siblings ...)
  2014-02-06 11:30 ` [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables Will Deacon
@ 2014-02-06 11:30 ` Will Deacon
  2014-02-06 11:55   ` Catalin Marinas
  2014-02-06 11:30 ` [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP Will Deacon
  2014-02-06 11:39 ` [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Catalin Marinas
  5 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

Now that all callers of the barrier macros are updated to pass the
mandatory options, update the macros so the option is actually used.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/barrier.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 09f4cb4b665e..b5219b651850 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -25,11 +25,12 @@
 #define wfi()		asm volatile("wfi" : : : "memory")
 
 #define isb()		asm volatile("isb" : : : "memory")
-#define dsb(opt)	asm volatile("dsb sy" : : : "memory")
+#define dmb(opt)	asm volatile("dmb " #opt : : : "memory")
+#define dsb(opt)	asm volatile("dsb " #opt : : : "memory")
 
 #define mb()		dsb(sy)
-#define rmb()		asm volatile("dsb ld" : : : "memory")
-#define wmb()		asm volatile("dsb st" : : : "memory")
+#define rmb()		dsb(ld)
+#define wmb()		dsb(st)
 
 #ifndef CONFIG_SMP
 #define smp_mb()	barrier()
@@ -53,9 +54,9 @@ do {									\
 
 #else
 
-#define smp_mb()	asm volatile("dmb ish" : : : "memory")
-#define smp_rmb()	asm volatile("dmb ishld" : : : "memory")
-#define smp_wmb()	asm volatile("dmb ishst" : : : "memory")
+#define smp_mb()	dmb(ish)
+#define smp_rmb()	dmb(ishld)
+#define smp_wmb()	dmb(ishst)
 
 #define smp_store_release(p, v)						\
 do {									\
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
                   ` (3 preceding siblings ...)
  2014-02-06 11:30 ` [PATCH 5/6] arm64: barriers: wire up new barrier options Will Deacon
@ 2014-02-06 11:30 ` Will Deacon
  2014-02-06 11:56   ` Catalin Marinas
  2014-02-06 11:39 ` [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Catalin Marinas
  5 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:30 UTC (permalink / raw)
  To: linux-arm-kernel

The recently introduced acquire/release accessors refer to smp_mb()
in the !CONFIG_SMP case. This is confusing when reading the code, so use
barrier() directly when we know we're UP.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/barrier.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index b5219b651850..ef483c75ae34 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -40,7 +40,7 @@
 #define smp_store_release(p, v)						\
 do {									\
 	compiletime_assert_atomic_type(*p);				\
-	smp_mb();							\
+	barrier();							\
 	ACCESS_ONCE(*p) = (v);						\
 } while (0)
 
@@ -48,7 +48,7 @@ do {									\
 ({									\
 	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
 	compiletime_assert_atomic_type(*p);				\
-	smp_mb();							\
+	barrier();							\
 	___p1;								\
 })
 
-- 
1.8.2.2

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:30 ` [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed Will Deacon
@ 2014-02-06 11:39   ` Marc Zyngier
  2014-02-06 11:45   ` Catalin Marinas
  1 sibling, 0 replies; 29+ messages in thread
From: Marc Zyngier @ 2014-02-06 11:39 UTC (permalink / raw)
  To: linux-arm-kernel

On 06/02/14 11:30, Will Deacon wrote:
> When sending an SGI to another CPU, we require a DSB to ensure that
> any pending stores to normal memory are made visible to the recipient
> before the interrupt arrives.
> 
> Rather than use a dsb() (which will soon cause an assembly error on
> arm64) followed by a writel_relaxed, we can use a writel instead, which
> will emit a dsb st prior to the str.
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Signed-off-by: Will Deacon <will.deacon@arm.com>

Looks sensible to me.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter
  2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
                   ` (4 preceding siblings ...)
  2014-02-06 11:30 ` [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP Will Deacon
@ 2014-02-06 11:39 ` Catalin Marinas
  5 siblings, 0 replies; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:39 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:48AM +0000, Will Deacon wrote:
> The dsb instruction takes an option specifying both the target access
> types and shareability domain.
> 
> This patch allows such an option to be passed to the dsb macro,
> resulting in potentially more efficient code. Currently the option is
> ignored until all callers are updated (unlike ARM, the option is
> mandated by the assembler).
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
> 
> Catalin -- I'd like to get this simple change in for 3.14, since it's a
>            blocker for actually implementing the barrier options (we
> 	   need to move all of the callers over before making the switch).

OK, no problem. Applied.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers
  2014-02-06 11:30 ` [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers Will Deacon
@ 2014-02-06 11:41   ` Catalin Marinas
  2014-02-06 11:45     ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:41 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:49AM +0000, Will Deacon wrote:
> --- a/arch/arm64/include/asm/barrier.h
> +++ b/arch/arm64/include/asm/barrier.h
> @@ -20,14 +20,14 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -#define sev()		asm volatile("sev" : : : "memory")
> +#define sev(l)		asm volatile("sev" #l : : : "memory")

Would we actually ever use sev(l) form C? And it's a new instruction
rather than an argument to the existing sev.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers
  2014-02-06 11:41   ` Catalin Marinas
@ 2014-02-06 11:45     ` Will Deacon
  2014-02-06 11:49       ` Catalin Marinas
  0 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:41:30AM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 11:30:49AM +0000, Will Deacon wrote:
> > --- a/arch/arm64/include/asm/barrier.h
> > +++ b/arch/arm64/include/asm/barrier.h
> > @@ -20,14 +20,14 @@
> >  
> >  #ifndef __ASSEMBLY__
> >  
> > -#define sev()		asm volatile("sev" : : : "memory")
> > +#define sev(l)		asm volatile("sev" #l : : : "memory")
> 
> Would we actually ever use sev(l) form C? And it's a new instruction
> rather than an argument to the existing sev.

I don't know, but if it's not there then I'm pretty sure people will always
use sev, even if sevl could be used. In fact, sev and sevl are both aliases
of hint, so they're the same instruction with different immediate operands.

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:30 ` [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed Will Deacon
  2014-02-06 11:39   ` Marc Zyngier
@ 2014-02-06 11:45   ` Catalin Marinas
  2014-02-06 11:51     ` Will Deacon
  1 sibling, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> index 341c6016812d..03fe5ef3f2fe 100644
> --- a/drivers/irqchip/irq-gic.c
> +++ b/drivers/irqchip/irq-gic.c
> @@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
>  	/*
>  	 * Ensure that stores to Normal memory are visible to the
>  	 * other CPUs before issuing the IPI.
> +	 *
> +	 * This always happens on GIC0.
>  	 */
> -	dsb();
> -
> -	/* this always happens on GIC0 */
> -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);

That's heavier than a dsb() since with outer caches on ARM we also get
an outer_sync() call.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers
  2014-02-06 11:45     ` Will Deacon
@ 2014-02-06 11:49       ` Catalin Marinas
  2014-02-06 11:52         ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:45:59AM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 11:41:30AM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 11:30:49AM +0000, Will Deacon wrote:
> > > --- a/arch/arm64/include/asm/barrier.h
> > > +++ b/arch/arm64/include/asm/barrier.h
> > > @@ -20,14 +20,14 @@
> > >  
> > >  #ifndef __ASSEMBLY__
> > >  
> > > -#define sev()		asm volatile("sev" : : : "memory")
> > > +#define sev(l)		asm volatile("sev" #l : : : "memory")
> > 
> > Would we actually ever use sev(l) form C? And it's a new instruction
> > rather than an argument to the existing sev.
> 
> I don't know, but if it's not there then I'm pretty sure people will always
> use sev, even if sevl could be used. In fact, sev and sevl are both aliases
> of hint, so they're the same instruction with different immediate operands.

SEVL was pretty much introduced to avoid a branch in the spinlock
functions. I'm not aware of other uses (on arm64 we don't even use the
wfe() macro directly). I would skip this for now.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:45   ` Catalin Marinas
@ 2014-02-06 11:51     ` Will Deacon
  2014-02-06 11:54       ` Catalin Marinas
  0 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> > index 341c6016812d..03fe5ef3f2fe 100644
> > --- a/drivers/irqchip/irq-gic.c
> > +++ b/drivers/irqchip/irq-gic.c
> > @@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
> >  	/*
> >  	 * Ensure that stores to Normal memory are visible to the
> >  	 * other CPUs before issuing the IPI.
> > +	 *
> > +	 * This always happens on GIC0.
> >  	 */
> > -	dsb();
> > -
> > -	/* this always happens on GIC0 */
> > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> 
> That's heavier than a dsb() since with outer caches on ARM we also get
> an outer_sync() call.

Yes, which I think we actually need in this case, since we're trying to make
normal writes visible to a CPU before a device write hits the GIC.

I can update the commit message if you like?

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables
  2014-02-06 11:30 ` [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables Will Deacon
@ 2014-02-06 11:51   ` Catalin Marinas
  0 siblings, 0 replies; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:51AM +0000, Will Deacon wrote:
> On coherent systems, publishing new page tables to the SMMU walker is
> achieved with a dsb instruction. In fact, this can be a dsb(ishst) which
> also provides the mandatory barrier option for arm64.
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers
  2014-02-06 11:49       ` Catalin Marinas
@ 2014-02-06 11:52         ` Will Deacon
  0 siblings, 0 replies; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:52 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:49:52AM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 11:45:59AM +0000, Will Deacon wrote:
> > On Thu, Feb 06, 2014 at 11:41:30AM +0000, Catalin Marinas wrote:
> > > On Thu, Feb 06, 2014 at 11:30:49AM +0000, Will Deacon wrote:
> > > > --- a/arch/arm64/include/asm/barrier.h
> > > > +++ b/arch/arm64/include/asm/barrier.h
> > > > @@ -20,14 +20,14 @@
> > > >  
> > > >  #ifndef __ASSEMBLY__
> > > >  
> > > > -#define sev()		asm volatile("sev" : : : "memory")
> > > > +#define sev(l)		asm volatile("sev" #l : : : "memory")
> > > 
> > > Would we actually ever use sev(l) form C? And it's a new instruction
> > > rather than an argument to the existing sev.
> > 
> > I don't know, but if it's not there then I'm pretty sure people will always
> > use sev, even if sevl could be used. In fact, sev and sevl are both aliases
> > of hint, so they're the same instruction with different immediate operands.
> 
> SEVL was pretty much introduced to avoid a branch in the spinlock
> functions. I'm not aware of other uses (on arm64 we don't even use the
> wfe() macro directly). I would skip this for now.

Okey doke, I'll drop this hunk for the time being.

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:51     ` Will Deacon
@ 2014-02-06 11:54       ` Catalin Marinas
  2014-02-06 11:57         ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:51:21AM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > > diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> > > index 341c6016812d..03fe5ef3f2fe 100644
> > > --- a/drivers/irqchip/irq-gic.c
> > > +++ b/drivers/irqchip/irq-gic.c
> > > @@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
> > >  	/*
> > >  	 * Ensure that stores to Normal memory are visible to the
> > >  	 * other CPUs before issuing the IPI.
> > > +	 *
> > > +	 * This always happens on GIC0.
> > >  	 */
> > > -	dsb();
> > > -
> > > -	/* this always happens on GIC0 */
> > > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > 
> > That's heavier than a dsb() since with outer caches on ARM we also get
> > an outer_sync() call.
> 
> Yes, which I think we actually need in this case, since we're trying to make
> normal writes visible to a CPU before a device write hits the GIC.

If they are all in the inner shareable domain and with the caches
enabled, we don't need to flush the outer cache (as in the PL310 case
which is common to all CPUs; other saner outer caches propagate the
barrier ;). The outer_sync is needed when the memory accesses are
non-cacheable and we need to drain both the CPU write-buffer and the
PL310 one.

For our case here, we only need to ensure the visibility of writes on a
CPU to another but assuming SMP and caches enabled, so DSB is enough.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 5/6] arm64: barriers: wire up new barrier options
  2014-02-06 11:30 ` [PATCH 5/6] arm64: barriers: wire up new barrier options Will Deacon
@ 2014-02-06 11:55   ` Catalin Marinas
  0 siblings, 0 replies; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:52AM +0000, Will Deacon wrote:
> Now that all callers of the barrier macros are updated to pass the
> mandatory options, update the macros so the option is actually used.
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP
  2014-02-06 11:30 ` [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP Will Deacon
@ 2014-02-06 11:56   ` Catalin Marinas
  0 siblings, 0 replies; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 11:56 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:30:53AM +0000, Will Deacon wrote:
> The recently introduced acquire/release accessors refer to smp_mb()
> in the !CONFIG_SMP case. This is confusing when reading the code, so use
> barrier() directly when we know we're UP.
> 
> Signed-off-by: Will Deacon <will.deacon@arm.com>

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:54       ` Catalin Marinas
@ 2014-02-06 11:57         ` Will Deacon
  2014-02-06 12:00           ` Catalin Marinas
  0 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 11:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:54:30AM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 11:51:21AM +0000, Will Deacon wrote:
> > On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> > > On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > > > diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> > > > index 341c6016812d..03fe5ef3f2fe 100644
> > > > --- a/drivers/irqchip/irq-gic.c
> > > > +++ b/drivers/irqchip/irq-gic.c
> > > > @@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
> > > >  	/*
> > > >  	 * Ensure that stores to Normal memory are visible to the
> > > >  	 * other CPUs before issuing the IPI.
> > > > +	 *
> > > > +	 * This always happens on GIC0.
> > > >  	 */
> > > > -	dsb();
> > > > -
> > > > -	/* this always happens on GIC0 */
> > > > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > 
> > > That's heavier than a dsb() since with outer caches on ARM we also get
> > > an outer_sync() call.
> > 
> > Yes, which I think we actually need in this case, since we're trying to make
> > normal writes visible to a CPU before a device write hits the GIC.
> 
> If they are all in the inner shareable domain and with the caches
> enabled, we don't need to flush the outer cache (as in the PL310 case
> which is common to all CPUs; other saner outer caches propagate the
> barrier ;). The outer_sync is needed when the memory accesses are
> non-cacheable and we need to drain both the CPU write-buffer and the
> PL310 one.
> 
> For our case here, we only need to ensure the visibility of writes on a
> CPU to another but assuming SMP and caches enabled, so DSB is enough.

Hmm, but we *do* use this for boot and need to ensure that any mailboxes are
visible. Maybe we have enough cacheflushing/barriers for that already, but
I'm really uncomfortable making this a simple dsb(ishst).

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 11:57         ` Will Deacon
@ 2014-02-06 12:00           ` Catalin Marinas
  2014-02-06 12:13             ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 12:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 11:57:39AM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 11:54:30AM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 11:51:21AM +0000, Will Deacon wrote:
> > > On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> > > > On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > > > > diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> > > > > index 341c6016812d..03fe5ef3f2fe 100644
> > > > > --- a/drivers/irqchip/irq-gic.c
> > > > > +++ b/drivers/irqchip/irq-gic.c
> > > > > @@ -662,11 +662,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
> > > > >  	/*
> > > > >  	 * Ensure that stores to Normal memory are visible to the
> > > > >  	 * other CPUs before issuing the IPI.
> > > > > +	 *
> > > > > +	 * This always happens on GIC0.
> > > > >  	 */
> > > > > -	dsb();
> > > > > -
> > > > > -	/* this always happens on GIC0 */
> > > > > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > 
> > > > That's heavier than a dsb() since with outer caches on ARM we also get
> > > > an outer_sync() call.
> > > 
> > > Yes, which I think we actually need in this case, since we're trying to make
> > > normal writes visible to a CPU before a device write hits the GIC.
> > 
> > If they are all in the inner shareable domain and with the caches
> > enabled, we don't need to flush the outer cache (as in the PL310 case
> > which is common to all CPUs; other saner outer caches propagate the
> > barrier ;). The outer_sync is needed when the memory accesses are
> > non-cacheable and we need to drain both the CPU write-buffer and the
> > PL310 one.
> > 
> > For our case here, we only need to ensure the visibility of writes on a
> > CPU to another but assuming SMP and caches enabled, so DSB is enough.
> 
> Hmm, but we *do* use this for boot and need to ensure that any mailboxes are
> visible. Maybe we have enough cacheflushing/barriers for that already, but
> I'm really uncomfortable making this a simple dsb(ishst).

For boot we explicitly flush the caches for the shared data, so we don't
need this. The dsb() here is for standard smp_call_* etc. We didn't have
outer_sync() before, so you are slightly changing the functionality here
(arguably, ishst is relaxing the requirements but I'm not worried about
this, I consider that's the standard use-case for this function).

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 12:00           ` Catalin Marinas
@ 2014-02-06 12:13             ` Will Deacon
  2014-02-06 12:23               ` Catalin Marinas
  0 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 12:13 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 12:00:35PM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 11:57:39AM +0000, Will Deacon wrote:
> > On Thu, Feb 06, 2014 at 11:54:30AM +0000, Catalin Marinas wrote:
> > > On Thu, Feb 06, 2014 at 11:51:21AM +0000, Will Deacon wrote:
> > > > On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> > > > > On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > > > > > -	/* this always happens on GIC0 */
> > > > > > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > > > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > > 
> > > > > That's heavier than a dsb() since with outer caches on ARM we also get
> > > > > an outer_sync() call.
> > > > 
> > > > Yes, which I think we actually need in this case, since we're trying to make
> > > > normal writes visible to a CPU before a device write hits the GIC.
> > > 
> > > If they are all in the inner shareable domain and with the caches
> > > enabled, we don't need to flush the outer cache (as in the PL310 case
> > > which is common to all CPUs; other saner outer caches propagate the
> > > barrier ;). The outer_sync is needed when the memory accesses are
> > > non-cacheable and we need to drain both the CPU write-buffer and the
> > > PL310 one.
> > > 
> > > For our case here, we only need to ensure the visibility of writes on a
> > > CPU to another but assuming SMP and caches enabled, so DSB is enough.
> > 
> > Hmm, but we *do* use this for boot and need to ensure that any mailboxes are
> > visible. Maybe we have enough cacheflushing/barriers for that already, but
> > I'm really uncomfortable making this a simple dsb(ishst).
> 
> For boot we explicitly flush the caches for the shared data, so we don't
> need this. The dsb() here is for standard smp_call_* etc. We didn't have
> outer_sync() before, so you are slightly changing the functionality here
> (arguably, ishst is relaxing the requirements but I'm not worried about
> this, I consider that's the standard use-case for this function).

Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
talking to is either (a) coherent in the inner-shareable domain or (b)
incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
work?

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 12:13             ` Will Deacon
@ 2014-02-06 12:23               ` Catalin Marinas
  2014-02-06 13:26                 ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 12:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 12:13:50PM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 12:00:35PM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 11:57:39AM +0000, Will Deacon wrote:
> > > On Thu, Feb 06, 2014 at 11:54:30AM +0000, Catalin Marinas wrote:
> > > > On Thu, Feb 06, 2014 at 11:51:21AM +0000, Will Deacon wrote:
> > > > > On Thu, Feb 06, 2014 at 11:45:59AM +0000, Catalin Marinas wrote:
> > > > > > On Thu, Feb 06, 2014 at 11:30:50AM +0000, Will Deacon wrote:
> > > > > > > -	/* this always happens on GIC0 */
> > > > > > > -	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > > > > +	writel(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
> > > > > > 
> > > > > > That's heavier than a dsb() since with outer caches on ARM we also get
> > > > > > an outer_sync() call.
> > > > > 
> > > > > Yes, which I think we actually need in this case, since we're trying to make
> > > > > normal writes visible to a CPU before a device write hits the GIC.
> > > > 
> > > > If they are all in the inner shareable domain and with the caches
> > > > enabled, we don't need to flush the outer cache (as in the PL310 case
> > > > which is common to all CPUs; other saner outer caches propagate the
> > > > barrier ;). The outer_sync is needed when the memory accesses are
> > > > non-cacheable and we need to drain both the CPU write-buffer and the
> > > > PL310 one.
> > > > 
> > > > For our case here, we only need to ensure the visibility of writes on a
> > > > CPU to another but assuming SMP and caches enabled, so DSB is enough.
> > > 
> > > Hmm, but we *do* use this for boot and need to ensure that any mailboxes are
> > > visible. Maybe we have enough cacheflushing/barriers for that already, but
> > > I'm really uncomfortable making this a simple dsb(ishst).
> > 
> > For boot we explicitly flush the caches for the shared data, so we don't
> > need this. The dsb() here is for standard smp_call_* etc. We didn't have
> > outer_sync() before, so you are slightly changing the functionality here
> > (arguably, ishst is relaxing the requirements but I'm not worried about
> > this, I consider that's the standard use-case for this function).
> 
> Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
> talking to is either (a) coherent in the inner-shareable domain or (b)
> incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
> work?

Because you want to guarantee the ordering between a store to Normal
Cacheable memory vs store to Device for the IPI (see the mailbox example
in the Barrier Litmus section ;)). The second is just a slave access, DMB
guarantees observability from the master access perspective.

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 12:23               ` Catalin Marinas
@ 2014-02-06 13:26                 ` Will Deacon
  2014-02-06 15:20                   ` Catalin Marinas
  0 siblings, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-06 13:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 12:23:40PM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 12:13:50PM +0000, Will Deacon wrote:
> > Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
> > talking to is either (a) coherent in the inner-shareable domain or (b)
> > incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
> > work?
> 
> Because you want to guarantee the ordering between a store to Normal
> Cacheable memory vs store to Device for the IPI (see the mailbox example
> in the Barrier Litmus section ;)). The second is just a slave access, DMB
> guarantees observability from the master access perspective.

Ok, my reasoning is as follows:

  - CPU0 tries to message CPU1. It writes to a location in normal memory,
    then writes to the GICD to send the SGI

  - We need to ensure that CPU1 observes the write to normal memory before
    the write to GICD reaches the distributor. This is *not* about end-point
    ordering (the usual non-coherent DMA example).

  - A dmb ishst ensures that the two writes are observed in order by CPU1
    (and, in fact, the inner-shareable domain containing CPU0).

so the only way this can break is if the GICD write reaches the distributor
before being observed by CPU1 (otherwise, we know the mailbox write was
observed by CPU1). I dread to think how you would build such a beast
(dual-ported GICD with no serialisation to the same locations?)...

Furthermore, if we decide that device writes can reach their endpoints
before being observed by other inner-shareable observers, then doesn't that
pose a potential problem for spinlocks? If I take a lock and write to a
device, the write can hit the device before the lock appears to be taken.
That doesn't sound right to me.

Using a dsb(ishst) will ensure that we don't issue the GICD write until the
mailbox is visible to CPU1, but may be overkill.

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 13:26                 ` Will Deacon
@ 2014-02-06 15:20                   ` Catalin Marinas
  2014-02-07 11:23                     ` Will Deacon
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-06 15:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 12:23:40PM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 12:13:50PM +0000, Will Deacon wrote:
> > > Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
> > > talking to is either (a) coherent in the inner-shareable domain or (b)
> > > incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
> > > work?
> > 
> > Because you want to guarantee the ordering between a store to Normal
> > Cacheable memory vs store to Device for the IPI (see the mailbox example
> > in the Barrier Litmus section ;)). The second is just a slave access, DMB
> > guarantees observability from the master access perspective.
> 
> Ok, my reasoning is as follows:
> 
>   - CPU0 tries to message CPU1. It writes to a location in normal memory,
>     then writes to the GICD to send the SGI
> 
>   - We need to ensure that CPU1 observes the write to normal memory before
>     the write to GICD reaches the distributor. This is *not* about end-point
>     ordering (the usual non-coherent DMA example).
> 
>   - A dmb ishst ensures that the two writes are observed in order by CPU1
>     (and, in fact, the inner-shareable domain containing CPU0).

The last bullet point is not correct. DMB would only guarantee that the
two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
the GICD (observability is defined for master accesses).

> so the only way this can break is if the GICD write reaches the distributor
> before being observed by CPU1 (otherwise, we know the mailbox write was
> observed by CPU1). I dread to think how you would build such a beast
> (dual-ported GICD with no serialisation to the same locations?)...

The above is possible because the CPU1 would never "observe" the GICD
write. It observes a side-effect of that write (interrupt) which is not
covered by DMB. You could argue that CPU1 reads GICC in the interrupt
handler but I'm not sure GICC vs GICD ordering is guaranteed (and I know
of hardware where not even accesses to the same GICD are guaranteed ;)).

> Furthermore, if we decide that device writes can reach their endpoints

Device writes are not observed according to the ARM ARM meaning of
"observability" (master accesses only; well, there is something about
Strongly Ordered memory and devices but it's not the case here and IIRC
refers to when a _device_ observes the write rather than a CPU observing
the write to that device, I need to read it again).

> before being observed by other inner-shareable observers, then doesn't that
> pose a potential problem for spinlocks? If I take a lock and write to a
> device, the write can hit the device before the lock appears to be taken.
> That doesn't sound right to me.

For simplicity, assuming that the lock acquiring was a simple STR, the
write to device can indeed hit the device before the STR is observed by
another CPU or device (that's why writel() has a DSB). This is however
not relevant as you don't care when it hit the device (unless you issue
IPIs). If considering master accesses, the below is valid even though B
could hit the device before A is observed by CPU1:

CPU0:
	STR A, [Normal]
	DMB
	STR B, [Device]

CPU1:
	LDR C, [Device]
	DMB
	LDR D, [Normal]

If on CPU1 C == B then D == A. The key here is that observability of
STR B, [Device] is done via another master access on CPU1 (LDR C,
[Device]) and not just the change to the device state.

When we talk about LDREX/STREX loop, I don't think the write to the
device can be issued before the STREX has been guaranteed to succeed
(not necessarily observed) and therefore the spinlock acquired (we don't
issue writes speculatively and the spinlock loop has a conditional
branch).

If we go to the definition of the STR observability (in short):

  1. A load from a location returns the value written by the observed
     store.
  2. A store to a location changes the value written by the observed
     store.

The "not observability" means _any_ of the above conditions is false. So
going back to your example, the write to the device can hit the device
_and_ the load of the spinlock value on another CPU1 still return the
unlocked value. However, a subsequent STREX on CPU1 would fail and the
LDREX restarted, eventually observing the STREX on CPU0.

If we talk about lock acquiring on another CPU and issuing of device
accesses, the DMB guarantees ordering (master accesses).

Another interesting scenario is the write to device followed by
spin_unlock and on another CPU spin_lock and device write. As per my
example above, I don't see any issue (change Normal with Device).

> Using a dsb(ishst) will ensure that we don't issue the GICD write until the
> mailbox is visible to CPU1, but may be overkill.

In the ARM ARM examples, the mailbox write generates the interrupt (it
is not visible to the other CPU at all).

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-06 15:20                   ` Catalin Marinas
@ 2014-02-07 11:23                     ` Will Deacon
  2014-02-07 12:57                       ` Catalin Marinas
  2014-02-14 16:30                       ` Will Deacon
  0 siblings, 2 replies; 29+ messages in thread
From: Will Deacon @ 2014-02-07 11:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 06, 2014 at 03:20:48PM +0000, Catalin Marinas wrote:
> On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
> > On Thu, Feb 06, 2014 at 12:23:40PM +0000, Catalin Marinas wrote:
> > > On Thu, Feb 06, 2014 at 12:13:50PM +0000, Will Deacon wrote:
> > > > Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
> > > > talking to is either (a) coherent in the inner-shareable domain or (b)
> > > > incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
> > > > work?
> > > 
> > > Because you want to guarantee the ordering between a store to Normal
> > > Cacheable memory vs store to Device for the IPI (see the mailbox example
> > > in the Barrier Litmus section ;)). The second is just a slave access, DMB
> > > guarantees observability from the master access perspective.
> > 
> > Ok, my reasoning is as follows:
> > 
> >   - CPU0 tries to message CPU1. It writes to a location in normal memory,
> >     then writes to the GICD to send the SGI
> > 
> >   - We need to ensure that CPU1 observes the write to normal memory before
> >     the write to GICD reaches the distributor. This is *not* about end-point
> >     ordering (the usual non-coherent DMA example).
> > 
> >   - A dmb ishst ensures that the two writes are observed in order by CPU1
> >     (and, in fact, the inner-shareable domain containing CPU0).
> 
> The last bullet point is not correct. DMB would only guarantee that the
> two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
> the GICD (observability is defined for master accesses).

No, that's not how observability is defined, unfortunately.

Rather than attempt to solve this via email (your examples below are already
getting hard to follow :), how about we sit down with $drink_of_choice and
post back here with our conclusions?

In the meantime, I'll use dsb(ishst) because I think we now agree that
works. The question is whether it can be relaxed further to a dmb.

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-07 11:23                     ` Will Deacon
@ 2014-02-07 12:57                       ` Catalin Marinas
  2014-02-14 16:30                       ` Will Deacon
  1 sibling, 0 replies; 29+ messages in thread
From: Catalin Marinas @ 2014-02-07 12:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Feb 07, 2014 at 11:23:38AM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 03:20:48PM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
> > > On Thu, Feb 06, 2014 at 12:23:40PM +0000, Catalin Marinas wrote:
> > > > On Thu, Feb 06, 2014 at 12:13:50PM +0000, Will Deacon wrote:
> > > > > Ok, so if we assume that a dsb(ishst) is sufficient because the CPU we're
> > > > > talking to is either (a) coherent in the inner-shareable domain or (b)
> > > > > incoherent, and we flushed everything to PoC, then why wouldn't a dmb(ishst)
> > > > > work?
> > > > 
> > > > Because you want to guarantee the ordering between a store to Normal
> > > > Cacheable memory vs store to Device for the IPI (see the mailbox example
> > > > in the Barrier Litmus section ;)). The second is just a slave access, DMB
> > > > guarantees observability from the master access perspective.
> > > 
> > > Ok, my reasoning is as follows:
> > > 
> > >   - CPU0 tries to message CPU1. It writes to a location in normal memory,
> > >     then writes to the GICD to send the SGI
> > > 
> > >   - We need to ensure that CPU1 observes the write to normal memory before
> > >     the write to GICD reaches the distributor. This is *not* about end-point
> > >     ordering (the usual non-coherent DMA example).
> > > 
> > >   - A dmb ishst ensures that the two writes are observed in order by CPU1
> > >     (and, in fact, the inner-shareable domain containing CPU0).
> > 
> > The last bullet point is not correct. DMB would only guarantee that the
> > two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
> > the GICD (observability is defined for master accesses).
> 
> No, that's not how observability is defined, unfortunately.

According to the ARM ARM, "an observer is a master in the system that is
capable of observing memory accesses". GICD is not memory, hence the
assumptions about DMB in this context no longer work.

> Rather than attempt to solve this via email (your examples below are already
> getting hard to follow :), how about we sit down with $drink_of_choice and
> post back here with our conclusions?

Sounds good (though not a wide choice of drinks).

> In the meantime, I'll use dsb(ishst) because I think we now agree that
> works. The question is whether it can be relaxed further to a dmb.

I'm fine with dsb(ishst).

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-07 11:23                     ` Will Deacon
  2014-02-07 12:57                       ` Catalin Marinas
@ 2014-02-14 16:30                       ` Will Deacon
  2014-02-14 16:48                         ` Catalin Marinas
  1 sibling, 1 reply; 29+ messages in thread
From: Will Deacon @ 2014-02-14 16:30 UTC (permalink / raw)
  To: linux-arm-kernel

Well, the results are in (*drum roll*)...

On Fri, Feb 07, 2014 at 11:23:37AM +0000, Will Deacon wrote:
> On Thu, Feb 06, 2014 at 03:20:48PM +0000, Catalin Marinas wrote:
> > On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
> > > Ok, my reasoning is as follows:
> > > 
> > >   - CPU0 tries to message CPU1. It writes to a location in normal memory,
> > >     then writes to the GICD to send the SGI
> > > 
> > >   - We need to ensure that CPU1 observes the write to normal memory before
> > >     the write to GICD reaches the distributor. This is *not* about end-point
> > >     ordering (the usual non-coherent DMA example).
> > > 
> > >   - A dmb ishst ensures that the two writes are observed in order by CPU1
> > >     (and, in fact, the inner-shareable domain containing CPU0).
> > 
> > The last bullet point is not correct. DMB would only guarantee that the
> > two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
> > the GICD (observability is defined for master accesses).
> 
> Rather than attempt to solve this via email (your examples below are already
> getting hard to follow :), how about we sit down with $drink_of_choice and
> post back here with our conclusions?

... and it turns out that a dmb(ishst) is sufficient!

This counter-intuitive result was brought to you by ARM Ltd, purveyors of
fine grained memory barrier instructions.

I'll spin the patch again.

Will

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-14 16:30                       ` Will Deacon
@ 2014-02-14 16:48                         ` Catalin Marinas
  2014-02-14 17:18                           ` Rob Herring
  0 siblings, 1 reply; 29+ messages in thread
From: Catalin Marinas @ 2014-02-14 16:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Feb 14, 2014 at 04:30:39PM +0000, Will Deacon wrote:
> Well, the results are in (*drum roll*)...
> 
> On Fri, Feb 07, 2014 at 11:23:37AM +0000, Will Deacon wrote:
> > On Thu, Feb 06, 2014 at 03:20:48PM +0000, Catalin Marinas wrote:
> > > On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
> > > > Ok, my reasoning is as follows:
> > > > 
> > > >   - CPU0 tries to message CPU1. It writes to a location in normal memory,
> > > >     then writes to the GICD to send the SGI
> > > > 
> > > >   - We need to ensure that CPU1 observes the write to normal memory before
> > > >     the write to GICD reaches the distributor. This is *not* about end-point
> > > >     ordering (the usual non-coherent DMA example).
> > > > 
> > > >   - A dmb ishst ensures that the two writes are observed in order by CPU1
> > > >     (and, in fact, the inner-shareable domain containing CPU0).
> > > 
> > > The last bullet point is not correct. DMB would only guarantee that the
> > > two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
> > > the GICD (observability is defined for master accesses).
> > 
> > Rather than attempt to solve this via email (your examples below are already
> > getting hard to follow :), how about we sit down with $drink_of_choice and
> > post back here with our conclusions?
> 
> ... and it turns out that a dmb(ishst) is sufficient!

Until we hear otherwise ;)

-- 
Catalin

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed
  2014-02-14 16:48                         ` Catalin Marinas
@ 2014-02-14 17:18                           ` Rob Herring
  0 siblings, 0 replies; 29+ messages in thread
From: Rob Herring @ 2014-02-14 17:18 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Feb 14, 2014 at 10:48 AM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Fri, Feb 14, 2014 at 04:30:39PM +0000, Will Deacon wrote:
>> Well, the results are in (*drum roll*)...
>>
>> On Fri, Feb 07, 2014 at 11:23:37AM +0000, Will Deacon wrote:
>> > On Thu, Feb 06, 2014 at 03:20:48PM +0000, Catalin Marinas wrote:
>> > > On Thu, Feb 06, 2014 at 01:26:44PM +0000, Will Deacon wrote:
>> > > > Ok, my reasoning is as follows:
>> > > >
>> > > >   - CPU0 tries to message CPU1. It writes to a location in normal memory,
>> > > >     then writes to the GICD to send the SGI
>> > > >
>> > > >   - We need to ensure that CPU1 observes the write to normal memory before
>> > > >     the write to GICD reaches the distributor. This is *not* about end-point
>> > > >     ordering (the usual non-coherent DMA example).
>> > > >
>> > > >   - A dmb ishst ensures that the two writes are observed in order by CPU1
>> > > >     (and, in fact, the inner-shareable domain containing CPU0).
>> > >
>> > > The last bullet point is not correct. DMB would only guarantee that the
>> > > two writes (memory and GICD) are observed by CPU1 if CPU1 actually read
>> > > the GICD (observability is defined for master accesses).
>> >
>> > Rather than attempt to solve this via email (your examples below are already
>> > getting hard to follow :), how about we sit down with $drink_of_choice and
>> > post back here with our conclusions?
>>
>> ... and it turns out that a dmb(ishst) is sufficient!
>
> Until we hear otherwise ;)

It is a shame that all this thought and effort will just end with
barrier related chicken bits getting set anyway. Maybe the
$drink_of_choice was worth it. ;)

Rob

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2014-02-14 17:18 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-06 11:30 [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Will Deacon
2014-02-06 11:30 ` [PATCH 2/6] arm64: barriers: make use of barrier options with explicit barriers Will Deacon
2014-02-06 11:41   ` Catalin Marinas
2014-02-06 11:45     ` Will Deacon
2014-02-06 11:49       ` Catalin Marinas
2014-02-06 11:52         ` Will Deacon
2014-02-06 11:30 ` [PATCH 3/6] irqchip: gic: use writel instead of dsb + writel_relaxed Will Deacon
2014-02-06 11:39   ` Marc Zyngier
2014-02-06 11:45   ` Catalin Marinas
2014-02-06 11:51     ` Will Deacon
2014-02-06 11:54       ` Catalin Marinas
2014-02-06 11:57         ` Will Deacon
2014-02-06 12:00           ` Catalin Marinas
2014-02-06 12:13             ` Will Deacon
2014-02-06 12:23               ` Catalin Marinas
2014-02-06 13:26                 ` Will Deacon
2014-02-06 15:20                   ` Catalin Marinas
2014-02-07 11:23                     ` Will Deacon
2014-02-07 12:57                       ` Catalin Marinas
2014-02-14 16:30                       ` Will Deacon
2014-02-14 16:48                         ` Catalin Marinas
2014-02-14 17:18                           ` Rob Herring
2014-02-06 11:30 ` [PATCH 4/6] iommu/arm-smmu: provide option to dsb macro when publishing tables Will Deacon
2014-02-06 11:51   ` Catalin Marinas
2014-02-06 11:30 ` [PATCH 5/6] arm64: barriers: wire up new barrier options Will Deacon
2014-02-06 11:55   ` Catalin Marinas
2014-02-06 11:30 ` [PATCH 6/6] arm64: barriers: use barrier() instead of smp_mb() when !SMP Will Deacon
2014-02-06 11:56   ` Catalin Marinas
2014-02-06 11:39 ` [PATCH 1/6] arm64: barriers: allow dsb macro to take option parameter Catalin Marinas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.