arm32: fix flushcache syscall with device address
diff mbox series

Message ID 1587456514-61156-1-git-send-email-tiantao6@hisilicon.com
State New
Headers show
Series
  • arm32: fix flushcache syscall with device address
Related show

Commit Message

Tian Tao April 21, 2020, 8:08 a.m. UTC
An issue has been observed on our Kungpeng916 systems when using a PCI
express GPU. This occurs when a 32 bit application running on a 64 bit
kernel issues a cache flush operation to a memory address that is in
a PCI BAR of the GPU.The results in an illegal operation and
subsequent crash.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Lixin Chen <chenlixin1@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
---
 arch/arm64/kernel/sys_compat.c | 69 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

Comments

Will Deacon April 21, 2020, 8:12 a.m. UTC | #1
On Tue, Apr 21, 2020 at 04:08:34PM +0800, Tian Tao wrote:
> An issue has been observed on our Kungpeng916 systems when using a PCI
> express GPU. This occurs when a 32 bit application running on a 64 bit
> kernel issues a cache flush operation to a memory address that is in
> a PCI BAR of the GPU.The results in an illegal operation and
> subsequent crash.

A kernel crash? If so, please can you include the log here?

Will
Jonathan Cameron April 21, 2020, 11:16 a.m. UTC | #2
On Tue, 21 Apr 2020 09:12:39 +0100
Will Deacon <will@kernel.org> wrote:

> On Tue, Apr 21, 2020 at 04:08:34PM +0800, Tian Tao wrote:
> > An issue has been observed on our Kungpeng916 systems when using a PCI
> > express GPU. This occurs when a 32 bit application running on a 64 bit
> > kernel issues a cache flush operation to a memory address that is in
> > a PCI BAR of the GPU.The results in an illegal operation and
> > subsequent crash.  
> 
> A kernel crash? If so, please can you include the log here?

Deploying my finest copy typing from the image Tian Tao sent out

      KERNEL: /root/vmlinux-4.19.36-3patch-00228-debuginfo
    DUMPFILE: vmcore [PARTIAL DUMP]
        CPUS: 64
        DATE: Fri Mar 20 06:59:56 2020
      UPTIME: 07:01:01
LOAD AVERAGE: 33.76, 35.45, 35.79
       TASKS: 59447
    NODENAME: cpus-new-ondemand-0509
     RELEASE: 4.19.36-3patch-0228
     VERSION: #4 SMP Fri Feb 28 15:18:51 UTC 2020
     MACHINE: aarch64 (unknown MHz)
      MEMORY: 255.7 GB
       PANIC: "kernel panic - not syncing: Asynchronous SError Interrupt"
         PID: 175108
     COMMAND: "UnityMain"
        TASK: ffff80a96999dd00 [THREAD_INFO: ffff80a96999dd00]
         CPU: 62
       STATE: TASK_RUNNING (PANIC)

crash> bt
PID: 175108 TASK: ffff80a96999dd00 CPU: 62 COMMAND: "UnityMain"
  #0 [ffff000194e1b920] machine_kexec at ffff0000080a265c
  #1 [ffff000194e1b980] __crash_kexec at ffff0000081b3ba8
  #2 [ffff000194e1bb10] panic at ffff0000080ecc98
  #3 [ffff000194e1bbf0] nmi_panic at ffff0000080ec7f4
  #4 [ffff000194e1bc10] arm64_serror_panic at fff00000809019c
  #5 [ffff000194e1bc30] do_serror at ffff00000809039c
  #6 [ffff000194e1bd90] el1_error at ffff000008083e50
  #7 [ffff000194e1bda0] __flush_icache_range at ffff0000080a9ec4
  #8 [ffff000194e1be60] el0_svc_common at fff0000080977d8
  #9 [ffff000194e1bea0] el0_svc_compat_handler at ffff0000080979b4
 #10 [ffff000194e1bff0] el0_svc_compat at ffff0000008083874

     PC: c90fe7f8  LR: c90ff09c  SP: d2afa8e0  PSTATE: 800b0010
    X12: c56e96e4 X11: d2afaa48 X10: d0ff1000  X9: d2afab68
     x8: 000000d6  X7: 000f0002  X6: d3c61840  X5: d3c61001
     X4: d3c03000  X3: 0004d54a  x2: 00000000  x1: d3c61040
     X0: d3c61000


New advanced test for Mavis Beacon teaches typing.

In summary this is all we have to hand...

> 
> Will
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Russell King - ARM Linux admin April 21, 2020, 12:22 p.m. UTC | #3
On Tue, Apr 21, 2020 at 04:08:34PM +0800, Tian Tao wrote:
> An issue has been observed on our Kungpeng916 systems when using a PCI
> express GPU. This occurs when a 32 bit application running on a 64 bit
> kernel issues a cache flush operation to a memory address that is in
> a PCI BAR of the GPU.The results in an illegal operation and
> subsequent crash.

The arm32 cacheflush interface is *NOT* for syncing memory for accesses
performed by another device.  This can't be said strongly enough.

It is _only_ to support synchronisation of the I/D caches for
applications that need to "write their own code", i.o.w.
self-modification.

Your use of this interface is therefore incorrect.

If you have the privileges of being able to map PCI BARs, then you
already have privileged access to the system.

Please stop pointing the metaphorical gun at your foot.

> 
> Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
> Signed-off-by: Lixin Chen <chenlixin1@huawei.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> ---
>  arch/arm64/kernel/sys_compat.c | 69 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 69 insertions(+)
> 
> diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
> index 3c18c24..6c07944 100644
> --- a/arch/arm64/kernel/sys_compat.c
> +++ b/arch/arm64/kernel/sys_compat.c
> @@ -15,12 +15,74 @@
>  #include <linux/slab.h>
>  #include <linux/syscalls.h>
>  #include <linux/uaccess.h>
> +#include <linux/hugetlb.h>
>  
>  #include <asm/cacheflush.h>
>  #include <asm/system_misc.h>
>  #include <asm/tlbflush.h>
>  #include <asm/unistd.h>
>  
> +static long __check_pt_cacheable(unsigned long vaddr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pudval_t pudval;
> +	pmd_t *pmd;
> +	pmdval_t pmdval;
> +	pte_t *pte;
> +	pteval_t pteval;
> +	pgprot_t pgprot;
> +
> +	spin_lock(&mm->page_table_lock);
> +	pgd = pgd_offset(mm, vaddr);
> +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> +		goto no_page;
> +
> +	p4d = p4d_offset(pgd, vaddr);
> +	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
> +		goto no_page;
> +
> +	pud = pud_offset(p4d, vaddr);
> +	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> +		goto no_page;
> +	if (pud_huge(*pud)) {
> +		pudval = pud_val(*pud);
> +		pgprot = __pgprot(pudval);
> +		goto out;
> +	}
> +
> +	pmd = pmd_offset(pud, vaddr);
> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> +		goto no_page;
> +	if (pmd_huge(*pmd)) {
> +		pmdval = pmd_val(*pmd);
> +		pgprot = __pgprot(pmdval);
> +		goto out;
> +	}
> +
> +	pte = pte_offset_map(pmd, vaddr);
> +	if (!pte_present(*pte) || pte_none(*pte))
> +		goto no_page;
> +	pteval = pte_val(*pte);
> +	pgprot = __pgprot(pteval);
> +
> +out:
> +	pgprot.pgprot &= PTE_ATTRINDX_MASK;
> +	if (pgprot.pgprot != PTE_ATTRINDX(MT_NORMAL)) {
> +		pr_debug("non-cache page pgprot value=0x%llx.\n",
> +			pgprot.pgprot);
> +		goto no_page;
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +	return 1;
> +
> +no_page:
> +	spin_unlock(&mm->page_table_lock);
> +	return 0;
> +}
> +
>  static long
>  __do_compat_cache_op(unsigned long start, unsigned long end)
>  {
> @@ -32,6 +94,13 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
>  		if (fatal_signal_pending(current))
>  			return 0;
>  
> +		 /* do not flush page table is non-cacheable */
> +		if (!__check_pt_cacheable(start)) {
> +			cond_resched();
> +			start += chunk;
> +			continue;
> +		}
> +
>  		if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
>  			/*
>  			 * The workaround requires an inner-shareable tlbi.
> -- 
> 2.7.4
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
James Morse April 21, 2020, 4:50 p.m. UTC | #4
Hi guys,

(Subject Nit: arm64, as that is what your patch modifies)

On 21/04/2020 12:16, Jonathan Cameron wrote:
> On Tue, 21 Apr 2020 09:12:39 +0100
> Will Deacon <will@kernel.org> wrote:
> 
>> On Tue, Apr 21, 2020 at 04:08:34PM +0800, Tian Tao wrote:
>>> An issue has been observed on our Kungpeng916 systems when using a PCI
>>> express GPU. This occurs when a 32 bit application running on a 64 bit
>>> kernel issues a cache flush operation to a memory address that is in
>>> a PCI BAR of the GPU.The results in an illegal operation and
>>> subsequent crash.  
>>
>> A kernel crash? If so, please can you include the log here?
> 
> Deploying my finest copy typing from the image Tian Tao sent out
> 
>       KERNEL: /root/vmlinux-4.19.36-3patch-00228-debuginfo
>     DUMPFILE: vmcore [PARTIAL DUMP]
>         CPUS: 64
>         DATE: Fri Mar 20 06:59:56 2020
>       UPTIME: 07:01:01
> LOAD AVERAGE: 33.76, 35.45, 35.79
>        TASKS: 59447
>     NODENAME: cpus-new-ondemand-0509
>      RELEASE: 4.19.36-3patch-0228
>      VERSION: #4 SMP Fri Feb 28 15:18:51 UTC 2020
>      MACHINE: aarch64 (unknown MHz)
>       MEMORY: 255.7 GB
>        PANIC: "kernel panic - not syncing: Asynchronous SError Interrupt"
>          PID: 175108
>      COMMAND: "UnityMain"
>         TASK: ffff80a96999dd00 [THREAD_INFO: ffff80a96999dd00]
>          CPU: 62
>        STATE: TASK_RUNNING (PANIC)
> 
> crash> bt
> PID: 175108 TASK: ffff80a96999dd00 CPU: 62 COMMAND: "UnityMain"
>   #0 [ffff000194e1b920] machine_kexec at ffff0000080a265c
>   #1 [ffff000194e1b980] __crash_kexec at ffff0000081b3ba8
>   #2 [ffff000194e1bb10] panic at ffff0000080ecc98
>   #3 [ffff000194e1bbf0] nmi_panic at ffff0000080ec7f4
>   #4 [ffff000194e1bc10] arm64_serror_panic at fff00000809019c
>   #5 [ffff000194e1bc30] do_serror at ffff00000809039c
>   #6 [ffff000194e1bd90] el1_error at ffff000008083e50
>   #7 [ffff000194e1bda0] __flush_icache_range at ffff0000080a9ec4
>   #8 [ffff000194e1be60] el0_svc_common at fff0000080977d8
>   #9 [ffff000194e1bea0] el0_svc_compat_handler at ffff0000080979b4
>  #10 [ffff000194e1bff0] el0_svc_compat at ffff0000008083874
> 
>      PC: c90fe7f8  LR: c90ff09c  SP: d2afa8e0  PSTATE: 800b0010
>     X12: c56e96e4 X11: d2afaa48 X10: d0ff1000  X9: d2afab68
>      x8: 000000d6  X7: 000f0002  X6: d3c61840  X5: d3c61001
>      X4: d3c03000  X3: 0004d54a  x2: 00000000  x1: d3c61040
>      X0: d3c61000
> 
> 
> New advanced test for Mavis Beacon teaches typing.

Thanks for doing that!


> In summary this is all we have to hand...


Jumping back to the patch:
On 21/04/2020 09:08, Tian Tao wrote:> diff --git a/arch/arm64/kernel/sys_compat.c
b/arch/arm64/kernel/sys_compat.c
> index 3c18c24..6c07944 100644
> --- a/arch/arm64/kernel/sys_compat.c
> +++ b/arch/arm64/kernel/sys_compat.c
> @@ -32,6 +94,13 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
>  		if (fatal_signal_pending(current))
>  			return 0;
>
> +		 /* do not flush page table is non-cacheable */
> +		if (!__check_pt_cacheable(start)) {
> +			cond_resched();
> +			start += chunk;
> +			continue;
> +		}

The Arm-arm expects this to work, so we can't just skip it!

D4.4.8's "Effects of instructions that operate by VA to the PoC" has:
| For Device memory and Normal memory that is Inner Non-cacheable, Outer Non-cacheable,
| these instructions must affect the caches of all PEs in the Outer Shareable shareability
| domain of the PE on which the instruction is operating.


What does aarch64 user-space do in the same situation? Surely that also goes down in
flames too!?

I think the real problem here is you've given this kind of mapping to user-space. If the
device behind the mapping can respond like this, you must trust user-space not to poke it
inappropriately.

If its not got all the properties of memory: please don't pretend its memory.
If its a device, it probably needs to be carefully managed by a dedicated driver.


Do you know where the abort comes from and why? (The interconnect, PCIE-root-complex, or
from the GPU itself?)
Is it the wrong attributes? Too large an eviction from the cache? Wrong alignment for this
BAR? It can't handle cache-maintenance?


Thanks,

James
Russell King - ARM Linux admin April 21, 2020, 4:55 p.m. UTC | #5
On Tue, Apr 21, 2020 at 05:50:22PM +0100, James Morse wrote:
> Hi guys,
> 
> (Subject Nit: arm64, as that is what your patch modifies)

That is irrelevent.  This is a compatibility interface which is supposed
to reflect the arm32 implementation.  Augmenting a compatibility
interface to do more than what it's counterpart that it's supposed to
be compatible with is senseless.

The API concerned is an ARM32 API which is expected to only be used
for ensuring I/D cache coherency, it is not for DMA.

Augmenting it on ARM64 for DMA is senseless.
Catalin Marinas April 22, 2020, 1:56 p.m. UTC | #6
On Tue, Apr 21, 2020 at 05:55:16PM +0100, Russell King wrote:
> On Tue, Apr 21, 2020 at 05:50:22PM +0100, James Morse wrote:
> > (Subject Nit: arm64, as that is what your patch modifies)
> 
> That is irrelevent.  This is a compatibility interface which is supposed
> to reflect the arm32 implementation.  Augmenting a compatibility
> interface to do more than what it's counterpart that it's supposed to
> be compatible with is senseless.
> 
> The API concerned is an ARM32 API which is expected to only be used
> for ensuring I/D cache coherency, it is not for DMA.
> 
> Augmenting it on ARM64 for DMA is senseless.

I fully agree. I don't see any valid reason why this needs to be fixed.
It looks like some broken user process trying to do cache maintenance to
PoU on the mapped PCIe BAR (either inadvertently or for the wrong
reasons).

Patch
diff mbox series

diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 3c18c24..6c07944 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -15,12 +15,74 @@ 
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/hugetlb.h>
 
 #include <asm/cacheflush.h>
 #include <asm/system_misc.h>
 #include <asm/tlbflush.h>
 #include <asm/unistd.h>
 
+static long __check_pt_cacheable(unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pudval_t pudval;
+	pmd_t *pmd;
+	pmdval_t pmdval;
+	pte_t *pte;
+	pteval_t pteval;
+	pgprot_t pgprot;
+
+	spin_lock(&mm->page_table_lock);
+	pgd = pgd_offset(mm, vaddr);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page;
+
+	p4d = p4d_offset(pgd, vaddr);
+	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+		goto no_page;
+
+	pud = pud_offset(p4d, vaddr);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page;
+	if (pud_huge(*pud)) {
+		pudval = pud_val(*pud);
+		pgprot = __pgprot(pudval);
+		goto out;
+	}
+
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page;
+	if (pmd_huge(*pmd)) {
+		pmdval = pmd_val(*pmd);
+		pgprot = __pgprot(pmdval);
+		goto out;
+	}
+
+	pte = pte_offset_map(pmd, vaddr);
+	if (!pte_present(*pte) || pte_none(*pte))
+		goto no_page;
+	pteval = pte_val(*pte);
+	pgprot = __pgprot(pteval);
+
+out:
+	pgprot.pgprot &= PTE_ATTRINDX_MASK;
+	if (pgprot.pgprot != PTE_ATTRINDX(MT_NORMAL)) {
+		pr_debug("non-cache page pgprot value=0x%llx.\n",
+			pgprot.pgprot);
+		goto no_page;
+	}
+	spin_unlock(&mm->page_table_lock);
+	return 1;
+
+no_page:
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
 static long
 __do_compat_cache_op(unsigned long start, unsigned long end)
 {
@@ -32,6 +94,13 @@  __do_compat_cache_op(unsigned long start, unsigned long end)
 		if (fatal_signal_pending(current))
 			return 0;
 
+		 /* do not flush page table is non-cacheable */
+		if (!__check_pt_cacheable(start)) {
+			cond_resched();
+			start += chunk;
+			continue;
+		}
+
 		if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
 			/*
 			 * The workaround requires an inner-shareable tlbi.