dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug
@ 2021-03-21  8:28 Qu Huang
  2021-03-23 14:56 ` Alex Deucher
  0 siblings, 1 reply; 3+ messages in thread
From: Qu Huang @ 2021-03-21  8:28 UTC (permalink / raw)
  To: Felix.Kuehling, alexander.deucher, christian.koenig, airlied, daniel
  Cc: jinsdb, dri-devel, amd-gfx, linux-kernel

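Reading the debugfs hang_hws file (e.g. "cat hang_hws") crashes the
system: the file is created with NULL in place of a show callback, so
the seq_file read path ends up calling a NULL function pointer. Add a
show callback that prints a short usage hint instead.

Here is the system crash log: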
[ 1272.884438] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 1272.884444] IP: [<          (null)>]           (null)
[ 1272.884447] PGD 825b09067 PUD 8267c8067 PMD 0
[ 1272.884452] Oops: 0010 [#1] SMP
[ 1272.884509] CPU: 13 PID: 3485 Comm: cat Kdump: loaded Tainted: G
[ 1272.884515] task: ffff9a38dbd4d140 ti: ffff9a37cd3b8000 task.ti: ffff9a37cd3b8000
[ 1272.884517] RIP: 0010:[<0000000000000000>]  [<          (null)>] (null)
[ 1272.884520] RSP: 0018:ffff9a37cd3bbe68  EFLAGS: 00010203
[ 1272.884522] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000014d5f
[ 1272.884524] RDX: fffffffffffffff4 RSI: 0000000000000001 RDI: ffff9a38aca4d200
[ 1272.884526] RBP: ffff9a37cd3bbed0 R08: ffff9a38dcd5f1a0 R09: ffff9a31ffc07300
[ 1272.884527] R10: ffff9a31ffc07300 R11: ffffffffaddd5e9d R12: ffff9a38b4e0fb00
[ 1272.884529] R13: 0000000000000001 R14: ffff9a37cd3bbf18 R15: ffff9a38aca4d200
[ 1272.884532] FS:  00007feccaa67740(0000) GS:ffff9a38dcd40000(0000) knlGS:0000000000000000
[ 1272.884534] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1272.884536] CR2: 0000000000000000 CR3: 00000008267c0000 CR4: 00000000003407e0
[ 1272.884537] Call Trace:
[ 1272.884544]  [<ffffffffade68940>] ? seq_read+0x130/0x440
[ 1272.884548]  [<ffffffffade40f8f>] vfs_read+0x9f/0x170
[ 1272.884552]  [<ffffffffade41e4f>] SyS_read+0x7f/0xf0
[ 1272.884557]  [<ffffffffae374ddb>] system_call_fastpath+0x22/0x27
[ 1272.884558] Code:  Bad RIP value.
[ 1272.884562] RIP  [<          (null)>]           (null)
[ 1272.884564]  RSP <ffff9a37cd3bbe68>
[ 1272.884566] CR2: 0000000000000000

Signed-off-by: Qu Huang <jinsdb@126.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 511712c..673d5e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -33,6 +33,11 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file)

 	return single_open(file, show, NULL);
 }
+static int kfd_debugfs_hang_hws_read(struct seq_file *m, void *data)
+{
+	seq_printf(m, "echo gpu_id > hang_hws\n");
+	return 0;
+}

 static ssize_t kfd_debugfs_hang_hws_write(struct file *file,
 	const char __user *user_buf, size_t size, loff_t *ppos)
@@ -94,7 +99,7 @@ void kfd_debugfs_init(void)
 	debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
 			    kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
 	debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
-			    NULL, &kfd_debugfs_hang_hws_fops);
+			    kfd_debugfs_hang_hws_read, &kfd_debugfs_hang_hws_fops);
 }

 void kfd_debugfs_fini(void)
--
1.8.3.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug
  2021-03-21  8:28 [PATCH] drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug Qu Huang
@ 2021-03-23 14:56 ` Alex Deucher
  2021-03-26  2:46   ` Felix Kuehling
  0 siblings, 1 reply; 3+ messages in thread
From: Alex Deucher @ 2021-03-23 14:56 UTC (permalink / raw)
  To: Qu Huang
  Cc: Dave Airlie, Kuehling, Felix, LKML, Mailing list - DRI developers,
	amd-gfx list, Deucher, Alexander, Christian Koenig

Applied.  Thanks!

Alex

On Sun, Mar 21, 2021 at 5:33 AM Qu Huang <jinsdb@126.com> wrote:
>
> Here is the system crash log:
> [ 1272.884438] BUG: unable to handle kernel NULL pointer dereference at
> (null)
> [ 1272.884444] IP: [<          (null)>]           (null)
> [ 1272.884447] PGD 825b09067 PUD 8267c8067 PMD 0
> [ 1272.884452] Oops: 0010 [#1] SMP
> [ 1272.884509] CPU: 13 PID: 3485 Comm: cat Kdump: loaded Tainted: G
> [ 1272.884515] task: ffff9a38dbd4d140 ti: ffff9a37cd3b8000 task.ti:
> ffff9a37cd3b8000
> [ 1272.884517] RIP: 0010:[<0000000000000000>]  [<          (null)>]
> (null)
> [ 1272.884520] RSP: 0018:ffff9a37cd3bbe68  EFLAGS: 00010203
> [ 1272.884522] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
> 0000000000014d5f
> [ 1272.884524] RDX: fffffffffffffff4 RSI: 0000000000000001 RDI:
> ffff9a38aca4d200
> [ 1272.884526] RBP: ffff9a37cd3bbed0 R08: ffff9a38dcd5f1a0 R09:
> ffff9a31ffc07300
> [ 1272.884527] R10: ffff9a31ffc07300 R11: ffffffffaddd5e9d R12:
> ffff9a38b4e0fb00
> [ 1272.884529] R13: 0000000000000001 R14: ffff9a37cd3bbf18 R15:
> ffff9a38aca4d200
> [ 1272.884532] FS:  00007feccaa67740(0000) GS:ffff9a38dcd40000(0000)
> knlGS:0000000000000000
> [ 1272.884534] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1272.884536] CR2: 0000000000000000 CR3: 00000008267c0000 CR4:
> 00000000003407e0
> [ 1272.884537] Call Trace:
> [ 1272.884544]  [<ffffffffade68940>] ? seq_read+0x130/0x440
> [ 1272.884548]  [<ffffffffade40f8f>] vfs_read+0x9f/0x170
> [ 1272.884552]  [<ffffffffade41e4f>] SyS_read+0x7f/0xf0
> [ 1272.884557]  [<ffffffffae374ddb>] system_call_fastpath+0x22/0x27
> [ 1272.884558] Code:  Bad RIP value.
> [ 1272.884562] RIP  [<          (null)>]           (null)
> [ 1272.884564]  RSP <ffff9a37cd3bbe68>
> [ 1272.884566] CR2: 0000000000000000
>
> Signed-off-by: Qu Huang <jinsdb@126.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> index 511712c..673d5e3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
> @@ -33,6 +33,11 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file)
>
>         return single_open(file, show, NULL);
>  }
> +static int kfd_debugfs_hang_hws_read(struct seq_file *m, void *data)
> +{
> +       seq_printf(m, "echo gpu_id > hang_hws\n");
> +       return 0;
> +}
>
>  static ssize_t kfd_debugfs_hang_hws_write(struct file *file,
>         const char __user *user_buf, size_t size, loff_t *ppos)
> @@ -94,7 +99,7 @@ void kfd_debugfs_init(void)
>         debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
>                             kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
>         debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
> -                           NULL, &kfd_debugfs_hang_hws_fops);
> +                           kfd_debugfs_hang_hws_read, &kfd_debugfs_hang_hws_fops);
>  }
>
>  void kfd_debugfs_fini(void)
> --
> 1.8.3.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug
  2021-03-23 14:56 ` Alex Deucher
@ 2021-03-26  2:46   ` Felix Kuehling
  0 siblings, 0 replies; 3+ messages in thread
From: Felix Kuehling @ 2021-03-26  2:46 UTC (permalink / raw)
  To: Alex Deucher, Qu Huang
  Cc: Dave Airlie, LKML, Mailing list - DRI developers, amd-gfx list,
	Deucher, Alexander, Christian Koenig


Am 2021-03-23 um 10:56 a.m. schrieb Alex Deucher:
> Applied.  Thanks!

Thanks. I thought we fixed this before by making the file write-only.
But I guess that's not sufficient to stop root from reading it:

commit 2bdac179e217a0c0b548a8c60524977586621b19
Author: Felix Kuehling <Felix.Kuehling@amd.com>
Date:   Thu Dec 19 22:36:55 2019 -0500

    drm/amdkfd: Fix permissions of hang_hws
    
    Reading from /sys/kernel/debug/kfd/hang_hws would cause a kernel
    oops because we didn't implement a read callback. Set the permission
    to write-only to prevent that.
    
    Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
    Reviewed-by: shaoyunl  <shaoyun.liu@amd.com>
    Signed-off-by: Alex Deucher <alexander.deucher@amd.com>


Now that we have a sensible message in the file, I guess we should
officially make it readable again.

Regards,
  Felix

>
> Alex
>
> On Sun, Mar 21, 2021 at 5:33 AM Qu Huang <jinsdb@126.com> wrote:
>> Here is the system crash log:
>> [ 1272.884438] BUG: unable to handle kernel NULL pointer dereference at
>> (null)
>> [ 1272.884444] IP: [<          (null)>]           (null)
>> [ 1272.884447] PGD 825b09067 PUD 8267c8067 PMD 0
>> [ 1272.884452] Oops: 0010 [#1] SMP
>> [ 1272.884509] CPU: 13 PID: 3485 Comm: cat Kdump: loaded Tainted: G
>> [ 1272.884515] task: ffff9a38dbd4d140 ti: ffff9a37cd3b8000 task.ti:
>> ffff9a37cd3b8000
>> [ 1272.884517] RIP: 0010:[<0000000000000000>]  [<          (null)>]
>> (null)
>> [ 1272.884520] RSP: 0018:ffff9a37cd3bbe68  EFLAGS: 00010203
>> [ 1272.884522] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> 0000000000014d5f
>> [ 1272.884524] RDX: fffffffffffffff4 RSI: 0000000000000001 RDI:
>> ffff9a38aca4d200
>> [ 1272.884526] RBP: ffff9a37cd3bbed0 R08: ffff9a38dcd5f1a0 R09:
>> ffff9a31ffc07300
>> [ 1272.884527] R10: ffff9a31ffc07300 R11: ffffffffaddd5e9d R12:
>> ffff9a38b4e0fb00
>> [ 1272.884529] R13: 0000000000000001 R14: ffff9a37cd3bbf18 R15:
>> ffff9a38aca4d200
>> [ 1272.884532] FS:  00007feccaa67740(0000) GS:ffff9a38dcd40000(0000)
>> knlGS:0000000000000000
>> [ 1272.884534] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 1272.884536] CR2: 0000000000000000 CR3: 00000008267c0000 CR4:
>> 00000000003407e0
>> [ 1272.884537] Call Trace:
>> [ 1272.884544]  [<ffffffffade68940>] ? seq_read+0x130/0x440
>> [ 1272.884548]  [<ffffffffade40f8f>] vfs_read+0x9f/0x170
>> [ 1272.884552]  [<ffffffffade41e4f>] SyS_read+0x7f/0xf0
>> [ 1272.884557]  [<ffffffffae374ddb>] system_call_fastpath+0x22/0x27
>> [ 1272.884558] Code:  Bad RIP value.
>> [ 1272.884562] RIP  [<          (null)>]           (null)
>> [ 1272.884564]  RSP <ffff9a37cd3bbe68>
>> [ 1272.884566] CR2: 0000000000000000
>>
>> Signed-off-by: Qu Huang <jinsdb@126.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 7 ++++++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
>> index 511712c..673d5e3 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
>> @@ -33,6 +33,11 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file)
>>
>>         return single_open(file, show, NULL);
>>  }
>> +static int kfd_debugfs_hang_hws_read(struct seq_file *m, void *data)
>> +{
>> +       seq_printf(m, "echo gpu_id > hang_hws\n");
>> +       return 0;
>> +}
>>
>>  static ssize_t kfd_debugfs_hang_hws_write(struct file *file,
>>         const char __user *user_buf, size_t size, loff_t *ppos)
>> @@ -94,7 +99,7 @@ void kfd_debugfs_init(void)
>>         debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
>>                             kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
>>         debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
>> -                           NULL, &kfd_debugfs_hang_hws_fops);
>> +                           kfd_debugfs_hang_hws_read, &kfd_debugfs_hang_hws_fops);
>>  }
>>
>>  void kfd_debugfs_fini(void)
>> --
>> 1.8.3.1
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-03-26  2:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-21  8:28 [PATCH] drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug Qu Huang
2021-03-23 14:56 ` Alex Deucher
2021-03-26  2:46   ` Felix Kuehling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).