All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
@ 2020-05-14  9:18 jianzh
  2020-05-14 10:25 ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: jianzh @ 2020-05-14  9:18 UTC (permalink / raw)
  To: amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

From: Jiange Zhao <Jiange.Zhao@amd.com>

When the GPU gets a timeout, it would notify an interested party
of an opportunity to dump info before actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
	(1)Only 1 user can open this file node
	(2)wait_dump() can only take effect after poll() executed.
	(3)eliminated the race condition between release() and
	   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
    removed state checking in amdgpu_debugfs_wait_dump
    Improve on top of version 3 so that the node can be reopened.

v7: move reinit_completion into open() so that only one user
    can open it.

v8: remove complete_all() from amdgpu_debugfs_wait_dump().

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
 	char				product_number[16];
 	char				product_name[32];
 	char				serial[16];
+
+	struct amdgpu_autodump		autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..d33cb344be69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>
 
 #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+	unsigned long timeout = 600 * HZ;
+	int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	if (ret == 0) {
+		pr_err("autodump: timeout, move on to gpu recovery\n");
+		return -ETIMEDOUT;
+	}
+#endif
+	return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = inode->i_private;
+	int ret;
+
+	file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done) {
+		reinit_completion(&adev->autodump.dumping);
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+	return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+	return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+	if (adev->in_gpu_reset)
+		return POLLIN | POLLRDNORM | POLLWRNORM;
+
+	return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = amdgpu_debugfs_autodump_open,
+	.poll = amdgpu_debugfs_autodump_poll,
+	.release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	complete_all(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+		adev->ddev->primary->debugfs_root,
+		adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_debugfs_create_all(adev);
 
+	amdgpu_debugfs_autodump_init(adev);
+
 	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
 					ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..2803884d338d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
 	unsigned		num_files;
 };
 
+struct amdgpu_autodump {
+	struct completion		dumping;
+	struct wait_queue_head		gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc41e8f5ad14..545beebcf43e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14  9:18 [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4 jianzh
@ 2020-05-14 10:25 ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-05-14 10:25 UTC (permalink / raw)
  To: jianzh, amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

Am 14.05.20 um 11:18 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When the GPU gets a timeout, it would notify an interested party
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
> 	(1)Only 1 user can open this file node
> 	(2)wait_dump() can only take effect after poll() executed.
> 	(3)eliminated the race condition between release() and
> 	   wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>      can open it.
>
> v8: remove complete_all() from amdgpu_debugfs_wait_dump().
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>   	char				product_number[16];
>   	char				product_name[32];
>   	char				serial[16];
> +
> +	struct amdgpu_autodump		autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..d33cb344be69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>   
>   #include "amdgpu.h"
> @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +	unsigned long timeout = 600 * HZ;
> +	int ret;
> +
> +	wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +	if (ret == 0) {
> +		pr_err("autodump: timeout, move on to gpu recovery\n");
> +		return -ETIMEDOUT;
> +	}
> +#endif
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = inode->i_private;
> +	int ret;
> +
> +	file->private_data = adev;
> +
> +	mutex_lock(&adev->lock_reset);
> +	if (adev->autodump.dumping.done) {
> +		reinit_completion(&adev->autodump.dumping);
> +		ret = 0;
> +	} else {
> +		ret = -EBUSY;
> +	}
> +	mutex_unlock(&adev->lock_reset);
> +
> +	return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	complete_all(&adev->autodump.dumping);
> +	return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +	if (adev->in_gpu_reset)
> +		return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +	.owner = THIS_MODULE,
> +	.open = amdgpu_debugfs_autodump_open,
> +	.poll = amdgpu_debugfs_autodump_poll,
> +	.release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +	init_completion(&adev->autodump.dumping);
> +	complete_all(&adev->autodump.dumping);
> +	init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +	debugfs_create_file("amdgpu_autodump", 0600,
> +		adev->ddev->primary->debugfs_root,
> +		adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   
>   	amdgpu_ras_debugfs_create_all(adev);
>   
> +	amdgpu_debugfs_autodump_init(adev);
> +
>   	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>   					ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>   	unsigned		num_files;
>   };
>   
> +struct amdgpu_autodump {
> +	struct completion		dumping;
> +	struct wait_queue_head		gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> +	amdgpu_debugfs_wait_dump(adev);
> +
>   	/* block all schedulers and reset given job's ring */
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-15  2:40         ` Zhao, Jiange
@ 2020-05-15  6:51           ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-05-15  6:51 UTC (permalink / raw)
  To: Zhao, Jiange, Li, Dennis, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Kuehling, Felix,
	Liu, Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 13160 bytes --]

The first application to open the autodump node gets the right to use it.

All others only get -EBUSY until the first application is done with the 
hardware.

Christian.

Am 15.05.20 um 04:40 schrieb Zhao, Jiange:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Hi Dennis,
>
> This node/feature is for UMR extension. It is designed for a single user.
>
> Jiange
> ------------------------------------------------------------------------
> *From:* Li, Dennis <Dennis.Li@amd.com>
> *Sent:* Thursday, May 14, 2020 11:15 PM
> *To:* Koenig, Christian <Christian.Koenig@amd.com>; Zhao, Jiange 
> <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, 
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>
> *Subject:* RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu 
> reset v4
>
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Jiange,
>
> How to handle the case that multi-apps do the auto dump? This patch 
> seems not multi-process safety.
>
> Best Regards
>
> Dennis Li
>
> *From:*amd-gfx <amd-gfx-bounces@lists.freedesktop.org> *On Behalf Of 
> *Christian König
> *Sent:* Thursday, May 14, 2020 4:29 PM
> *To:* Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> *Cc:* Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, 
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Kuehling, Felix 
> <Felix.Kuehling@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking 
> <Hawking.Zhang@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu 
> reset v4
>
> Hi Jiange,
>
> it probably won't hurt, but I would just drop that. You need roughly 4 
> billion GPU resets until the UINT_MAX-1 becomes zero again.
>
> Christian.
>
> Am 14.05.20 um 09:14 schrieb Zhao, Jiange:
>
>     [AMD Official Use Only - Internal Distribution Only]
>
>     Hi Christian,
>
>     wait_for_completion_interruptible_timeout() would decrease
>     autodump.dumping.done to UINT_MAX-1.
>
>     complete_all() here would restore autodump.dumping to the state as
>     in amdgpu_debugfs_autodump_init().
>
>     I want to make sure every open() deals with the same situation.
>
>     Jiange
>
>     ------------------------------------------------------------------------
>
>     *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
>     <mailto:ckoenig.leichtzumerken@gmail.com>
>     *Sent:* Thursday, May 14, 2020 3:01 PM
>     *To:* Zhao, Jiange <Jiange.Zhao@amd.com>
>     <mailto:Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
>     <mailto:amd-gfx@lists.freedesktop.org>
>     <amd-gfx@lists.freedesktop.org> <mailto:amd-gfx@lists.freedesktop.org>
>     *Cc:* Pelloux-prayer, Pierre-eric
>     <Pierre-eric.Pelloux-prayer@amd.com>
>     <mailto:Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange
>     <Jiange.Zhao@amd.com> <mailto:Jiange.Zhao@amd.com>; Kuehling,
>     Felix <Felix.Kuehling@amd.com> <mailto:Felix.Kuehling@amd.com>;
>     Deucher, Alexander <Alexander.Deucher@amd.com>
>     <mailto:Alexander.Deucher@amd.com>; Koenig, Christian
>     <Christian.Koenig@amd.com> <mailto:Christian.Koenig@amd.com>; Liu,
>     Monk <Monk.Liu@amd.com> <mailto:Monk.Liu@amd.com>; Zhang, Hawking
>     <Hawking.Zhang@amd.com> <mailto:Hawking.Zhang@amd.com>
>     *Subject:* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for
>     gpu reset v4
>
>     Am 14.05.20 um 07:29 schrieb jianzh@amd.com <mailto:jianzh@amd.com>:
>     > From: Jiange Zhao <Jiange.Zhao@amd.com> <mailto:Jiange.Zhao@amd.com>
>     >
>     > When the GPU gets a timeout, it would notify an interested party
>     > of an opportunity to dump info before actual GPU reset.
>     >
>     > A usermode app would open 'autodump' node under debugfs system
>     > and poll() for readable/writable. When a GPU reset is due,
>     > amdgpu would notify usermode app through wait_queue_head and give
>     > it 10 minutes to dump info.
>     >
>     > After usermode app has done its work, this 'autodump' node is
>     closed.
>     > On node closure, amdgpu gets to know the dump is done through
>     > the completion that is triggered in release().
>     >
>     > There is no write or read callback because necessary info can be
>     > obtained through dmesg and umr. Messages back and forth between
>     > usermode app and amdgpu are unnecessary.
>     >
>     > v2: (1) changed 'registered' to 'app_listening'
>     >      (2) add a mutex in open() to prevent race condition
>     >
>     > v3 (chk): grab the reset lock to avoid race in autodump_open,
>     >            rename debugfs file to amdgpu_autodump,
>     >            provide autodump_read as well,
>     >            style and code cleanups
>     >
>     > v4: add 'bool app_listening' to differentiate situations, so that
>     >      the node can be reopened; also, there is no need to wait for
>     >      completion when no app is waiting for a dump.
>     >
>     > v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>     >      add 'app_state_mutex' for race conditions:
>     >        (1)Only 1 user can open this file node
>     >        (2)wait_dump() can only take effect after poll() executed.
>     >        (3)eliminated the race condition between release() and
>     >           wait_dump()
>     >
>     > v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>     >      removed state checking in amdgpu_debugfs_wait_dump
>     >      Improve on top of version 3 so that the node can be reopened.
>     >
>     > v7: move reinit_completion into open() so that only one user
>     >      can open it.
>     >
>     > Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
>     <mailto:Jiange.Zhao@amd.com>
>     > ---
>     >   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
>     >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79
>     ++++++++++++++++++++-
>     >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>     >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
>     >   4 files changed, 88 insertions(+), 1 deletion(-)
>     >
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > index 2a806cb55b78..9e8eeddfe7ce 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > @@ -992,6 +992,8 @@ struct amdgpu_device {
>     >        char product_number[16];
>     >        char product_name[32];
>     >        char serial[16];
>     > +
>     > +     struct amdgpu_autodump autodump;
>     >   };
>     >
>     >   static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>     ttm_bo_device *bdev)
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>     > index 1a4894fa3693..efee3f1adecf 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>     > @@ -27,7 +27,7 @@
>     >   #include <linux/pci.h>
>     >   #include <linux/uaccess.h>
>     >   #include <linux/pm_runtime.h>
>     > -
>     > +#include <linux/poll.h>
>     >   #include <drm/drm_debugfs.h>
>     >
>     >   #include "amdgpu.h"
>     > @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct
>     amdgpu_device *adev,
>     >        return 0;
>     >   }
>     >
>     > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
>     > +{
>     > +#if defined(CONFIG_DEBUG_FS)
>     > +     unsigned long timeout = 600 * HZ;
>     > +     int ret;
>     > +
>     > + wake_up_interruptible(&adev->autodump.gpu_hang);
>     > +
>     > +     ret =
>     wait_for_completion_interruptible_timeout(&adev->autodump.dumping,
>     timeout);
>     > + complete_all(&adev->autodump.dumping);
>
>     Sorry that I'm mentioning this only now. But what is this
>     complete_all()
>     here good for?
>
>     I mean we already waited for completion, didn't we?
>
>     Christian.
>
>     > +     if (ret == 0) {
>     > +             pr_err("autodump: timeout, move on to gpu
>     recovery\n");
>     > +             return -ETIMEDOUT;
>     > +     }
>     > +#endif
>     > +     return 0;
>     > +}
>     > +
>     >   #if defined(CONFIG_DEBUG_FS)
>     >
>     > +static int amdgpu_debugfs_autodump_open(struct inode *inode,
>     struct file *file)
>     > +{
>     > +     struct amdgpu_device *adev = inode->i_private;
>     > +     int ret;
>     > +
>     > +     file->private_data = adev;
>     > +
>     > +     mutex_lock(&adev->lock_reset);
>     > +     if (adev->autodump.dumping.done) {
>     > + reinit_completion(&adev->autodump.dumping);
>     > +             ret = 0;
>     > +     } else {
>     > +             ret = -EBUSY;
>     > +     }
>     > +     mutex_unlock(&adev->lock_reset);
>     > +
>     > +     return ret;
>     > +}
>     > +
>     > +static int amdgpu_debugfs_autodump_release(struct inode *inode,
>     struct file *file)
>     > +{
>     > +     struct amdgpu_device *adev = file->private_data;
>     > +
>     > + complete_all(&adev->autodump.dumping);
>     > +     return 0;
>     > +}
>     > +
>     > +static unsigned int amdgpu_debugfs_autodump_poll(struct file
>     *file, struct poll_table_struct *poll_table)
>     > +{
>     > +     struct amdgpu_device *adev = file->private_data;
>     > +
>     > +     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>     > +
>     > +     if (adev->in_gpu_reset)
>     > +             return POLLIN | POLLRDNORM | POLLWRNORM;
>     > +
>     > +     return 0;
>     > +}
>     > +
>     > +static const struct file_operations autodump_debug_fops = {
>     > +     .owner = THIS_MODULE,
>     > +     .open = amdgpu_debugfs_autodump_open,
>     > +     .poll = amdgpu_debugfs_autodump_poll,
>     > +     .release = amdgpu_debugfs_autodump_release,
>     > +};
>     > +
>     > +static void amdgpu_debugfs_autodump_init(struct amdgpu_device
>     *adev)
>     > +{
>     > + init_completion(&adev->autodump.dumping);
>     > + complete_all(&adev->autodump.dumping);
>     > + init_waitqueue_head(&adev->autodump.gpu_hang);
>     > +
>     > +     debugfs_create_file("amdgpu_autodump", 0600,
>     > + adev->ddev->primary->debugfs_root,
>     > +             adev, &autodump_debug_fops);
>     > +}
>     > +
>     >   /**
>     >    * amdgpu_debugfs_process_reg_op - Handle MMIO register
>     reads/writes
>     >    *
>     > @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct
>     amdgpu_device *adev)
>     >
>     >        amdgpu_ras_debugfs_create_all(adev);
>     >
>     > +     amdgpu_debugfs_autodump_init(adev);
>     > +
>     >        return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>     > ARRAY_SIZE(amdgpu_debugfs_list));
>     >   }
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>     > index de12d1101526..2803884d338d 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>     > @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>     >        unsigned                num_files;
>     >   };
>     >
>     > +struct amdgpu_autodump {
>     > +     struct completion dumping;
>     > +     struct wait_queue_head gpu_hang;
>     > +};
>     > +
>     >   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>     >   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>     >   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
>     > @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct
>     amdgpu_device *adev,
>     >   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>     >   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>     >   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
>     > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > index cc41e8f5ad14..545beebcf43e 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > @@ -3927,6 +3927,8 @@ static int
>     amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>     >        int i, r = 0;
>     >        bool need_full_reset  = *need_full_reset_arg;
>     >
>     > +     amdgpu_debugfs_wait_dump(adev);
>     > +
>     >        /* block all schedulers and reset given job's ring */
>     >        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>     >                struct amdgpu_ring *ring = adev->rings[i];
>


[-- Attachment #1.2: Type: text/html, Size: 32889 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14 15:15       ` Li, Dennis
@ 2020-05-15  2:40         ` Zhao, Jiange
  2020-05-15  6:51           ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-05-15  2:40 UTC (permalink / raw)
  To: Li, Dennis, Koenig, Christian, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Kuehling, Felix,
	Liu, Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 11216 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi Dennis,

This node/feature is for UMR extension. It is designed for a single user.

Jiange
________________________________
From: Li, Dennis <Dennis.Li@amd.com>
Sent: Thursday, May 14, 2020 11:15 PM
To: Koenig, Christian <Christian.Koenig@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4


[AMD Official Use Only - Internal Distribution Only]



Hi, Jiange,

      How do you handle the case where multiple apps do the auto dump? This patch does not seem to be multi-process safe.



Best Regards

Dennis Li

From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Thursday, May 14, 2020 4:29 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4



Hi Jiange,

it probably won't hurt, but I would just drop that. You need roughly 4 billion GPU resets until the UINT_MAX-1 becomes zero again.

Christian.

Am 14.05.20 um 09:14 schrieb Zhao, Jiange:

[AMD Official Use Only - Internal Distribution Only]



Hi Christian,



wait_for_completion_interruptible_timeout() would decrease autodump.dumping.done to UINT_MAX-1.



complete_all() here would restore autodump.dumping to the state as in amdgpu_debugfs_autodump_init().



I want to make sure every open() deals with the same situation.



Jiange

________________________________

From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Thursday, May 14, 2020 3:01 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>
Cc: Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com><mailto:Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4



Am 14.05.20 um 07:29 schrieb jianzh@amd.com<mailto:jianzh@amd.com>:
> From: Jiange Zhao <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
>        (1)Only 1 user can open this file node
>        (2)wait_dump() can only take effect after poll() executed.
>        (3)eliminated the race condition between release() and
>           wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>      can open it.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>        char                            product_number[16];
>        char                            product_name[32];
>        char                            serial[16];
> +
> +     struct amdgpu_autodump          autodump;
>   };
>
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..efee3f1adecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>
>   #include "amdgpu.h"
> @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>        return 0;
>   }
>
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +     unsigned long timeout = 600 * HZ;
> +     int ret;
> +
> +     wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +     ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +     complete_all(&adev->autodump.dumping);

Sorry that I'm mentioning this only now. But what is this complete_all()
here good for?

I mean we already waited for completion, didn't we?

Christian.

> +     if (ret == 0) {
> +             pr_err("autodump: timeout, move on to gpu recovery\n");
> +             return -ETIMEDOUT;
> +     }
> +#endif
> +     return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = inode->i_private;
> +     int ret;
> +
> +     file->private_data = adev;
> +
> +     mutex_lock(&adev->lock_reset);
> +     if (adev->autodump.dumping.done) {
> +             reinit_completion(&adev->autodump.dumping);
> +             ret = 0;
> +     } else {
> +             ret = -EBUSY;
> +     }
> +     mutex_unlock(&adev->lock_reset);
> +
> +     return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     complete_all(&adev->autodump.dumping);
> +     return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +     if (adev->in_gpu_reset)
> +             return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +     return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +     .owner = THIS_MODULE,
> +     .open = amdgpu_debugfs_autodump_open,
> +     .poll = amdgpu_debugfs_autodump_poll,
> +     .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +     init_completion(&adev->autodump.dumping);
> +     complete_all(&adev->autodump.dumping);
> +     init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +     debugfs_create_file("amdgpu_autodump", 0600,
> +             adev->ddev->primary->debugfs_root,
> +             adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>
>        amdgpu_ras_debugfs_create_all(adev);
>
> +     amdgpu_debugfs_autodump_init(adev);
> +
>        return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>                                        ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>        unsigned                num_files;
>   };
>
> +struct amdgpu_autodump {
> +     struct completion               dumping;
> +     struct wait_queue_head          gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>        int i, r = 0;
>        bool need_full_reset  = *need_full_reset_arg;
>
> +     amdgpu_debugfs_wait_dump(adev);
> +
>        /* block all schedulers and reset given job's ring */
>        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                struct amdgpu_ring *ring = adev->rings[i];



[-- Attachment #1.2: Type: text/html, Size: 21524 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14  8:28     ` Christian König
@ 2020-05-14 15:15       ` Li, Dennis
  2020-05-15  2:40         ` Zhao, Jiange
  0 siblings, 1 reply; 23+ messages in thread
From: Li, Dennis @ 2020-05-14 15:15 UTC (permalink / raw)
  To: Koenig, Christian, Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Kuehling, Felix,
	Liu, Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 10442 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi, Jiange,
      How do you handle the case where multiple apps do the auto dump? This patch does not seem to be multi-process safe.

Best Regards
Dennis Li
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Thursday, May 14, 2020 4:29 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Hi Jiange,

it probably won't hurt, but I would just drop that. You need roughly 4 billion GPU resets until the UINT_MAX-1 becomes zero again.

Christian.

Am 14.05.20 um 09:14 schrieb Zhao, Jiange:

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

wait_for_completion_interruptible_timeout() would decrease autodump.dumping.done to UINT_MAX-1.

complete_all() here would restore autodump.dumping to the state as in amdgpu_debugfs_autodump_init().

I want to make sure every open() deals with the same situation.

Jiange
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com><mailto:ckoenig.leichtzumerken@gmail.com>
Sent: Thursday, May 14, 2020 3:01 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>
Cc: Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com><mailto:Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com><mailto:Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com><mailto:Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com><mailto:Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com><mailto:Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 14.05.20 um 07:29 schrieb jianzh@amd.com<mailto:jianzh@amd.com>:
> From: Jiange Zhao <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
>        (1)Only 1 user can open this file node
>        (2)wait_dump() can only take effect after poll() executed.
>        (3)eliminated the race condition between release() and
>           wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>      can open it.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com><mailto:Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>        char                            product_number[16];
>        char                            product_name[32];
>        char                            serial[16];
> +
> +     struct amdgpu_autodump          autodump;
>   };
>
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..efee3f1adecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>
>   #include "amdgpu.h"
> @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>        return 0;
>   }
>
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +     unsigned long timeout = 600 * HZ;
> +     int ret;
> +
> +     wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +     ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +     complete_all(&adev->autodump.dumping);

Sorry that I'm mentioning this only now. But what is this complete_all()
here good for?

I mean we already waited for completion, didn't we?

Christian.

> +     if (ret == 0) {
> +             pr_err("autodump: timeout, move on to gpu recovery\n");
> +             return -ETIMEDOUT;
> +     }
> +#endif
> +     return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = inode->i_private;
> +     int ret;
> +
> +     file->private_data = adev;
> +
> +     mutex_lock(&adev->lock_reset);
> +     if (adev->autodump.dumping.done) {
> +             reinit_completion(&adev->autodump.dumping);
> +             ret = 0;
> +     } else {
> +             ret = -EBUSY;
> +     }
> +     mutex_unlock(&adev->lock_reset);
> +
> +     return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     complete_all(&adev->autodump.dumping);
> +     return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +     if (adev->in_gpu_reset)
> +             return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +     return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +     .owner = THIS_MODULE,
> +     .open = amdgpu_debugfs_autodump_open,
> +     .poll = amdgpu_debugfs_autodump_poll,
> +     .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +     init_completion(&adev->autodump.dumping);
> +     complete_all(&adev->autodump.dumping);
> +     init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +     debugfs_create_file("amdgpu_autodump", 0600,
> +             adev->ddev->primary->debugfs_root,
> +             adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>
>        amdgpu_ras_debugfs_create_all(adev);
>
> +     amdgpu_debugfs_autodump_init(adev);
> +
>        return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>                                        ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>        unsigned                num_files;
>   };
>
> +struct amdgpu_autodump {
> +     struct completion               dumping;
> +     struct wait_queue_head          gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>        int i, r = 0;
>        bool need_full_reset  = *need_full_reset_arg;
>
> +     amdgpu_debugfs_wait_dump(adev);
> +
>        /* block all schedulers and reset given job's ring */
>        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                struct amdgpu_ring *ring = adev->rings[i];


[-- Attachment #1.2: Type: text/html, Size: 21018 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14  7:14   ` Zhao, Jiange
@ 2020-05-14  8:28     ` Christian König
  2020-05-14 15:15       ` Li, Dennis
  0 siblings, 1 reply; 23+ messages in thread
From: Christian König @ 2020-05-14  8:28 UTC (permalink / raw)
  To: Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Kuehling, Felix,
	Liu, Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 10035 bytes --]

Hi Jiange,

it probably won't hurt, but I would just drop that. You need roughly 4 
billion GPU resets until the UINT_MAX-1 becomes zero again.

Christian.

Am 14.05.20 um 09:14 schrieb Zhao, Jiange:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Hi Christian,
>
> wait_for_completion_interruptible_timeout() would decrease 
> autodump.dumping.done to UINT_MAX-1.
>
> complete_all() here would restore autodump.dumping to the state as in 
> amdgpu_debugfs_autodump_init().
>
> I want to make sure every open() deals with the same situation.
>
> Jiange
> ------------------------------------------------------------------------
> *From:* Christian König <ckoenig.leichtzumerken@gmail.com>
> *Sent:* Thursday, May 14, 2020 3:01 PM
> *To:* Zhao, Jiange <Jiange.Zhao@amd.com>; 
> amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> *Cc:* Pelloux-prayer, Pierre-eric 
> <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange 
> <Jiange.Zhao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; 
> Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, 
> Hawking <Hawking.Zhang@amd.com>
> *Subject:* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu 
> reset v4
> Am 14.05.20 um 07:29 schrieb jianzh@amd.com:
> > From: Jiange Zhao <Jiange.Zhao@amd.com>
> >
> > When GPU got timeout, it would notify an interested part
> > of an opportunity to dump info before actual GPU reset.
> >
> > A usermode app would open 'autodump' node under debugfs system
> > and poll() for readable/writable. When a GPU reset is due,
> > amdgpu would notify usermode app through wait_queue_head and give
> > it 10 minutes to dump info.
> >
> > After usermode app has done its work, this 'autodump' node is closed.
> > On node closure, amdgpu gets to know the dump is done through
> > the completion that is triggered in release().
> >
> > There is no write or read callback because necessary info can be
> > obtained through dmesg and umr. Messages back and forth between
> > usermode app and amdgpu are unnecessary.
> >
> > v2: (1) changed 'registered' to 'app_listening'
> >      (2) add a mutex in open() to prevent race condition
> >
> > v3 (chk): grab the reset lock to avoid race in autodump_open,
> >            rename debugfs file to amdgpu_autodump,
> >            provide autodump_read as well,
> >            style and code cleanups
> >
> > v4: add 'bool app_listening' to differentiate situations, so that
> >      the node can be reopened; also, there is no need to wait for
> >      completion when no app is waiting for a dump.
> >
> > v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
> >      add 'app_state_mutex' for race conditions:
> >        (1)Only 1 user can open this file node
> >        (2)wait_dump() can only take effect after poll() executed.
> >        (3)eliminated the race condition between release() and
> >           wait_dump()
> >
> > v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
> >      removed state checking in amdgpu_debugfs_wait_dump
> >      Improve on top of version 3 so that the node can be reopened.
> >
> > v7: move reinit_completion into open() so that only one user
> >      can open it.
> >
> > Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
> >   4 files changed, 88 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 2a806cb55b78..9e8eeddfe7ce 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -992,6 +992,8 @@ struct amdgpu_device {
> >        char product_number[16];
> >        char product_name[32];
> >        char                            serial[16];
> > +
> > +     struct amdgpu_autodump          autodump;
> >   };
> >
> >   static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index 1a4894fa3693..efee3f1adecf 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -27,7 +27,7 @@
> >   #include <linux/pci.h>
> >   #include <linux/uaccess.h>
> >   #include <linux/pm_runtime.h>
> > -
> > +#include <linux/poll.h>
> >   #include <drm/drm_debugfs.h>
> >
> >   #include "amdgpu.h"
> > @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device 
> *adev,
> >        return 0;
> >   }
> >
> > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> > +{
> > +#if defined(CONFIG_DEBUG_FS)
> > +     unsigned long timeout = 600 * HZ;
> > +     int ret;
> > +
> > + wake_up_interruptible(&adev->autodump.gpu_hang);
> > +
> > +     ret = 
> wait_for_completion_interruptible_timeout(&adev->autodump.dumping, 
> timeout);
> > +     complete_all(&adev->autodump.dumping);
>
> Sorry that I'm mentioning this only now. But what is this complete_all()
> here good for?
>
> I mean we already waited for completion, didn't we?
>
> Christian.
>
> > +     if (ret == 0) {
> > +             pr_err("autodump: timeout, move on to gpu recovery\n");
> > +             return -ETIMEDOUT;
> > +     }
> > +#endif
> > +     return 0;
> > +}
> > +
> >   #if defined(CONFIG_DEBUG_FS)
> >
> > +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
> file *file)
> > +{
> > +     struct amdgpu_device *adev = inode->i_private;
> > +     int ret;
> > +
> > +     file->private_data = adev;
> > +
> > +     mutex_lock(&adev->lock_reset);
> > +     if (adev->autodump.dumping.done) {
> > + reinit_completion(&adev->autodump.dumping);
> > +             ret = 0;
> > +     } else {
> > +             ret = -EBUSY;
> > +     }
> > +     mutex_unlock(&adev->lock_reset);
> > +
> > +     return ret;
> > +}
> > +
> > +static int amdgpu_debugfs_autodump_release(struct inode *inode, 
> struct file *file)
> > +{
> > +     struct amdgpu_device *adev = file->private_data;
> > +
> > +     complete_all(&adev->autodump.dumping);
> > +     return 0;
> > +}
> > +
> > +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, 
> struct poll_table_struct *poll_table)
> > +{
> > +     struct amdgpu_device *adev = file->private_data;
> > +
> > +     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> > +
> > +     if (adev->in_gpu_reset)
> > +             return POLLIN | POLLRDNORM | POLLWRNORM;
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct file_operations autodump_debug_fops = {
> > +     .owner = THIS_MODULE,
> > +     .open = amdgpu_debugfs_autodump_open,
> > +     .poll = amdgpu_debugfs_autodump_poll,
> > +     .release = amdgpu_debugfs_autodump_release,
> > +};
> > +
> > +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> > +{
> > + init_completion(&adev->autodump.dumping);
> > +     complete_all(&adev->autodump.dumping);
> > + init_waitqueue_head(&adev->autodump.gpu_hang);
> > +
> > +     debugfs_create_file("amdgpu_autodump", 0600,
> > + adev->ddev->primary->debugfs_root,
> > +             adev, &autodump_debug_fops);
> > +}
> > +
> >   /**
> >    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> >    *
> > @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
> *adev)
> >
> >        amdgpu_ras_debugfs_create_all(adev);
> >
> > +     amdgpu_debugfs_autodump_init(adev);
> > +
> >        return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
> > ARRAY_SIZE(amdgpu_debugfs_list));
> >   }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > index de12d1101526..2803884d338d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
> >        unsigned                num_files;
> >   };
> >
> > +struct amdgpu_autodump {
> > +     struct completion               dumping;
> > +     struct wait_queue_head          gpu_hang;
> > +};
> > +
> >   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
> >   int amdgpu_debugfs_init(struct amdgpu_device *adev);
> >   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> > @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device 
> *adev,
> >   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
> >   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
> >   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index cc41e8f5ad14..545beebcf43e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
> >        int i, r = 0;
> >        bool need_full_reset  = *need_full_reset_arg;
> >
> > +     amdgpu_debugfs_wait_dump(adev);
> > +
> >        /* block all schedulers and reset given job's ring */
> >        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> >                struct amdgpu_ring *ring = adev->rings[i];
>


[-- Attachment #1.2: Type: text/html, Size: 23496 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14  7:01 ` Christian König
@ 2020-05-14  7:14   ` Zhao, Jiange
  2020-05-14  8:28     ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-05-14  7:14 UTC (permalink / raw)
  To: amd-gfx, Koenig,  Christian
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Kuehling, Felix,
	Liu, Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 9081 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

wait_for_completion_interruptible_timeout() would decrease autodump.dumping.done to UINT_MAX-1.

complete_all() here would restore autodump.dumping to the state as in amdgpu_debugfs_autodump_init().

I want to make sure every open() deals with the same situation.

Jiange
________________________________
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Thursday, May 14, 2020 3:01 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 14.05.20 um 07:29 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
>        (1)Only 1 user can open this file node
>        (2)wait_dump() can only take effect after poll() executed.
>        (3)eliminated the race condition between release() and
>           wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>      can open it.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>        char                            product_number[16];
>        char                            product_name[32];
>        char                            serial[16];
> +
> +     struct amdgpu_autodump          autodump;
>   };
>
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..efee3f1adecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>
>   #include "amdgpu.h"
> @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>        return 0;
>   }
>
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +     unsigned long timeout = 600 * HZ;
> +     int ret;
> +
> +     wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +     ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +     complete_all(&adev->autodump.dumping);

Sorry that I'm mentioning this only now. But what is this complete_all()
here good for?

I mean we already waited for completion, didn't we?

Christian.

> +     if (ret == 0) {
> +             pr_err("autodump: timeout, move on to gpu recovery\n");
> +             return -ETIMEDOUT;
> +     }
> +#endif
> +     return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = inode->i_private;
> +     int ret;
> +
> +     file->private_data = adev;
> +
> +     mutex_lock(&adev->lock_reset);
> +     if (adev->autodump.dumping.done) {
> +             reinit_completion(&adev->autodump.dumping);
> +             ret = 0;
> +     } else {
> +             ret = -EBUSY;
> +     }
> +     mutex_unlock(&adev->lock_reset);
> +
> +     return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     complete_all(&adev->autodump.dumping);
> +     return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +     struct amdgpu_device *adev = file->private_data;
> +
> +     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +     if (adev->in_gpu_reset)
> +             return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +     return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +     .owner = THIS_MODULE,
> +     .open = amdgpu_debugfs_autodump_open,
> +     .poll = amdgpu_debugfs_autodump_poll,
> +     .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +     init_completion(&adev->autodump.dumping);
> +     complete_all(&adev->autodump.dumping);
> +     init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +     debugfs_create_file("amdgpu_autodump", 0600,
> +             adev->ddev->primary->debugfs_root,
> +             adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>
>        amdgpu_ras_debugfs_create_all(adev);
>
> +     amdgpu_debugfs_autodump_init(adev);
> +
>        return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>                                        ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>        unsigned                num_files;
>   };
>
> +struct amdgpu_autodump {
> +     struct completion               dumping;
> +     struct wait_queue_head          gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>        int i, r = 0;
>        bool need_full_reset  = *need_full_reset_arg;
>
> +     amdgpu_debugfs_wait_dump(adev);
> +
>        /* block all schedulers and reset given job's ring */
>        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                struct amdgpu_ring *ring = adev->rings[i];


[-- Attachment #1.2: Type: text/html, Size: 16988 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-14  5:29 jianzh
@ 2020-05-14  7:01 ` Christian König
  2020-05-14  7:14   ` Zhao, Jiange
  0 siblings, 1 reply; 23+ messages in thread
From: Christian König @ 2020-05-14  7:01 UTC (permalink / raw)
  To: jianzh, amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

Am 14.05.20 um 07:29 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
> 	(1)Only 1 user can open this file node
> 	(2)wait_dump() can only take effect after poll() executed.
> 	(3)eliminated the race condition between release() and
> 	   wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> v7: move reinit_completion into open() so that only one user
>      can open it.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>   	char				product_number[16];
>   	char				product_name[32];
>   	char				serial[16];
> +
> +	struct amdgpu_autodump		autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..efee3f1adecf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>   
>   #include "amdgpu.h"
> @@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +	unsigned long timeout = 600 * HZ;
> +	int ret;
> +
> +	wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +	complete_all(&adev->autodump.dumping);

Sorry that I'm mentioning this only now. But what is this complete_all() 
here good for?

I mean we already waited for completion, didn't we?

Christian.

> +	if (ret == 0) {
> +		pr_err("autodump: timeout, move on to gpu recovery\n");
> +		return -ETIMEDOUT;
> +	}
> +#endif
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = inode->i_private;
> +	int ret;
> +
> +	file->private_data = adev;
> +
> +	mutex_lock(&adev->lock_reset);
> +	if (adev->autodump.dumping.done) {
> +		reinit_completion(&adev->autodump.dumping);
> +		ret = 0;
> +	} else {
> +		ret = -EBUSY;
> +	}
> +	mutex_unlock(&adev->lock_reset);
> +
> +	return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	complete_all(&adev->autodump.dumping);
> +	return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +	if (adev->in_gpu_reset)
> +		return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +	.owner = THIS_MODULE,
> +	.open = amdgpu_debugfs_autodump_open,
> +	.poll = amdgpu_debugfs_autodump_poll,
> +	.release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +	init_completion(&adev->autodump.dumping);
> +	complete_all(&adev->autodump.dumping);
> +	init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +	debugfs_create_file("amdgpu_autodump", 0600,
> +		adev->ddev->primary->debugfs_root,
> +		adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   
>   	amdgpu_ras_debugfs_create_all(adev);
>   
> +	amdgpu_debugfs_autodump_init(adev);
> +
>   	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>   					ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>   	unsigned		num_files;
>   };
>   
> +struct amdgpu_autodump {
> +	struct completion		dumping;
> +	struct wait_queue_head		gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> +	amdgpu_debugfs_wait_dump(adev);
> +
>   	/* block all schedulers and reset given job's ring */
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
@ 2020-05-14  5:29 jianzh
  2020-05-14  7:01 ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: jianzh @ 2020-05-14  5:29 UTC (permalink / raw)
  To: amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

From: Jiange Zhao <Jiange.Zhao@amd.com>

When GPU got timeout, it would notify an interested part
of an opportunity to dump info before actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
	(1)Only 1 user can open this file node
	(2)wait_dump() can only take effect after poll() executed.
	(3)eliminated the race condition between release() and
	   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
    removed state checking in amdgpu_debugfs_wait_dump
    Improve on top of version 3 so that the node can be reopened.

v7: move reinit_completion into open() so that only one user
    can open it.

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
 	char				product_number[16];
 	char				product_name[32];
 	char				serial[16];
+
+	struct amdgpu_autodump		autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..efee3f1adecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>
 
 #include "amdgpu.h"
@@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+	unsigned long timeout = 600 * HZ;
+	int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	complete_all(&adev->autodump.dumping);
+	if (ret == 0) {
+		pr_err("autodump: timeout, move on to gpu recovery\n");
+		return -ETIMEDOUT;
+	}
+#endif
+	return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = inode->i_private;
+	int ret;
+
+	file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done) {
+		reinit_completion(&adev->autodump.dumping);
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+	return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+	return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+	if (adev->in_gpu_reset)
+		return POLLIN | POLLRDNORM | POLLWRNORM;
+
+	return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = amdgpu_debugfs_autodump_open,
+	.poll = amdgpu_debugfs_autodump_poll,
+	.release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	complete_all(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+		adev->ddev->primary->debugfs_root,
+		adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1509,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_debugfs_create_all(adev);
 
+	amdgpu_debugfs_autodump_init(adev);
+
 	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
 					ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..2803884d338d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
 	unsigned		num_files;
 };
 
+struct amdgpu_autodump {
+	struct completion		dumping;
+	struct wait_queue_head		gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc41e8f5ad14..545beebcf43e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-13  9:37   ` Zhao, Jiange
@ 2020-05-13 10:36     ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-05-13 10:36 UTC (permalink / raw)
  To: Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Liu, Monk

> Since usermode app might open a file , do nothing and close it.
That case is unproblematic since closing the debugfs file sets the state 
of the struct completion to completed again no matter if we waited or not.

But when you don't reset in the open() callback we open a small windows 
between open and poll where userspace could open the debugfs file twice.

Regards,
Christian.

Am 13.05.20 um 11:37 schrieb Zhao, Jiange:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi Christian,
>
> Since amdgpu_debugfs_wait_dump() would need 'autodump.dumping.done==0' to actually stop and wait for user mode app to dump.
>
> Since usermode app might open a file, do nothing and close it. I believe a poll() function would be a better indicator that the usermode app actually wants to do a dump.
>
> Also, a reset might happen between open() and poll(). The worst case would be wait_dump() would wait until timeout and usermode poll would always fail.
>
> Jiange
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, May 13, 2020 4:20 PM
> To: Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
>
> Am 09.05.20 um 11:45 schrieb jianzh@amd.com:
>> From: Jiange Zhao <Jiange.Zhao@amd.com>
>>
>> When GPU got timeout, it would notify an interested part of an
>> opportunity to dump info before actual GPU reset.
>>
>> A usermode app would open 'autodump' node under debugfs system and
>> poll() for readable/writable. When a GPU reset is due, amdgpu would
>> notify usermode app through wait_queue_head and give it 10 minutes to
>> dump info.
>>
>> After usermode app has done its work, this 'autodump' node is closed.
>> On node closure, amdgpu gets to know the dump is done through the
>> completion that is triggered in release().
>>
>> There is no write or read callback because necessary info can be
>> obtained through dmesg and umr. Messages back and forth between
>> usermode app and amdgpu are unnecessary.
>>
>> v2: (1) changed 'registered' to 'app_listening'
>>       (2) add a mutex in open() to prevent race condition
>>
>> v3 (chk): grab the reset lock to avoid race in autodump_open,
>>             rename debugfs file to amdgpu_autodump,
>>             provide autodump_read as well,
>>             style and code cleanups
>>
>> v4: add 'bool app_listening' to differentiate situations, so that
>>       the node can be reopened; also, there is no need to wait for
>>       completion when no app is waiting for a dump.
>>
>> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>>       add 'app_state_mutex' for race conditions:
>> 	(1)Only 1 user can open this file node
>> 	(2)wait_dump() can only take effect after poll() executed.
>> 	(3)eliminated the race condition between release() and
>> 	   wait_dump()
>>
>> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>>       removed state checking in amdgpu_debugfs_wait_dump
>>       Improve on top of version 3 so that the node can be reopened.
>>
>> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>>    4 files changed, 87 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 2a806cb55b78..9e8eeddfe7ce 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -992,6 +992,8 @@ struct amdgpu_device {
>>    	char				product_number[16];
>>    	char				product_name[32];
>>    	char				serial[16];
>> +
>> +	struct amdgpu_autodump		autodump;
>>    };
>>    
>>    static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>> ttm_bo_device *bdev) diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 1a4894fa3693..261b67ece7fb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -27,7 +27,7 @@
>>    #include <linux/pci.h>
>>    #include <linux/uaccess.h>
>>    #include <linux/pm_runtime.h>
>> -
>> +#include <linux/poll.h>
>>    #include <drm/drm_debugfs.h>
>>    
>>    #include "amdgpu.h"
>> @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    	return 0;
>>    }
>>    
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if
>> +defined(CONFIG_DEBUG_FS)
>> +	unsigned long timeout = 600 * HZ;
>> +	int ret;
>> +
>> +	wake_up_interruptible(&adev->autodump.gpu_hang);
>> +
>> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
>> +	complete_all(&adev->autodump.dumping);
>> +	if (ret == 0) {
>> +		pr_err("autodump: timeout, move on to gpu recovery\n");
>> +		return -ETIMEDOUT;
>> +	}
>> +#endif
>> +	return 0;
>> +}
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct
>> +file *file) {
>> +	struct amdgpu_device *adev = inode->i_private;
>> +	int ret;
>> +
>> +	file->private_data = adev;
>> +
>> +	mutex_lock(&adev->lock_reset);
>> +	if (adev->autodump.dumping.done)
>> +		ret = 0;
>> +	else
>> +		ret = -EBUSY;
>> +	mutex_unlock(&adev->lock_reset);
>> +
>> +	return ret;
>> +}
>> +
>> +static int amdgpu_debugfs_autodump_release(struct inode *inode,
>> +struct file *file) {
>> +	struct amdgpu_device *adev = file->private_data;
>> +
>> +	complete_all(&adev->autodump.dumping);
>> +	return 0;
>> +}
>> +
>> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file,
>> +struct poll_table_struct *poll_table) {
>> +	struct amdgpu_device *adev = file->private_data;
>> +
>> +	reinit_completion(&adev->autodump.dumping);
> Why do you have the reinit_completion here and not in open callback?
>
> Apart from that looks good to me.
>
> Regards,
> Christian.
>
>> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>> +
>> +	if (adev->in_gpu_reset)
>> +		return POLLIN | POLLRDNORM | POLLWRNORM;
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct file_operations autodump_debug_fops = {
>> +	.owner = THIS_MODULE,
>> +	.open = amdgpu_debugfs_autodump_open,
>> +	.poll = amdgpu_debugfs_autodump_poll,
>> +	.release = amdgpu_debugfs_autodump_release, };
>> +
>> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
>> +{
>> +	init_completion(&adev->autodump.dumping);
>> +	complete_all(&adev->autodump.dumping);
>> +	init_waitqueue_head(&adev->autodump.gpu_hang);
>> +
>> +	debugfs_create_file("amdgpu_autodump", 0600,
>> +		adev->ddev->primary->debugfs_root,
>> +		adev, &autodump_debug_fops);
>> +}
>> +
>>    /**
>>     * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>     *
>> @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device
>> *adev)
>>    
>>    	amdgpu_ras_debugfs_create_all(adev);
>>    
>> +	amdgpu_debugfs_autodump_init(adev);
>> +
>>    	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>>    					ARRAY_SIZE(amdgpu_debugfs_list));
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> index de12d1101526..2803884d338d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>>    	unsigned		num_files;
>>    };
>>    
>> +struct amdgpu_autodump {
>> +	struct completion		dumping;
>> +	struct wait_queue_head		gpu_hang;
>> +};
>> +
>>    int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_init(struct amdgpu_device *adev);
>>    void amdgpu_debugfs_fini(struct amdgpu_device *adev); @@ -40,3 +45,4
>> @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index cc41e8f5ad14..545beebcf43e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>    	int i, r = 0;
>>    	bool need_full_reset  = *need_full_reset_arg;
>>    
>> +	amdgpu_debugfs_wait_dump(adev);
>> +
>>    	/* block all schedulers and reset given job's ring */
>>    	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    		struct amdgpu_ring *ring = adev->rings[i];

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-13  8:19 ` Christian König
@ 2020-05-13  9:37   ` Zhao, Jiange
  2020-05-13 10:36     ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-05-13  9:37 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Liu, Monk

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

Since amdgpu_debugfs_wait_dump() would need 'autodump.dumping.done==0' to actually stop and wait for the user mode app to dump.

Since a usermode app might open the file, do nothing and close it, I believe a poll() function would be a better indicator that the usermode app actually wants to do a dump.

Also, a reset might happen between open() and poll(). The worst case would be wait_dump() would wait until timeout and usermode poll would always fail.

Jiange

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, May 13, 2020 4:20 PM
To: Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 09.05.20 um 11:45 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part of an 
> opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system and 
> poll() for readable/writable. When a GPU reset is due, amdgpu would 
> notify usermode app through wait_queue_head and give it 10 minutes to 
> dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through the 
> completion that is triggered in release().
>
> There is no write or read callback because necessary info can be 
> obtained through dmesg and umr. Messages back and forth between 
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
> 	(1)Only 1 user can open this file node
> 	(2)wait_dump() can only take effect after poll() executed.
> 	(3)eliminated the race condition between release() and
> 	   wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>   	char				product_number[16];
>   	char				product_name[32];
>   	char				serial[16];
> +
> +	struct amdgpu_autodump		autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..261b67ece7fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>   
>   #include "amdgpu.h"
> @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if 
> +defined(CONFIG_DEBUG_FS)
> +	unsigned long timeout = 600 * HZ;
> +	int ret;
> +
> +	wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +	complete_all(&adev->autodump.dumping);
> +	if (ret == 0) {
> +		pr_err("autodump: timeout, move on to gpu recovery\n");
> +		return -ETIMEDOUT;
> +	}
> +#endif
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
> +file *file) {
> +	struct amdgpu_device *adev = inode->i_private;
> +	int ret;
> +
> +	file->private_data = adev;
> +
> +	mutex_lock(&adev->lock_reset);
> +	if (adev->autodump.dumping.done)
> +		ret = 0;
> +	else
> +		ret = -EBUSY;
> +	mutex_unlock(&adev->lock_reset);
> +
> +	return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, 
> +struct file *file) {
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	complete_all(&adev->autodump.dumping);
> +	return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, 
> +struct poll_table_struct *poll_table) {
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	reinit_completion(&adev->autodump.dumping);

Why do you have the reinit_completion here and not in open callback?

Apart from that looks good to me.

Regards,
Christian.

> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +	if (adev->in_gpu_reset)
> +		return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +	.owner = THIS_MODULE,
> +	.open = amdgpu_debugfs_autodump_open,
> +	.poll = amdgpu_debugfs_autodump_poll,
> +	.release = amdgpu_debugfs_autodump_release, };
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev) 
> +{
> +	init_completion(&adev->autodump.dumping);
> +	complete_all(&adev->autodump.dumping);
> +	init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +	debugfs_create_file("amdgpu_autodump", 0600,
> +		adev->ddev->primary->debugfs_root,
> +		adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
> *adev)
>   
>   	amdgpu_ras_debugfs_create_all(adev);
>   
> +	amdgpu_debugfs_autodump_init(adev);
> +
>   	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>   					ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>   	unsigned		num_files;
>   };
>   
> +struct amdgpu_autodump {
> +	struct completion		dumping;
> +	struct wait_queue_head		gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev); @@ -40,3 +45,4 
> @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> +	amdgpu_debugfs_wait_dump(adev);
> +
>   	/* block all schedulers and reset given job's ring */
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-13  8:16 ` Zhao, Jiange
@ 2020-05-13  8:20   ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-05-13  8:20 UTC (permalink / raw)
  To: Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Liu, Monk


[-- Attachment #1.1: Type: text/plain, Size: 9063 bytes --]

Thanks for the reminder, had too much to do yesterday and just forgot 
about it.

Christian.

Am 13.05.20 um 10:16 schrieb Zhao, Jiange:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>
> Hi @Koenig, Christian <mailto:Christian.Koenig@amd.com>,
>
> I made some changes on top of version 3 and tested it. Can you help 
> review?
>
> Jiange
> ------------------------------------------------------------------------
> *From:* Zhao, Jiange <jianzh@amd.com>
> *Sent:* Saturday, May 9, 2020 5:45 PM
> *To:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Pelloux-prayer, 
> Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhao, 
> Jiange <Jiange.Zhao@amd.com>
> *Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>     (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>           rename debugfs file to amdgpu_autodump,
>           provide autodump_read as well,
>           style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>     the node can be reopened; also, there is no need to wait for
>     completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>     add 'app_state_mutex' for race conditions:
>         (1)Only 1 user can open this file node
>         (2)wait_dump() can only take effect after poll() executed.
>         (3)eliminated the race condition between release() and
>            wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>     removed state checking in amdgpu_debugfs_wait_dump
>     Improve on top of version 3 so that the node can be reopened.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>  4 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>          char product_number[16];
>          char product_name[32];
>          char                            serial[16];
> +
> +       struct amdgpu_autodump          autodump;
>  };
>
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..261b67ece7fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>  #include <linux/pci.h>
>  #include <linux/uaccess.h>
>  #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>  #include <drm/drm_debugfs.h>
>
>  #include "amdgpu.h"
> @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device 
> *adev,
>          return 0;
>  }
>
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +       unsigned long timeout = 600 * HZ;
> +       int ret;
> +
> + wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +       ret = 
> wait_for_completion_interruptible_timeout(&adev->autodump.dumping, 
> timeout);
> +       complete_all(&adev->autodump.dumping);
> +       if (ret == 0) {
> +               pr_err("autodump: timeout, move on to gpu recovery\n");
> +               return -ETIMEDOUT;
> +       }
> +#endif
> +       return 0;
> +}
> +
>  #if defined(CONFIG_DEBUG_FS)
>
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
> file *file)
> +{
> +       struct amdgpu_device *adev = inode->i_private;
> +       int ret;
> +
> +       file->private_data = adev;
> +
> +       mutex_lock(&adev->lock_reset);
> +       if (adev->autodump.dumping.done)
> +               ret = 0;
> +       else
> +               ret = -EBUSY;
> +       mutex_unlock(&adev->lock_reset);
> +
> +       return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, 
> struct file *file)
> +{
> +       struct amdgpu_device *adev = file->private_data;
> +
> +       complete_all(&adev->autodump.dumping);
> +       return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, 
> struct poll_table_struct *poll_table)
> +{
> +       struct amdgpu_device *adev = file->private_data;
> +
> + reinit_completion(&adev->autodump.dumping);
> +       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +       if (adev->in_gpu_reset)
> +               return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +       return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +       .owner = THIS_MODULE,
> +       .open = amdgpu_debugfs_autodump_open,
> +       .poll = amdgpu_debugfs_autodump_poll,
> +       .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +       init_completion(&adev->autodump.dumping);
> +       complete_all(&adev->autodump.dumping);
> + init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +       debugfs_create_file("amdgpu_autodump", 0600,
> + adev->ddev->primary->debugfs_root,
> +               adev, &autodump_debug_fops);
> +}
> +
>  /**
>   * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>   *
> @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>
>          amdgpu_ras_debugfs_create_all(adev);
>
> +       amdgpu_debugfs_autodump_init(adev);
> +
>          return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
> ARRAY_SIZE(amdgpu_debugfs_list));
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>          unsigned                num_files;
>  };
>
> +struct amdgpu_autodump {
> +       struct completion               dumping;
> +       struct wait_queue_head          gpu_hang;
> +};
> +
>  int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_init(struct amdgpu_device *adev);
>  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>  int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
>          int i, r = 0;
>          bool need_full_reset  = *need_full_reset_arg;
>
> +       amdgpu_debugfs_wait_dump(adev);
> +
>          /* block all schedulers and reset given job's ring */
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
> -- 
> 2.20.1
>


[-- Attachment #1.2: Type: text/html, Size: 20193 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-09  9:45 jianzh
  2020-05-13  8:16 ` Zhao, Jiange
@ 2020-05-13  8:19 ` Christian König
  2020-05-13  9:37   ` Zhao, Jiange
  1 sibling, 1 reply; 23+ messages in thread
From: Christian König @ 2020-05-13  8:19 UTC (permalink / raw)
  To: jianzh, amd-gfx
  Cc: alexander.deucher, pierre-eric.pelloux-prayer, Jiange Zhao,
	christian.koenig, monk.liu

Am 09.05.20 um 11:45 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.
>
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>      add 'app_state_mutex' for race conditions:
> 	(1)Only 1 user can open this file node
> 	(2)wait_dump() can only take effect after poll() executed.
> 	(3)eliminated the race condition between release() and
> 	   wait_dump()
>
> v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
>      removed state checking in amdgpu_debugfs_wait_dump
>      Improve on top of version 3 so that the node can be reopened.
>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2a806cb55b78..9e8eeddfe7ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,6 +992,8 @@ struct amdgpu_device {
>   	char				product_number[16];
>   	char				product_name[32];
>   	char				serial[16];
> +
> +	struct amdgpu_autodump		autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..261b67ece7fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>   
>   #include "amdgpu.h"
> @@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +	unsigned long timeout = 600 * HZ;
> +	int ret;
> +
> +	wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +	complete_all(&adev->autodump.dumping);
> +	if (ret == 0) {
> +		pr_err("autodump: timeout, move on to gpu recovery\n");
> +		return -ETIMEDOUT;
> +	}
> +#endif
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = inode->i_private;
> +	int ret;
> +
> +	file->private_data = adev;
> +
> +	mutex_lock(&adev->lock_reset);
> +	if (adev->autodump.dumping.done)
> +		ret = 0;
> +	else
> +		ret = -EBUSY;
> +	mutex_unlock(&adev->lock_reset);
> +
> +	return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	complete_all(&adev->autodump.dumping);
> +	return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	reinit_completion(&adev->autodump.dumping);

Why do you have the reinit_completion here and not in open callback?

Apart from that looks good to me.

Regards,
Christian.

> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +	if (adev->in_gpu_reset)
> +		return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +	.owner = THIS_MODULE,
> +	.open = amdgpu_debugfs_autodump_open,
> +	.poll = amdgpu_debugfs_autodump_poll,
> +	.release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +	init_completion(&adev->autodump.dumping);
> +	complete_all(&adev->autodump.dumping);
> +	init_waitqueue_head(&adev->autodump.gpu_hang);
> +
> +	debugfs_create_file("amdgpu_autodump", 0600,
> +		adev->ddev->primary->debugfs_root,
> +		adev, &autodump_debug_fops);
> +}
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   
>   	amdgpu_ras_debugfs_create_all(adev);
>   
> +	amdgpu_debugfs_autodump_init(adev);
> +
>   	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>   					ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..2803884d338d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,11 @@ struct amdgpu_debugfs {
>   	unsigned		num_files;
>   };
>   
> +struct amdgpu_autodump {
> +	struct completion		dumping;
> +	struct wait_queue_head		gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cc41e8f5ad14..545beebcf43e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> +	amdgpu_debugfs_wait_dump(adev);
> +
>   	/* block all schedulers and reset given job's ring */
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-09  9:45 jianzh
@ 2020-05-13  8:16 ` Zhao, Jiange
  2020-05-13  8:20   ` Christian König
  2020-05-13  8:19 ` Christian König
  1 sibling, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-05-13  8:16 UTC (permalink / raw)
  To: Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Pelloux-prayer, Pierre-eric, Koenig,
	Christian, Liu, Monk


[-- Attachment #1.1: Type: text/plain, Size: 8178 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi @Koenig, Christian<mailto:Christian.Koenig@amd.com>,

I made some changes on top of version 3 and tested it. Can you help review?

Jiange
________________________________
From: Zhao, Jiange <jianzh@amd.com>
Sent: Saturday, May 9, 2020 5:45 PM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Koenig, Christian <Christian.Koenig@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

From: Jiange Zhao <Jiange.Zhao@amd.com>

When GPU got timeout, it would notify an interested part
of an opportunity to dump info before actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
        (1)Only 1 user can open this file node
        (2)wait_dump() can only take effect after poll() executed.
        (3)eliminated the race condition between release() and
           wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
    removed state checking in amdgpu_debugfs_wait_dump
    Improve on top of version 3 so that the node can be reopened.

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
         char                            product_number[16];
         char                            product_name[32];
         char                            serial[16];
+
+       struct amdgpu_autodump          autodump;
 };

 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..261b67ece7fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>

 #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
         return 0;
 }

+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+       unsigned long timeout = 600 * HZ;
+       int ret;
+
+       wake_up_interruptible(&adev->autodump.gpu_hang);
+
+       ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+       complete_all(&adev->autodump.dumping);
+       if (ret == 0) {
+               pr_err("autodump: timeout, move on to gpu recovery\n");
+               return -ETIMEDOUT;
+       }
+#endif
+       return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)

+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *adev = inode->i_private;
+       int ret;
+
+       file->private_data = adev;
+
+       mutex_lock(&adev->lock_reset);
+       if (adev->autodump.dumping.done)
+               ret = 0;
+       else
+               ret = -EBUSY;
+       mutex_unlock(&adev->lock_reset);
+
+       return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *adev = file->private_data;
+
+       complete_all(&adev->autodump.dumping);
+       return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+       struct amdgpu_device *adev = file->private_data;
+
+       reinit_completion(&adev->autodump.dumping);
+       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+       if (adev->in_gpu_reset)
+               return POLLIN | POLLRDNORM | POLLWRNORM;
+
+       return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+       .owner = THIS_MODULE,
+       .open = amdgpu_debugfs_autodump_open,
+       .poll = amdgpu_debugfs_autodump_poll,
+       .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+       init_completion(&adev->autodump.dumping);
+       complete_all(&adev->autodump.dumping);
+       init_waitqueue_head(&adev->autodump.gpu_hang);
+
+       debugfs_create_file("amdgpu_autodump", 0600,
+               adev->ddev->primary->debugfs_root,
+               adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)

         amdgpu_ras_debugfs_create_all(adev);

+       amdgpu_debugfs_autodump_init(adev);
+
         return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
                                         ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..2803884d338d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
         unsigned                num_files;
 };

+struct amdgpu_autodump {
+       struct completion               dumping;
+       struct wait_queue_head          gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc41e8f5ad14..545beebcf43e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         int i, r = 0;
         bool need_full_reset  = *need_full_reset_arg;

+       amdgpu_debugfs_wait_dump(adev);
+
         /* block all schedulers and reset given job's ring */
         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                 struct amdgpu_ring *ring = adev->rings[i];
--
2.20.1


[-- Attachment #1.2: Type: text/html, Size: 14707 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
@ 2020-05-09  9:45 jianzh
  2020-05-13  8:16 ` Zhao, Jiange
  2020-05-13  8:19 ` Christian König
  0 siblings, 2 replies; 23+ messages in thread
From: jianzh @ 2020-05-09  9:45 UTC (permalink / raw)
  To: amd-gfx
  Cc: alexander.deucher, pierre-eric.pelloux-prayer, Jiange Zhao,
	christian.koenig, monk.liu

From: Jiange Zhao <Jiange.Zhao@amd.com>

When the GPU gets a timeout, it would notify an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
	(1)Only 1 user can open this file node
	(2)wait_dump() can only take effect after poll() executed.
	(3)eliminated the race condition between release() and
	   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
    removed state checking in amdgpu_debugfs_wait_dump
    Improve on top of version 3 so that the node can be reopened.

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
 	char				product_number[16];
 	char				product_name[32];
 	char				serial[16];
+
+	struct amdgpu_autodump		autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..261b67ece7fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>
 
 #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+	unsigned long timeout = 600 * HZ;
+	int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	complete_all(&adev->autodump.dumping);
+	if (ret == 0) {
+		pr_err("autodump: timeout, move on to gpu recovery\n");
+		return -ETIMEDOUT;
+	}
+#endif
+	return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = inode->i_private;
+	int ret;
+
+	file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done)
+		ret = 0;
+	else
+		ret = -EBUSY;
+	mutex_unlock(&adev->lock_reset);
+
+	return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+	return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	reinit_completion(&adev->autodump.dumping);
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+	if (adev->in_gpu_reset)
+		return POLLIN | POLLRDNORM | POLLWRNORM;
+
+	return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = amdgpu_debugfs_autodump_open,
+	.poll = amdgpu_debugfs_autodump_poll,
+	.release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	complete_all(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+		adev->ddev->primary->debugfs_root,
+		adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_debugfs_create_all(adev);
 
+	amdgpu_debugfs_autodump_init(adev);
+
 	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
 					ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..2803884d338d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
 	unsigned		num_files;
 };
 
+struct amdgpu_autodump {
+	struct completion		dumping;
+	struct wait_queue_head		gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index cc41e8f5ad14..545beebcf43e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-05-06  3:45       ` Zhao, Jiange
@ 2020-05-06  8:05         ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-05-06  8:05 UTC (permalink / raw)
  To: Zhao, Jiange, Pelloux-prayer, Pierre-eric, amd-gfx
  Cc: Deucher, Alexander, Kuehling, Felix, Liu, Monk, Zhang, Hawking

Am 06.05.20 um 05:45 schrieb Zhao, Jiange:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi Christian,
>
>> Hi Jiange, well that looks correct to me, but seems to be a bit too complicated. What exactly was wrong with version 3?
> (1) If you open amdgpu_autodump, use it and close it, then you can't open it again, because wait_for_completion_interruptible_timeout() would decrement amdgpu_autodump.dumping.done to 0, then .open() would always return failure except the first time.

In this case we should probably just use complete_all() instead of just 
complete(). So that the struct complete stays in the completed state.

> (2) reset lock is not optimal in this case. Because usermode app would take any operation at any time and there are so many race conditions to solve. A dedicated lock would be simpler and clearer.

I don't think that this will work. Using the reset lock is mandatory 
here or otherwise we always race between a new process opening the file 
and an ongoing GPU reset.

Just imagine what happens when the process which waited for the GPU 
reset event doesn't do a dump, but just closes and immediately reopens 
the file while the last reset is still ongoing.

What we could do here is using mutex_trylock() on the reset lock and 
return -EBUSY when a reset is ongoing. Or maybe better 
mutex_lock_interruptible().

>> Please completely drop this extra check. Waking up the queue and waiting for completion should always work when done right.
> This check is very necessary, because if there is no usermode app listening, the following wait_for_completion_interruptible_timeout() would wait until timeout anyway, which is 10 minutes for nothing. This is not what we wanted.

See the wait_event_* documentation, exactly that's what you should never do.

Instead just signal the struct completion with complete_all() directly 
after it is created. This way the wakeup is a no-op and waiting for the 
struct completion continues immediately.

Regards,
Christian.

>
> Jiange
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Wednesday, April 29, 2020 10:09 PM
> To: Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
> Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
>
> Am 29.04.20 um 16:04 schrieb Pierre-Eric Pelloux-Prayer:
>> Hi Jiange,
>>
>> This version seems to work fine.
>>
>> Tested-by: Pierre-Eric Pelloux-Prayer
>> <pierre-eric.pelloux-prayer@amd.com>
>>
>>
>> On 29/04/2020 07:08, Zhao, Jiange wrote:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>
>>> Hi all,
>>>
>>> I worked out the race condition and here is version 5. Please have a look.
>>>
>>> Jiange
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ---------------------------------------------------------------------
>>> ------------------------
>>> *From:* Zhao, Jiange <Jiange.Zhao@amd.com>
>>> *Sent:* Wednesday, April 29, 2020 1:06 PM
>>> *To:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>>> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix
>>> <Felix.Kuehling@amd.com>; Pelloux-prayer, Pierre-eric
>>> <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>;
>>> Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
>>> *Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu
>>> reset v4
>>>    
>>> From: Jiange Zhao <Jiange.Zhao@amd.com>
>>>
>>> When GPU got timeout, it would notify an interested part of an
>>> opportunity to dump info before actual GPU reset.
>>>
>>> A usermode app would open 'autodump' node under debugfs system and
>>> poll() for readable/writable. When a GPU reset is due, amdgpu would
>>> notify usermode app through wait_queue_head and give it 10 minutes to
>>> dump info.
>>>
>>> After usermode app has done its work, this 'autodump' node is closed.
>>> On node closure, amdgpu gets to know the dump is done through the
>>> completion that is triggered in release().
>>>
>>> There is no write or read callback because necessary info can be
>>> obtained through dmesg and umr. Messages back and forth between
>>> usermode app and amdgpu are unnecessary.
>>>
>>> v2: (1) changed 'registered' to 'app_listening'
>>>       (2) add a mutex in open() to prevent race condition
>>>
>>> v3 (chk): grab the reset lock to avoid race in autodump_open,
>>>             rename debugfs file to amdgpu_autodump,
>>>             provide autodump_read as well,
>>>             style and code cleanups
>>>
>>> v4: add 'bool app_listening' to differentiate situations, so that
>>>       the node can be reopened; also, there is no need to wait for
>>>       completion when no app is waiting for a dump.
>>>
>>> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>>>       add 'app_state_mutex' for race conditions:
>>>           (1)Only 1 user can open this file node
>>>           (2)wait_dump() can only take effect after poll() executed.
>>>           (3)eliminated the race condition between release() and
>>>              wait_dump()
> Hi Jiange, well that looks correct to me, but seems to be a bit too complicated. What exactly was wrong with version 3?
>
> One more comment below.
>
>>> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92
>>> ++++++++++++++++++++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>>>    4 files changed, 109 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index bc1e0fd71a09..6f8ef98c4b97 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -990,6 +990,8 @@ struct amdgpu_device {
>>>            char                            product_number[16];
>>>            char                            product_name[32];
>>>            char                            serial[16];
>>> +
>>> +       struct amdgpu_autodump          autodump;
>>>    };
>>>    
>>>    static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>>> ttm_bo_device *bdev) diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 1a4894fa3693..1d4a95e8ad5b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -27,7 +27,7 @@
>>>    #include <linux/pci.h>
>>>    #include <linux/uaccess.h>
>>>    #include <linux/pm_runtime.h>
>>> -
>>> +#include <linux/poll.h>
>>>    #include <drm/drm_debugfs.h>
>>>    
>>>    #include "amdgpu.h"
>>> @@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device
>>> *adev,
>>>            return 0;
>>>    }
>>>    
>>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if
>>> +defined(CONFIG_DEBUG_FS)
>>> +       unsigned long timeout = 600 * HZ;
>>> +       int ret;
>>> +
>>> +       mutex_lock(&adev->autodump.app_state_mutex);
>>> +       if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
>>> +               mutex_unlock(&adev->autodump.app_state_mutex);
>>> +               return 0;
>>> +       }
>>> +       mutex_unlock(&adev->autodump.app_state_mutex);
> Please completely drop this extra check. Waking up the queue and waiting for completion should always work when done right.
>
> Regards,
> Christian.
>
>>> +
>>> +       wake_up_interruptible(&adev->autodump.gpu_hang);
>>> +
>>> +       ret =
>>> +wait_for_completion_interruptible_timeout(&adev->autodump.dumping,
>>> +timeout);
>>> +       if (ret == 0) {
>>> +               pr_err("autodump: timeout, move on to gpu
>>> +recovery\n");
>>> +               return -ETIMEDOUT;
>>> +       }
>>> +#endif
>>> +       return 0;
>>> +}
>>> +
>>>    #if defined(CONFIG_DEBUG_FS)
>>>    
>>> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct
>>> +file *file) {
>>> +       struct amdgpu_device *adev = inode->i_private;
>>> +       int ret;
>>> +
>>> +       file->private_data = adev;
>>> +
>>> +       mutex_lock(&adev->autodump.app_state_mutex);
>>> +       if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
>>> +               adev->autodump.app_state =
>>> +AMDGPU_AUTODUMP_REGISTERED;
>>> +               ret = 0;
>>> +       } else {
>>> +               ret = -EBUSY;
>>> +       }
>>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>> +static int amdgpu_debugfs_autodump_release(struct inode *inode,
>>> +struct file *file) {
>>> +       struct amdgpu_device *adev = file->private_data;
>>> +
>>> +       mutex_lock(&adev->autodump.app_state_mutex);
>>> +       complete(&adev->autodump.dumping);
>>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>>> +       return 0;
>>> +}
>>> +
>>> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file,
>>> +struct poll_table_struct *poll_table) {
>>> +       struct amdgpu_device *adev = file->private_data;
>>> +
>>> +       mutex_lock(&adev->autodump.app_state_mutex);
>>> +       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
>>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>>> +
>>> +       if (adev->in_gpu_reset)
>>> +               return POLLIN | POLLRDNORM | POLLWRNORM;
>>> +
>>> +       return 0;
>>> +}
>>> +
>>> +static const struct file_operations autodump_debug_fops = {
>>> +       .owner = THIS_MODULE,
>>> +       .open = amdgpu_debugfs_autodump_open,
>>> +       .poll = amdgpu_debugfs_autodump_poll,
>>> +       .release = amdgpu_debugfs_autodump_release, };
>>> +
>>> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
>>> +{
>>> +       init_completion(&adev->autodump.dumping);
>>> +       init_waitqueue_head(&adev->autodump.gpu_hang);
>>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>>> +       mutex_init(&adev->autodump.app_state_mutex);
>>> +
>>> +       debugfs_create_file("amdgpu_autodump", 0600,
>>> +               adev->ddev->primary->debugfs_root,
>>> +               adev, &autodump_debug_fops); }
>>> +
>>>    /**
>>>     * amdgpu_debugfs_process_reg_op - Handle MMIO register
>>> reads/writes
>>>     *
>>> @@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>> *adev)
>>>    
>>>            amdgpu_ras_debugfs_create_all(adev);
>>>    
>>> +       amdgpu_debugfs_autodump_init(adev);
>>> +
>>>            return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>>>                                            
>>> ARRAY_SIZE(amdgpu_debugfs_list));
>>>    }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>>> index de12d1101526..51b4ea790686 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>>> @@ -31,6 +31,19 @@ struct amdgpu_debugfs {
>>>            unsigned                num_files;
>>>    };
>>>    
>>> +enum amdgpu_autodump_state {
>>> +       AMDGPU_AUTODUMP_NO_APP,
>>> +       AMDGPU_AUTODUMP_REGISTERED,
>>> +       AMDGPU_AUTODUMP_LISTENING
>>> +};
>>> +
>>> +struct amdgpu_autodump {
>>> +       struct mutex                    app_state_mutex;
>>> +       enum amdgpu_autodump_state      app_state;
>>> +       struct completion               dumping;
>>> +       struct wait_queue_head          gpu_hang; };
>>> +
>>>    int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>>>    int amdgpu_debugfs_init(struct amdgpu_device *adev);
>>>    void amdgpu_debugfs_fini(struct amdgpu_device *adev); @@ -40,3
>>> +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>>    int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>>    int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>>>    int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
>>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e6978a2c26b7..8109946075b1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct
>>> amdgpu_device *adev,
>>>            int i, r = 0;
>>>            bool need_full_reset  = *need_full_reset_arg;
>>>    
>>> +       amdgpu_debugfs_wait_dump(adev);
>>> +
>>>            /* block all schedulers and reset given job's ring */
>>>            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>                    struct amdgpu_ring *ring = adev->rings[i];
>>> --
>>> 2.20.1
>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-04-29 14:08     ` Christian König
@ 2020-05-06  3:45       ` Zhao, Jiange
  2020-05-06  8:05         ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-05-06  3:45 UTC (permalink / raw)
  To: Koenig, Christian, Pelloux-prayer, Pierre-eric, amd-gfx
  Cc: Deucher, Alexander, Kuehling, Felix, Liu, Monk, Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

> Hi Jiange, well that looks correct to me, but seems to be a bit too complicated. What exactly was wrong with version 3?
(1) If you open amdgpu_autodump, use it and close it, then you can't open it again, because wait_for_completion_interruptible_timeout() would decrement amdgpu_autodump.dumping.done to 0, then .open() would always return failure except the first time.
(2) reset lock is not optimal in this case. Because the usermode app could perform any operation at any time, there are many race conditions to solve. A dedicated lock would be simpler and clearer.

> Please completely drop this extra check. Waking up the queue and waiting for completion should always work when done right.
This check is very necessary, because if there is no usermode app listening, the following wait_for_completion_interruptible_timeout() would wait until timeout anyway, which is 10 minutes for nothing. This is not what we wanted.

Jiange

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Wednesday, April 29, 2020 10:09 PM
To: Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 29.04.20 um 16:04 schrieb Pierre-Eric Pelloux-Prayer:
> Hi Jiange,
>
> This version seems to work fine.
>
> Tested-by: Pierre-Eric Pelloux-Prayer 
> <pierre-eric.pelloux-prayer@amd.com>
>
>
> On 29/04/2020 07:08, Zhao, Jiange wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>
>> Hi all,
>>
>> I worked out the race condition and here is version 5. Please have a look.
>>
>> Jiange
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ---------------------------------------------------------------------
>> ------------------------
>> *From:* Zhao, Jiange <Jiange.Zhao@amd.com>
>> *Sent:* Wednesday, April 29, 2020 1:06 PM
>> *To:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix 
>> <Felix.Kuehling@amd.com>; Pelloux-prayer, Pierre-eric 
>> <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; 
>> Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
>> *Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu 
>> reset v4
>>   
>> From: Jiange Zhao <Jiange.Zhao@amd.com>
>>
>> When GPU got timeout, it would notify an interested party of an 
>> opportunity to dump info before actual GPU reset.
>>
>> A usermode app would open 'autodump' node under debugfs system and 
>> poll() for readable/writable. When a GPU reset is due, amdgpu would 
>> notify usermode app through wait_queue_head and give it 10 minutes to 
>> dump info.
>>
>> After usermode app has done its work, this 'autodump' node is closed.
>> On node closure, amdgpu gets to know the dump is done through the 
>> completion that is triggered in release().
>>
>> There is no write or read callback because necessary info can be 
>> obtained through dmesg and umr. Messages back and forth between 
>> usermode app and amdgpu are unnecessary.
>>
>> v2: (1) changed 'registered' to 'app_listening'
>>      (2) add a mutex in open() to prevent race condition
>>
>> v3 (chk): grab the reset lock to avoid race in autodump_open,
>>            rename debugfs file to amdgpu_autodump,
>>            provide autodump_read as well,
>>            style and code cleanups
>>
>> v4: add 'bool app_listening' to differentiate situations, so that
>>      the node can be reopened; also, there is no need to wait for
>>      completion when no app is waiting for a dump.
>>
>> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>>      add 'app_state_mutex' for race conditions:
>>          (1)Only 1 user can open this file node
>>          (2)wait_dump() can only take effect after poll() executed.
>>          (3)eliminated the race condition between release() and
>>             wait_dump()

Hi Jiange, well that looks correct to me, but seems to be a bit too complicated. What exactly was wrong with version 3?

One more comment below.

>>
>> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 
>> ++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>>   4 files changed, 109 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index bc1e0fd71a09..6f8ef98c4b97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -990,6 +990,8 @@ struct amdgpu_device {
>>           char                            product_number[16];
>>           char                            product_name[32];
>>           char                            serial[16];
>> +
>> +       struct amdgpu_autodump          autodump;
>>   };
>>   
>>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
>> ttm_bo_device *bdev) diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 1a4894fa3693..1d4a95e8ad5b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -27,7 +27,7 @@
>>   #include <linux/pci.h>
>>   #include <linux/uaccess.h>
>>   #include <linux/pm_runtime.h>
>> -
>> +#include <linux/poll.h>
>>   #include <drm/drm_debugfs.h>
>>   
>>   #include "amdgpu.h"
>> @@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device 
>> *adev,
>>           return 0;
>>   }
>>   
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if 
>> +defined(CONFIG_DEBUG_FS)
>> +       unsigned long timeout = 600 * HZ;
>> +       int ret;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
>> +               mutex_unlock(&adev->autodump.app_state_mutex);
>> +               return 0;
>> +       }
>> +       mutex_unlock(&adev->autodump.app_state_mutex);

Please completely drop this extra check. Waking up the queue and waiting for completion should always work when done right.

Regards,
Christian.

>> +
>> +       wake_up_interruptible(&adev->autodump.gpu_hang);
>> +
>> +       ret = 
>> +wait_for_completion_interruptible_timeout(&adev->autodump.dumping, 
>> +timeout);
>> +       if (ret == 0) {
>> +               pr_err("autodump: timeout, move on to gpu 
>> +recovery\n");
>> +               return -ETIMEDOUT;
>> +       }
>> +#endif
>> +       return 0;
>> +}
>> +
>>   #if defined(CONFIG_DEBUG_FS)
>>   
>> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
>> +file *file) {
>> +       struct amdgpu_device *adev = inode->i_private;
>> +       int ret;
>> +
>> +       file->private_data = adev;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
>> +               adev->autodump.app_state = 
>> +AMDGPU_AUTODUMP_REGISTERED;
>> +               ret = 0;
>> +       } else {
>> +               ret = -EBUSY;
>> +       }
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +
>> +       return ret;
>> +}
>> +
>> +static int amdgpu_debugfs_autodump_release(struct inode *inode, 
>> +struct file *file) {
>> +       struct amdgpu_device *adev = file->private_data;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       complete(&adev->autodump.dumping);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +       return 0;
>> +}
>> +
>> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, 
>> +struct poll_table_struct *poll_table) {
>> +       struct amdgpu_device *adev = file->private_data;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +
>> +       if (adev->in_gpu_reset)
>> +               return POLLIN | POLLRDNORM | POLLWRNORM;
>> +
>> +       return 0;
>> +}
>> +
>> +static const struct file_operations autodump_debug_fops = {
>> +       .owner = THIS_MODULE,
>> +       .open = amdgpu_debugfs_autodump_open,
>> +       .poll = amdgpu_debugfs_autodump_poll,
>> +       .release = amdgpu_debugfs_autodump_release, };
>> +
>> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev) 
>> +{
>> +       init_completion(&adev->autodump.dumping);
>> +       init_waitqueue_head(&adev->autodump.gpu_hang);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>> +       mutex_init(&adev->autodump.app_state_mutex);
>> +
>> +       debugfs_create_file("amdgpu_autodump", 0600,
>> +               adev->ddev->primary->debugfs_root,
>> +               adev, &autodump_debug_fops); }
>> +
>>   /**
>>    * amdgpu_debugfs_process_reg_op - Handle MMIO register 
>> reads/writes
>>    *
>> @@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>> *adev)
>>   
>>           amdgpu_ras_debugfs_create_all(adev);
>>   
>> +       amdgpu_debugfs_autodump_init(adev);
>> +
>>           return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>>                                           
>> ARRAY_SIZE(amdgpu_debugfs_list));
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> index de12d1101526..51b4ea790686 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> @@ -31,6 +31,19 @@ struct amdgpu_debugfs {
>>           unsigned                num_files;
>>   };
>>   
>> +enum amdgpu_autodump_state {
>> +       AMDGPU_AUTODUMP_NO_APP,
>> +       AMDGPU_AUTODUMP_REGISTERED,
>> +       AMDGPU_AUTODUMP_LISTENING
>> +};
>> +
>> +struct amdgpu_autodump {
>> +       struct mutex                    app_state_mutex;
>> +       enum amdgpu_autodump_state      app_state;
>> +       struct completion               dumping;
>> +       struct wait_queue_head          gpu_hang; };
>> +
>>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>>   void amdgpu_debugfs_fini(struct amdgpu_device *adev); @@ -40,3 
>> +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e6978a2c26b7..8109946075b1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>           int i, r = 0;
>>           bool need_full_reset  = *need_full_reset_arg;
>>   
>> +       amdgpu_debugfs_wait_dump(adev);
>> +
>>           /* block all schedulers and reset given job's ring */
>>           for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>                   struct amdgpu_ring *ring = adev->rings[i];
>> --
>> 2.20.1
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-04-29 14:04   ` Pierre-Eric Pelloux-Prayer
@ 2020-04-29 14:08     ` Christian König
  2020-05-06  3:45       ` Zhao, Jiange
  0 siblings, 1 reply; 23+ messages in thread
From: Christian König @ 2020-04-29 14:08 UTC (permalink / raw)
  To: Pierre-Eric Pelloux-Prayer, Zhao, Jiange, amd-gfx
  Cc: Deucher, Alexander, Kuehling, Felix, Liu, Monk, Zhang, Hawking

Am 29.04.20 um 16:04 schrieb Pierre-Eric Pelloux-Prayer:
> Hi Jiange,
>
> This version seems to work fine.
>
> Tested-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
>
>
> On 29/04/2020 07:08, Zhao, Jiange wrote:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>
>> Hi all,
>>
>> I worked out the race condition and here is version 5. Please have a look.
>>
>> Jiange
>> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>> *From:* Zhao, Jiange <Jiange.Zhao@amd.com>
>> *Sent:* Wednesday, April 29, 2020 1:06 PM
>> *To:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
>> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
>> *Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
>>   
>> From: Jiange Zhao <Jiange.Zhao@amd.com>
>>
>> When GPU got timeout, it would notify an interested party
>> of an opportunity to dump info before actual GPU reset.
>>
>> A usermode app would open 'autodump' node under debugfs system
>> and poll() for readable/writable. When a GPU reset is due,
>> amdgpu would notify usermode app through wait_queue_head and give
>> it 10 minutes to dump info.
>>
>> After usermode app has done its work, this 'autodump' node is closed.
>> On node closure, amdgpu gets to know the dump is done through
>> the completion that is triggered in release().
>>
>> There is no write or read callback because necessary info can be
>> obtained through dmesg and umr. Messages back and forth between
>> usermode app and amdgpu are unnecessary.
>>
>> v2: (1) changed 'registered' to 'app_listening'
>>      (2) add a mutex in open() to prevent race condition
>>
>> v3 (chk): grab the reset lock to avoid race in autodump_open,
>>            rename debugfs file to amdgpu_autodump,
>>            provide autodump_read as well,
>>            style and code cleanups
>>
>> v4: add 'bool app_listening' to differentiate situations, so that
>>      the node can be reopened; also, there is no need to wait for
>>      completion when no app is waiting for a dump.
>>
>> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>>      add 'app_state_mutex' for race conditions:
>>          (1)Only 1 user can open this file node
>>          (2)wait_dump() can only take effect after poll() executed.
>>          (3)eliminated the race condition between release() and
>>             wait_dump()

Hi Jiange, well that looks correct to me, but seems to be a bit too 
complicated. What exactly was wrong with version 3?

One more comment below.

>>
>> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 ++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>>   4 files changed, 109 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index bc1e0fd71a09..6f8ef98c4b97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -990,6 +990,8 @@ struct amdgpu_device {
>>           char                            product_number[16];
>>           char                            product_name[32];
>>           char                            serial[16];
>> +
>> +       struct amdgpu_autodump          autodump;
>>   };
>>   
>>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 1a4894fa3693..1d4a95e8ad5b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -27,7 +27,7 @@
>>   #include <linux/pci.h>
>>   #include <linux/uaccess.h>
>>   #include <linux/pm_runtime.h>
>> -
>> +#include <linux/poll.h>
>>   #include <drm/drm_debugfs.h>
>>   
>>   #include "amdgpu.h"
>> @@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>           return 0;
>>   }
>>   
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
>> +{
>> +#if defined(CONFIG_DEBUG_FS)
>> +       unsigned long timeout = 600 * HZ;
>> +       int ret;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
>> +               mutex_unlock(&adev->autodump.app_state_mutex);
>> +               return 0;
>> +       }
>> +       mutex_unlock(&adev->autodump.app_state_mutex);

Please completely drop this extra check. Waking up the queue and waiting 
for completion should always work when done right.

Regards,
Christian.

>> +
>> +       wake_up_interruptible(&adev->autodump.gpu_hang);
>> +
>> +       ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
>> +       if (ret == 0) {
>> +               pr_err("autodump: timeout, move on to gpu recovery\n");
>> +               return -ETIMEDOUT;
>> +       }
>> +#endif
>> +       return 0;
>> +}
>> +
>>   #if defined(CONFIG_DEBUG_FS)
>>   
>> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
>> +{
>> +       struct amdgpu_device *adev = inode->i_private;
>> +       int ret;
>> +
>> +       file->private_data = adev;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
>> +               adev->autodump.app_state = AMDGPU_AUTODUMP_REGISTERED;
>> +               ret = 0;
>> +       } else {
>> +               ret = -EBUSY;
>> +       }
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +
>> +       return ret;
>> +}
>> +
>> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
>> +{
>> +       struct amdgpu_device *adev = file->private_data;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       complete(&adev->autodump.dumping);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +       return 0;
>> +}
>> +
>> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
>> +{
>> +       struct amdgpu_device *adev = file->private_data;
>> +
>> +       mutex_lock(&adev->autodump.app_state_mutex);
>> +       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
>> +       mutex_unlock(&adev->autodump.app_state_mutex);
>> +
>> +       if (adev->in_gpu_reset)
>> +               return POLLIN | POLLRDNORM | POLLWRNORM;
>> +
>> +       return 0;
>> +}
>> +
>> +static const struct file_operations autodump_debug_fops = {
>> +       .owner = THIS_MODULE,
>> +       .open = amdgpu_debugfs_autodump_open,
>> +       .poll = amdgpu_debugfs_autodump_poll,
>> +       .release = amdgpu_debugfs_autodump_release,
>> +};
>> +
>> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
>> +{
>> +       init_completion(&adev->autodump.dumping);
>> +       init_waitqueue_head(&adev->autodump.gpu_hang);
>> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
>> +       mutex_init(&adev->autodump.app_state_mutex);
>> +
>> +       debugfs_create_file("amdgpu_autodump", 0600,
>> +               adev->ddev->primary->debugfs_root,
>> +               adev, &autodump_debug_fops);
>> +}
>> +
>>   /**
>>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>>    *
>> @@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>   
>>           amdgpu_ras_debugfs_create_all(adev);
>>   
>> +       amdgpu_debugfs_autodump_init(adev);
>> +
>>           return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>>                                           ARRAY_SIZE(amdgpu_debugfs_list));
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> index de12d1101526..51b4ea790686 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
>> @@ -31,6 +31,19 @@ struct amdgpu_debugfs {
>>           unsigned                num_files;
>>   };
>>   
>> +enum amdgpu_autodump_state {
>> +       AMDGPU_AUTODUMP_NO_APP,
>> +       AMDGPU_AUTODUMP_REGISTERED,
>> +       AMDGPU_AUTODUMP_LISTENING
>> +};
>> +
>> +struct amdgpu_autodump {
>> +       struct mutex                    app_state_mutex;
>> +       enum amdgpu_autodump_state      app_state;
>> +       struct completion               dumping;
>> +       struct wait_queue_head          gpu_hang;
>> +};
>> +
>>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
>> @@ -40,3 +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
>> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e6978a2c26b7..8109946075b1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>           int i, r = 0;
>>           bool need_full_reset  = *need_full_reset_arg;
>>   
>> +       amdgpu_debugfs_wait_dump(adev);
>> +
>>           /* block all schedulers and reset given job's ring */
>>           for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>                   struct amdgpu_ring *ring = adev->rings[i];
>> -- 
>> 2.20.1
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-04-29  5:08 ` Zhao, Jiange
@ 2020-04-29 14:04   ` Pierre-Eric Pelloux-Prayer
  2020-04-29 14:08     ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: Pierre-Eric Pelloux-Prayer @ 2020-04-29 14:04 UTC (permalink / raw)
  To: Zhao, Jiange, amd-gfx
  Cc: Pelloux-prayer, Pierre-eric, Kuehling, Felix, Deucher, Alexander,
	Koenig, Christian, Liu, Monk, Zhang, Hawking

Hi Jiange,

This version seems to work fine.

Tested-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>


On 29/04/2020 07:08, Zhao, Jiange wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> 
> Hi all,
> 
> I worked out the race condition and here is version 5. Please have a look.
> 
> Jiange
> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> *From:* Zhao, Jiange <Jiange.Zhao@amd.com>
> *Sent:* Wednesday, April 29, 2020 1:06 PM
> *To:* amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> *Cc:* Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
> *Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
>  
> From: Jiange Zhao <Jiange.Zhao@amd.com>
> 
> When GPU got timeout, it would notify an interested party
> of an opportunity to dump info before actual GPU reset.
> 
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
> 
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
> 
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
> 
> v2: (1) changed 'registered' to 'app_listening'
>     (2) add a mutex in open() to prevent race condition
> 
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>           rename debugfs file to amdgpu_autodump,
>           provide autodump_read as well,
>           style and code cleanups
> 
> v4: add 'bool app_listening' to differentiate situations, so that
>     the node can be reopened; also, there is no need to wait for
>     completion when no app is waiting for a dump.
> 
> v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
>     add 'app_state_mutex' for race conditions:
>         (1)Only 1 user can open this file node
>         (2)wait_dump() can only take effect after poll() executed.
>         (3)eliminated the race condition between release() and
>            wait_dump()
> 
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 ++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>  4 files changed, 109 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc1e0fd71a09..6f8ef98c4b97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -990,6 +990,8 @@ struct amdgpu_device {
>          char                            product_number[16];
>          char                            product_name[32];
>          char                            serial[16];
> +
> +       struct amdgpu_autodump          autodump;
>  };
>  
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..1d4a95e8ad5b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>  #include <linux/pci.h>
>  #include <linux/uaccess.h>
>  #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>  #include <drm/drm_debugfs.h>
>  
>  #include "amdgpu.h"
> @@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>          return 0;
>  }
>  
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> +       unsigned long timeout = 600 * HZ;
> +       int ret;
> +
> +       mutex_lock(&adev->autodump.app_state_mutex);
> +       if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
> +               mutex_unlock(&adev->autodump.app_state_mutex);
> +               return 0;
> +       }
> +       mutex_unlock(&adev->autodump.app_state_mutex);
> +
> +       wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +       ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +       if (ret == 0) {
> +               pr_err("autodump: timeout, move on to gpu recovery\n");
> +               return -ETIMEDOUT;
> +       }
> +#endif
> +       return 0;
> +}
> +
>  #if defined(CONFIG_DEBUG_FS)
>  
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +       struct amdgpu_device *adev = inode->i_private;
> +       int ret;
> +
> +       file->private_data = adev;
> +
> +       mutex_lock(&adev->autodump.app_state_mutex);
> +       if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
> +               adev->autodump.app_state = AMDGPU_AUTODUMP_REGISTERED;
> +               ret = 0;
> +       } else {
> +               ret = -EBUSY;
> +       }
> +       mutex_unlock(&adev->autodump.app_state_mutex);
> +
> +       return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +       struct amdgpu_device *adev = file->private_data;
> +
> +       mutex_lock(&adev->autodump.app_state_mutex);
> +       complete(&adev->autodump.dumping);
> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
> +       mutex_unlock(&adev->autodump.app_state_mutex);
> +       return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +       struct amdgpu_device *adev = file->private_data;
> +
> +       mutex_lock(&adev->autodump.app_state_mutex);
> +       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +       adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
> +       mutex_unlock(&adev->autodump.app_state_mutex);
> +
> +       if (adev->in_gpu_reset)
> +               return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +       return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +       .owner = THIS_MODULE,
> +       .open = amdgpu_debugfs_autodump_open,
> +       .poll = amdgpu_debugfs_autodump_poll,
> +       .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +       init_completion(&adev->autodump.dumping);
> +       init_waitqueue_head(&adev->autodump.gpu_hang);
> +       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
> +       mutex_init(&adev->autodump.app_state_mutex);
> +
> +       debugfs_create_file("amdgpu_autodump", 0600,
> +               adev->ddev->primary->debugfs_root,
> +               adev, &autodump_debug_fops);
> +}
> +
>  /**
>   * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>   *
> @@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>  
>          amdgpu_ras_debugfs_create_all(adev);
>  
> +       amdgpu_debugfs_autodump_init(adev);
> +
>          return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>                                          ARRAY_SIZE(amdgpu_debugfs_list));
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..51b4ea790686 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,19 @@ struct amdgpu_debugfs {
>          unsigned                num_files;
>  };
>  
> +enum amdgpu_autodump_state {
> +       AMDGPU_AUTODUMP_NO_APP,
> +       AMDGPU_AUTODUMP_REGISTERED,
> +       AMDGPU_AUTODUMP_LISTENING
> +};
> +
> +struct amdgpu_autodump {
> +       struct mutex                    app_state_mutex;
> +       enum amdgpu_autodump_state      app_state;
> +       struct completion               dumping;
> +       struct wait_queue_head          gpu_hang;
> +};
> +
>  int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_init(struct amdgpu_device *adev);
>  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>  int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>  int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e6978a2c26b7..8109946075b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>          int i, r = 0;
>          bool need_full_reset  = *need_full_reset_arg;
>  
> +       amdgpu_debugfs_wait_dump(adev);
> +
>          /* block all schedulers and reset given job's ring */
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
> -- 
> 2.20.1
> 
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-04-29  5:06 jianzh
@ 2020-04-29  5:08 ` Zhao, Jiange
  2020-04-29 14:04   ` Pierre-Eric Pelloux-Prayer
  0 siblings, 1 reply; 23+ messages in thread
From: Zhao, Jiange @ 2020-04-29  5:08 UTC (permalink / raw)
  To: amd-gfx
  Cc: Pelloux-prayer, Pierre-eric, Kuehling, Felix, Deucher, Alexander,
	Koenig, Christian, Liu,  Monk, Zhang, Hawking


[-- Attachment #1.1: Type: text/plain, Size: 9017 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi all,

I worked out the race condition and here is version 5. Please have a look.

Jiange
________________________________
From: Zhao, Jiange <Jiange.Zhao@amd.com>
Sent: Wednesday, April 29, 2020 1:06 PM
To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Koenig, Christian <Christian.Koenig@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; Pelloux-prayer, Pierre-eric <Pierre-eric.Pelloux-prayer@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Zhao, Jiange <Jiange.Zhao@amd.com>
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

From: Jiange Zhao <Jiange.Zhao@amd.com>

When the GPU gets a timeout, it will notify an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open the 'autodump' node under the debugfs system
and poll() it for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
        (1)Only 1 user can open this file node
        (2)wait_dump() can only take effect after poll() executed.
        (3)eliminated the race condition between release() and
           wait_dump()

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
         char                            product_number[16];
         char                            product_name[32];
         char                            serial[16];
+
+       struct amdgpu_autodump          autodump;
 };

 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..1d4a95e8ad5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>

 #include "amdgpu.h"
@@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
         return 0;
 }

+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+       unsigned long timeout = 600 * HZ;
+       int ret;
+
+       mutex_lock(&adev->autodump.app_state_mutex);
+       if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
+               mutex_unlock(&adev->autodump.app_state_mutex);
+               return 0;
+       }
+       mutex_unlock(&adev->autodump.app_state_mutex);
+
+       wake_up_interruptible(&adev->autodump.gpu_hang);
+
+       ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+       if (ret == 0) {
+               pr_err("autodump: timeout, move on to gpu recovery\n");
+               return -ETIMEDOUT;
+       }
+#endif
+       return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)

+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *adev = inode->i_private;
+       int ret;
+
+       file->private_data = adev;
+
+       mutex_lock(&adev->autodump.app_state_mutex);
+       if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
+               adev->autodump.app_state = AMDGPU_AUTODUMP_REGISTERED;
+               ret = 0;
+       } else {
+               ret = -EBUSY;
+       }
+       mutex_unlock(&adev->autodump.app_state_mutex);
+
+       return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *adev = file->private_data;
+
+       mutex_lock(&adev->autodump.app_state_mutex);
+       complete(&adev->autodump.dumping);
+       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
+       mutex_unlock(&adev->autodump.app_state_mutex);
+       return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+       struct amdgpu_device *adev = file->private_data;
+
+       mutex_lock(&adev->autodump.app_state_mutex);
+       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+       adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
+       mutex_unlock(&adev->autodump.app_state_mutex);
+
+       if (adev->in_gpu_reset)
+               return POLLIN | POLLRDNORM | POLLWRNORM;
+
+       return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+       .owner = THIS_MODULE,
+       .open = amdgpu_debugfs_autodump_open,
+       .poll = amdgpu_debugfs_autodump_poll,
+       .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+       init_completion(&adev->autodump.dumping);
+       init_waitqueue_head(&adev->autodump.gpu_hang);
+       adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
+       mutex_init(&adev->autodump.app_state_mutex);
+
+       debugfs_create_file("amdgpu_autodump", 0600,
+               adev->ddev->primary->debugfs_root,
+               adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)

         amdgpu_ras_debugfs_create_all(adev);

+       amdgpu_debugfs_autodump_init(adev);
+
         return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
                                         ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..51b4ea790686 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,19 @@ struct amdgpu_debugfs {
         unsigned                num_files;
 };

+enum amdgpu_autodump_state {
+       AMDGPU_AUTODUMP_NO_APP,
+       AMDGPU_AUTODUMP_REGISTERED,
+       AMDGPU_AUTODUMP_LISTENING
+};
+
+struct amdgpu_autodump {
+       struct mutex                    app_state_mutex;
+       enum amdgpu_autodump_state      app_state;
+       struct completion               dumping;
+       struct wait_queue_head          gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6978a2c26b7..8109946075b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         int i, r = 0;
         bool need_full_reset  = *need_full_reset_arg;

+       amdgpu_debugfs_wait_dump(adev);
+
         /* block all schedulers and reset given job's ring */
         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                 struct amdgpu_ring *ring = adev->rings[i];
--
2.20.1


[-- Attachment #1.2: Type: text/html, Size: 16431 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
@ 2020-04-29  5:06 jianzh
  2020-04-29  5:08 ` Zhao, Jiange
  0 siblings, 1 reply; 23+ messages in thread
From: jianzh @ 2020-04-29  5:06 UTC (permalink / raw)
  To: amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

From: Jiange Zhao <Jiange.Zhao@amd.com>

When the GPU gets a timeout, it will notify an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
    add 'app_state_mutex' for race conditions:
	(1)Only 1 user can open this file node
	(2)wait_dump() can only take effect after poll() executed.
	(3)eliminated the race condition between release() and
	   wait_dump()

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
 	char				product_number[16];
 	char				product_name[32];
 	char				serial[16];
+
+	struct amdgpu_autodump		autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..1d4a95e8ad5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>
 
 #include "amdgpu.h"
@@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+	unsigned long timeout = 600 * HZ;
+	int ret;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
+		mutex_unlock(&adev->autodump.app_state_mutex);
+		return 0;
+	}
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	if (ret == 0) {
+		pr_err("autodump: timeout, move on to gpu recovery\n");
+		return -ETIMEDOUT;
+	}
+#endif
+	return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = inode->i_private;
+	int ret;
+
+	file->private_data = adev;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
+		adev->autodump.app_state = AMDGPU_AUTODUMP_REGISTERED;
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+	return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	complete(&adev->autodump.dumping);
+	adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
+	mutex_unlock(&adev->autodump.app_state_mutex);
+	return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+	adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+	if (adev->in_gpu_reset)
+		return POLLIN | POLLRDNORM | POLLWRNORM;
+
+	return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = amdgpu_debugfs_autodump_open,
+	.poll = amdgpu_debugfs_autodump_poll,
+	.release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+	adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
+	mutex_init(&adev->autodump.app_state_mutex);
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+		adev->ddev->primary->debugfs_root,
+		adev, &autodump_debug_fops);
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1522,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_debugfs_create_all(adev);
 
+	amdgpu_debugfs_autodump_init(adev);
+
 	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
 					ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..51b4ea790686 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,19 @@ struct amdgpu_debugfs {
 	unsigned		num_files;
 };
 
+enum amdgpu_autodump_state {
+	AMDGPU_AUTODUMP_NO_APP,
+	AMDGPU_AUTODUMP_REGISTERED,
+	AMDGPU_AUTODUMP_LISTENING
+};
+
+struct amdgpu_autodump {
+	struct mutex			app_state_mutex;
+	enum amdgpu_autodump_state	app_state;
+	struct completion		dumping;
+	struct wait_queue_head		gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +53,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6978a2c26b7..8109946075b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
  2020-04-26 10:09 jianzh
@ 2020-04-26 10:40 ` Christian König
  0 siblings, 0 replies; 23+ messages in thread
From: Christian König @ 2020-04-26 10:40 UTC (permalink / raw)
  To: jianzh, amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Monk.Liu, Hawking.Zhang

Am 26.04.20 um 12:09 schrieb jianzh@amd.com:
> From: Jiange Zhao <Jiange.Zhao@amd.com>
>
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
>
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
>
> v2: (1) changed 'registered' to 'app_listening'
>      (2) add a mutex in open() to prevent race condition
>
> v3 (chk): grab the reset lock to avoid race in autodump_open,
>            rename debugfs file to amdgpu_autodump,
>            provide autodump_read as well,
>            style and code cleanups
>
> v4: add 'bool app_listening' to differentiate situations, so that
>      the node can be reopened; also, there is no need to wait for
>      completion when no app is waiting for a dump.

NAK, exactly that is racy and should be avoided.

What problem are you seeing here?

Regards,
Christian.

>
> Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 82 ++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  7 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 92 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc1e0fd71a09..6f8ef98c4b97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -990,6 +990,8 @@ struct amdgpu_device {
>   	char				product_number[16];
>   	char				product_name[32];
>   	char				serial[16];
> +
> +	struct amdgpu_autodump		autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..04720264e8b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,7 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -
> +#include <linux/poll.h>
>   #include <drm/drm_debugfs.h>
>   
>   #include "amdgpu.h"
> @@ -74,7 +74,85 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
>   #if defined(CONFIG_DEBUG_FS)
> +	unsigned long timeout = 600 * HZ;
> +	int ret;
> +
> +	if (!adev->autodump.app_listening)
> +		return 0;
> +
> +	wake_up_interruptible(&adev->autodump.gpu_hang);
> +
> +	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> +	if (ret == 0) {
> +		pr_err("autodump: timeout, move on to gpu recovery\n");
> +		return -ETIMEDOUT;
> +	}
> +#endif
> +	return 0;
> +}
> +
> +#if defined(CONFIG_DEBUG_FS)
> +
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = inode->i_private;
> +	int ret;
> +
> +	file->private_data = adev;
> +
> +	mutex_lock(&adev->lock_reset);
> +	if (!adev->autodump.app_listening) {
> +		adev->autodump.app_listening = true;
> +		ret = 0;
> +	} else {
> +		ret = -EBUSY;
> +	}
> +	mutex_unlock(&adev->lock_reset);
> +
> +	return ret;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	complete(&adev->autodump.dumping);
> +	adev->autodump.app_listening = false;
> +	return 0;
> +}
> +
> +static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> +{
> +	struct amdgpu_device *adev = file->private_data;
> +
> +	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> +
> +	if (adev->in_gpu_reset)
> +		return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> +	.owner = THIS_MODULE,
> +	.open = amdgpu_debugfs_autodump_open,
> +	.poll = amdgpu_debugfs_autodump_poll,
> +	.release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> +	init_completion(&adev->autodump.dumping);
> +	init_waitqueue_head(&adev->autodump.gpu_hang);
> +	adev->autodump.app_listening = false;
> +
> +	debugfs_create_file("amdgpu_autodump", 0600,
> +		adev->ddev->primary->debugfs_root,
> +		adev, &autodump_debug_fops);
> +}
>   
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> @@ -1434,6 +1512,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   
>   	amdgpu_ras_debugfs_create_all(adev);
>   
> +	amdgpu_debugfs_autodump_init(adev);
> +
>   	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
>   					ARRAY_SIZE(amdgpu_debugfs_list));
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index de12d1101526..72f6fe5c7596 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -31,6 +31,12 @@ struct amdgpu_debugfs {
>   	unsigned		num_files;
>   };
>   
> +struct amdgpu_autodump {
> +	bool			app_listening;
> +	struct completion       dumping;
> +	struct wait_queue_head  gpu_hang;
> +};
> +
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> @@ -40,3 +46,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e6978a2c26b7..8109946075b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool need_full_reset  = *need_full_reset_arg;
>   
> +	amdgpu_debugfs_wait_dump(adev);
> +
>   	/* block all schedulers and reset given job's ring */
>   	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   		struct amdgpu_ring *ring = adev->rings[i];

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4
@ 2020-04-26 10:09 jianzh
  2020-04-26 10:40 ` Christian König
  0 siblings, 1 reply; 23+ messages in thread
From: jianzh @ 2020-04-26 10:09 UTC (permalink / raw)
  To: amd-gfx
  Cc: Pierre-eric.Pelloux-prayer, Jiange Zhao, Felix.Kuehling,
	Alexander.Deucher, Christian.Koenig, Monk.Liu, Hawking.Zhang

From: Jiange Zhao <Jiange.Zhao@amd.com>

When the GPU gets a timeout, it will notify an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
    (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
          rename debugfs file to amdgpu_autodump,
          provide autodump_read as well,
          style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
    the node can be reopened; also, there is no need to wait for
    completion when no app is waiting for a dump.

Signed-off-by: Jiange Zhao <Jiange.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 82 ++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
 	char				product_number[16];
 	char				product_name[32];
 	char				serial[16];
+
+	struct amdgpu_autodump		autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..04720264e8b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
 #include <drm/drm_debugfs.h>
 
 #include "amdgpu.h"
@@ -74,7 +74,85 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+/**
+ * amdgpu_debugfs_wait_dump - let a listening usermode app dump info before reset
+ * @adev: amdgpu device
+ *
+ * Called on the GPU reset path.  If a usermode app has the amdgpu_autodump
+ * debugfs node open (app_listening), wake it via the gpu_hang wait queue and
+ * block until it signals completion by closing the node (release() completes
+ * &adev->autodump.dumping), with a 10 minute timeout.
+ *
+ * Returns 0 when no app is listening or the dump finished, -ETIMEDOUT if the
+ * app did not finish in time, or a negative errno if the wait was interrupted
+ * by a signal.
+ */
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
 #if defined(CONFIG_DEBUG_FS)
+	unsigned long timeout = 600 * HZ; /* 10 minutes, per commit message */
+	long ret;
+
+	if (!adev->autodump.app_listening)
+		return 0;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	if (ret == 0) {
+		pr_err("autodump: timeout, move on to gpu recovery\n");
+		return -ETIMEDOUT;
+	} else if (ret < 0) {
+		/*
+		 * Interrupted by a signal: don't report success as if the
+		 * dump had finished; propagate the error instead.
+		 */
+		pr_err("autodump: wait interrupted, move on to gpu recovery\n");
+		return ret;
+	}
 #endif
+	return 0;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+/*
+ * Open handler for the amdgpu_autodump debugfs node.  Only one usermode app
+ * may listen at a time; a second open() returns -EBUSY until release().
+ */
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = inode->i_private;
+	int ret;
+
+	file->private_data = adev;
+
+	/*
+	 * Take the reset lock so the listening state cannot change while a
+	 * GPU reset (which reads app_listening in wait_dump) is in flight.
+	 */
+	mutex_lock(&adev->lock_reset);
+	if (!adev->autodump.app_listening) {
+		adev->autodump.app_listening = true;
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+	return ret;
+}
+
+/*
+ * Release handler: the usermode app closes the node when its dump is done,
+ * which unblocks amdgpu_debugfs_wait_dump() and lets the reset proceed.
+ */
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	/* Signal amdgpu_debugfs_wait_dump() that the dump has finished. */
+	complete(&adev->autodump.dumping);
+	/*
+	 * NOTE(review): app_listening is cleared without holding lock_reset,
+	 * while open() updates it under that mutex.  Taking lock_reset here
+	 * would deadlock against wait_dump(), which runs with the reset lock
+	 * held while waiting for this completion — presumably why it is
+	 * skipped, but the unlocked store should be confirmed/documented.
+	 */
+	adev->autodump.app_listening = false;
+	return 0;
+}
+
+/*
+ * Poll handler: the usermode app poll()s on the node; it becomes readable
+ * and writable once a GPU reset is pending (adev->in_gpu_reset), after
+ * wait_dump() wakes the gpu_hang queue.
+ */
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+	struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+	if (adev->in_gpu_reset)
+		return POLLIN | POLLRDNORM | POLLWRNORM;
+
+	return 0;
+}
+
+/*
+ * No read/write callbacks on purpose: the app gets its info from dmesg and
+ * umr, so only open/poll/release are needed (see commit message).
+ */
+static const struct file_operations autodump_debug_fops = {
+	.owner = THIS_MODULE,
+	.open = amdgpu_debugfs_autodump_open,
+	.poll = amdgpu_debugfs_autodump_poll,
+	.release = amdgpu_debugfs_autodump_release,
+};
+
+/*
+ * Create the amdgpu_autodump debugfs node (root-only, 0600) and initialize
+ * the completion/wait-queue state it shares with the GPU reset path.
+ */
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+	adev->autodump.app_listening = false;
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+		adev->ddev->primary->debugfs_root,
+		adev, &autodump_debug_fops);
+}
 
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
@@ -1434,6 +1512,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_debugfs_create_all(adev);
 
+	amdgpu_debugfs_autodump_init(adev);
+
 	return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
 					ARRAY_SIZE(amdgpu_debugfs_list));
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index de12d1101526..72f6fe5c7596 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,12 @@ struct amdgpu_debugfs {
 	unsigned		num_files;
 };
 
+/* State shared between the autodump debugfs node and the GPU reset path. */
+struct amdgpu_autodump {
+	/* true while a usermode app holds the node open (set in open()) */
+	bool			app_listening;
+	/* completed by release() to tell wait_dump() the dump is done */
+	struct completion       dumping;
+	/* woken by wait_dump() so poll() reports the node readable */
+	struct wait_queue_head  gpu_hang;
+};
+
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +46,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
 int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e6978a2c26b7..8109946075b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3912,6 +3912,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
-- 
2.20.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2020-05-15  6:51 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-14  9:18 [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4 jianzh
2020-05-14 10:25 ` Christian König
  -- strict thread matches above, loose matches on Subject: below --
2020-05-14  5:29 jianzh
2020-05-14  7:01 ` Christian König
2020-05-14  7:14   ` Zhao, Jiange
2020-05-14  8:28     ` Christian König
2020-05-14 15:15       ` Li, Dennis
2020-05-15  2:40         ` Zhao, Jiange
2020-05-15  6:51           ` Christian König
2020-05-09  9:45 jianzh
2020-05-13  8:16 ` Zhao, Jiange
2020-05-13  8:20   ` Christian König
2020-05-13  8:19 ` Christian König
2020-05-13  9:37   ` Zhao, Jiange
2020-05-13 10:36     ` Christian König
2020-04-29  5:06 jianzh
2020-04-29  5:08 ` Zhao, Jiange
2020-04-29 14:04   ` Pierre-Eric Pelloux-Prayer
2020-04-29 14:08     ` Christian König
2020-05-06  3:45       ` Zhao, Jiange
2020-05-06  8:05         ` Christian König
2020-04-26 10:09 jianzh
2020-04-26 10:40 ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.