All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: Provide SMI events watch
@ 2020-05-13 19:41 Amber Lin
  2020-05-14  0:52 ` Felix Kuehling
  0 siblings, 1 reply; 11+ messages in thread
From: Amber Lin @ 2020-05-13 19:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: Amber Lin

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register devices and subscribe events they are interested. After
registered, the user can use annoymous file descriptor's poll function
with wait-time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
    - correct kfifo usage
    - move event message API to kfd_ioctl.h
v3: send the event msg in text than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile              |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 214 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  29 +++
 include/uapi/linux/kfd_ioctl.h                   |  16 +-
 9 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_dbgdev.o \
 		$(AMDKFD_PATH)/kfd_dbgmgr.o \
+		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 		struct kfd_vm_fault_info info;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 
 		memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cf0017f..e9b96ad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 	return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_smi_events_args *args = data;
+	struct kfd_dev *dev;
+
+	dev = kfd_device_by_id(args->gpuid);
+	if (!dev)
+		return -EINVAL;
+
+	return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
 			kfd_ioctl_alloc_queue_gws, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+			kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
 	return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+	INIT_LIST_HEAD(&dev->smi_clients);
+	spin_lock_init(&dev->smi_lock);
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			 struct drm_device *ddev,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
@@ -700,6 +705,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_topology_add_device_error;
 	}
 
+	kfd_smi_init(kfd);
+
 	kfd->init_complete = true;
 	dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor,
 		 kfd->pdev->device);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e05d75e..151e83e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_smi_events.h"
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index cde5e4c..f70f789 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -309,6 +309,10 @@ struct kfd_dev {
 
 	/* Global GWS resource shared b/t processes*/
 	void *gws;
+
+	/* Clients watching SMI events */
+	struct list_head smi_clients;
+	spinlock_t smi_lock;
 };
 
 enum kfd_mempool {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
new file mode 100644
index 0000000..f5fd18e
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_vm.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
+
+struct kfd_smi_client {
+	struct list_head list;
+	struct kfifo fifo;
+	wait_queue_head_t wait_queue;
+	/* events enabled */
+	uint64_t events;
+	struct kfd_dev *dev;
+	spinlock_t lock;
+};
+
+#define MAX_KFIFO_SIZE	1024
+
+static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
+static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
+static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t,
+				loff_t *);
+static int kfd_smi_ev_release(struct inode *, struct file *);
+
+static const char kfd_smi_name[] = "kfd_smi_ev";
+
+static const struct file_operations kfd_smi_ev_fops = {
+	.owner = THIS_MODULE,
+	.poll = kfd_smi_ev_poll,
+	.read = kfd_smi_ev_read,
+	.write = kfd_smi_ev_write,
+	.release = kfd_smi_ev_release
+};
+
+static __poll_t kfd_smi_ev_poll(struct file *filep,
+				struct poll_table_struct *wait)
+{
+	__poll_t mask;
+	struct kfd_smi_client *client = filep->private_data;
+
+	poll_wait(filep, &client->wait_queue, wait);
+
+	spin_lock(&client->lock);
+	mask = kfifo_is_empty(&client->fifo) ? 0 : POLLIN | POLLRDNORM;
+	spin_unlock(&client->lock);
+
+	return mask;
+}
+
+static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
+			       size_t size, loff_t *offset)
+{
+	int ret;
+	size_t to_copy;
+	struct kfd_smi_client *client = filep->private_data;
+	unsigned char buf[MAX_KFIFO_SIZE];
+
+	BUILD_BUG_ON(MAX_KFIFO_SIZE > 1024);
+
+	/* kfifo_to_user can sleep so we can't use spinlock protection around
+	 * it. Instead, we kfifo out as spinlocked then copy them to the user.
+	 */
+	spin_lock(&client->lock);
+	to_copy = kfifo_len(&client->fifo);
+	if (!to_copy) {
+		spin_unlock(&client->lock);
+		return -EAGAIN;
+	}
+	to_copy = min3(size, sizeof(buf), to_copy);
+	ret = kfifo_out(&client->fifo, buf, to_copy);
+	spin_unlock(&client->lock);
+	if (ret <= 0)
+		return -EAGAIN;
+
+	ret = copy_to_user(user, buf, to_copy);
+	if (ret)
+		return -EFAULT;
+
+	return to_copy;
+}
+
+static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user,
+				size_t size, loff_t *offset)
+{
+	struct kfd_smi_client *client = filep->private_data;
+	uint64_t events;
+
+	if (!access_ok(user, size) || size < sizeof(events))
+		return -EFAULT;
+	if (copy_from_user(&events, user, sizeof(events)))
+		return -EFAULT;
+
+	WRITE_ONCE(client->events, events);
+
+	return sizeof(events);
+}
+
+static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
+{
+	struct kfd_smi_client *client = filep->private_data;
+	struct kfd_dev *dev = client->dev;
+
+	spin_lock(&dev->smi_lock);
+	list_del_rcu(&client->list);
+	spin_unlock(&dev->smi_lock);
+
+	synchronize_rcu();
+	kfifo_free(&client->fifo);
+	kfree(client);
+
+	return 0;
+}
+
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+	struct amdgpu_task_info task_info;
+	/* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */
+	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
+	 */
+	char fifo_in[43];
+	struct kfd_smi_client *client;
+	int len;
+
+	if (list_empty(&dev->smi_clients))
+		return;
+
+	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+	amdgpu_vm_get_task_info(adev, pasid, &task_info);
+	/* Report VM faults from user applications, not retry from kernel */
+	if (!task_info.pid)
+		return;
+
+	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
+		task_info.pid, task_info.task_name);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
+			continue;
+		spin_lock(&client->lock);
+		if (kfifo_avail(&client->fifo) >= len) {
+			kfifo_in(&client->fifo, fifo_in, len);
+			wake_up_all(&client->wait_queue);
+		}
+		else
+			pr_debug("smi_event(vmfault): no space left\n");
+		spin_unlock(&client->lock);
+	}
+
+	rcu_read_unlock();
+}
+
+int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
+{
+	struct kfd_smi_client *client;
+	int ret;
+
+	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&client->list);
+
+	ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
+	if (ret) {
+		kfree(client);
+		return ret;
+	}
+
+	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
+			       O_RDWR);
+	if (ret < 0) {
+		kfifo_free(&client->fifo);
+		kfree(client);
+		return ret;
+	}
+	*fd = ret;
+
+	init_waitqueue_head(&client->wait_queue);
+	spin_lock_init(&client->lock);
+	client->events = 0;
+	client->dev = dev;
+
+	spin_lock(&dev->smi_lock);
+	list_add_rcu(&client->list, &dev->smi_clients);
+	spin_unlock(&dev->smi_lock);
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
new file mode 100644
index 0000000..a9cb218
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_SMI_EVENTS_H_INCLUDED
+#define KFD_SMI_EVENTS_H_INCLUDED
+
+int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+
+#endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b6be623..733c183 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -442,6 +442,17 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/*
+ * KFD SMI(System Management Interface) events
+ */
+/* Event type (defined by bitmask) */
+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
+
+struct kfd_ioctl_smi_events_args {
+	__u32 gpuid;	/* to KFD */
+	__u32 anon_fd;	/* from KFD */
+};
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -546,7 +557,10 @@ enum kfd_mmio_remap {
 #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
 		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
 
+#define AMDKFD_IOC_SMI_EVENTS			\
+		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1F
+#define AMDKFD_COMMAND_END		0x20
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-05-13 19:41 [PATCH] drm/amdkfd: Provide SMI events watch Amber Lin
@ 2020-05-14  0:52 ` Felix Kuehling
  0 siblings, 0 replies; 11+ messages in thread
From: Felix Kuehling @ 2020-05-14  0:52 UTC (permalink / raw)
  To: Amber Lin, amd-gfx

Am 2020-05-13 um 3:41 p.m. schrieb Amber Lin:
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnostic what
> went wrong. This patch provides an event watch interface for the user
> space to register devices and subscribe events they are interested. After
> registered, the user can use annoymous file descriptor's poll function
> with wait-time specified and wait for events to happen. Once an event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>     - correct kfifo usage
>     - move event message API to kfd_ioctl.h
> v3: send the event msg in text than in binary
> v4: support multiple clients
> v5: move events enablement from ioctl to fd write
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile              |   1 +
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  18 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   7 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   4 +
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 214 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  29 +++
>  include/uapi/linux/kfd_ioctl.h                   |  16 +-
>  9 files changed, 292 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..e1e4115 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>  		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>  		$(AMDKFD_PATH)/kfd_dbgdev.o \
>  		$(AMDKFD_PATH)/kfd_dbgmgr.o \
> +		$(AMDKFD_PATH)/kfd_smi_events.o \
>  		$(AMDKFD_PATH)/kfd_crat.o
>  
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>  
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>  					const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>  		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>  		struct kfd_vm_fault_info info;
>  
> +		kfd_smi_event_update_vmfault(dev, pasid);
>  		kfd_process_vm_fault(dev->dqm, pasid);
>  
>  		memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index cf0017f..e9b96ad 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>  
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
>  	return r;
>  }
>  
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> +				struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_smi_events_args *args = data;
> +	struct kfd_dev *dev;
> +
> +	dev = kfd_device_by_id(args->gpuid);
> +	if (!dev)
> +		return -EINVAL;
> +
> +	return kfd_smi_event_open(dev, &args->anon_fd);
> +}
> +
>  #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>  	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>  			    .cmd_drv = 0, .name = #ioctl}
> @@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>  
>  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>  			kfd_ioctl_alloc_queue_gws, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> +			kfd_ioctl_smi_events, 0),
>  };
>  
>  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0491ab2..2c030c2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
>  	return ret;
>  }
>  
> +static void kfd_smi_init(struct kfd_dev *dev) {
> +	INIT_LIST_HEAD(&dev->smi_clients);
> +	spin_lock_init(&dev->smi_lock);
> +}
> +
>  bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  			 struct drm_device *ddev,
>  			 const struct kgd2kfd_shared_resources *gpu_resources)
> @@ -700,6 +705,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  		goto kfd_topology_add_device_error;
>  	}
>  
> +	kfd_smi_init(kfd);
> +
>  	kfd->init_complete = true;
>  	dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor,
>  		 kfd->pdev->device);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..151e83e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
> +#include "kfd_smi_events.h"
>  
>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>  					const uint32_t *ih_ring_entry,
> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>  		info.prot_read  = ring_id & 0x10;
>  		info.prot_write = ring_id & 0x20;
>  
> +		kfd_smi_event_update_vmfault(dev, pasid);
>  		kfd_process_vm_fault(dev->dqm, pasid);
>  		kfd_signal_vm_fault_event(dev, pasid, &info);
>  	}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index cde5e4c..f70f789 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -309,6 +309,10 @@ struct kfd_dev {
>  
>  	/* Global GWS resource shared b/t processes*/
>  	void *gws;
> +
> +	/* Clients watching SMI events */
> +	struct list_head smi_clients;
> +	spinlock_t smi_lock;
>  };
>  
>  enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> new file mode 100644
> index 0000000..f5fd18e
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -0,0 +1,214 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/poll.h>
> +#include <linux/wait.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_vm.h"
> +#include "kfd_priv.h"
> +#include "kfd_smi_events.h"
> +
> +struct kfd_smi_client {
> +	struct list_head list;
> +	struct kfifo fifo;
> +	wait_queue_head_t wait_queue;
> +	/* events enabled */
> +	uint64_t events;
> +	struct kfd_dev *dev;
> +	spinlock_t lock;
> +};
> +
> +#define MAX_KFIFO_SIZE	1024
> +
> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
> +static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t,
> +				loff_t *);
> +static int kfd_smi_ev_release(struct inode *, struct file *);
> +
> +static const char kfd_smi_name[] = "kfd_smi_ev";
> +
> +static const struct file_operations kfd_smi_ev_fops = {
> +	.owner = THIS_MODULE,
> +	.poll = kfd_smi_ev_poll,
> +	.read = kfd_smi_ev_read,
> +	.write = kfd_smi_ev_write,
> +	.release = kfd_smi_ev_release
> +};
> +
> +static __poll_t kfd_smi_ev_poll(struct file *filep,
> +				struct poll_table_struct *wait)
> +{
> +	__poll_t mask;
> +	struct kfd_smi_client *client = filep->private_data;
> +
> +	poll_wait(filep, &client->wait_queue, wait);
> +
> +	spin_lock(&client->lock);
> +	mask = kfifo_is_empty(&client->fifo) ? 0 : POLLIN | POLLRDNORM;
> +	spin_unlock(&client->lock);
> +
> +	return mask;
> +}
> +
> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
> +			       size_t size, loff_t *offset)
> +{
> +	int ret;
> +	size_t to_copy;
> +	struct kfd_smi_client *client = filep->private_data;
> +	unsigned char buf[MAX_KFIFO_SIZE];
> +
> +	BUILD_BUG_ON(MAX_KFIFO_SIZE > 1024);
> +
> +	/* kfifo_to_user can sleep so we can't use spinlock protection around
> +	 * it. Instead, we kfifo out as spinlocked then copy them to the user.
> +	 */
> +	spin_lock(&client->lock);
> +	to_copy = kfifo_len(&client->fifo);
> +	if (!to_copy) {
> +		spin_unlock(&client->lock);
> +		return -EAGAIN;
> +	}
> +	to_copy = min3(size, sizeof(buf), to_copy);
> +	ret = kfifo_out(&client->fifo, buf, to_copy);
> +	spin_unlock(&client->lock);
> +	if (ret <= 0)
> +		return -EAGAIN;
> +
> +	ret = copy_to_user(user, buf, to_copy);
> +	if (ret)
> +		return -EFAULT;
> +
> +	return to_copy;
> +}
> +
> +static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user,
> +				size_t size, loff_t *offset)
> +{
> +	struct kfd_smi_client *client = filep->private_data;
> +	uint64_t events;
> +
> +	if (!access_ok(user, size) || size < sizeof(events))
> +		return -EFAULT;
> +	if (copy_from_user(&events, user, sizeof(events)))
> +		return -EFAULT;
> +
> +	WRITE_ONCE(client->events, events);
> +
> +	return sizeof(events);
> +}
> +
> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> +{
> +	struct kfd_smi_client *client = filep->private_data;
> +	struct kfd_dev *dev = client->dev;
> +
> +	spin_lock(&dev->smi_lock);
> +	list_del_rcu(&client->list);
> +	spin_unlock(&dev->smi_lock);
> +
> +	synchronize_rcu();
> +	kfifo_free(&client->fifo);
> +	kfree(client);
> +
> +	return 0;
> +}
> +
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> +	struct amdgpu_task_info task_info;
> +	/* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */
> +	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
> +	 */
> +	char fifo_in[43];
> +	struct kfd_smi_client *client;
> +	int len;
> +
> +	if (list_empty(&dev->smi_clients))
> +		return;
> +
> +	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +	amdgpu_vm_get_task_info(adev, pasid, &task_info);
> +	/* Report VM faults from user applications, not retry from kernel */
> +	if (!task_info.pid)
> +		return;
> +
> +	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
> +		task_info.pid, task_info.task_name);
> +
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> +		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> +			continue;
> +		spin_lock(&client->lock);
> +		if (kfifo_avail(&client->fifo) >= len) {
> +			kfifo_in(&client->fifo, fifo_in, len);
> +			wake_up_all(&client->wait_queue);
> +		}
> +		else
> +			pr_debug("smi_event(vmfault): no space left\n");
> +		spin_unlock(&client->lock);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> +{
> +	struct kfd_smi_client *client;
> +	int ret;
> +
> +	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
> +	if (!client)
> +		return -ENOMEM;
> +	INIT_LIST_HEAD(&client->list);
> +
> +	ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL);
> +	if (ret) {
> +		kfree(client);
> +		return ret;
> +	}
> +
> +	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
> +			       O_RDWR);
> +	if (ret < 0) {
> +		kfifo_free(&client->fifo);
> +		kfree(client);
> +		return ret;
> +	}
> +	*fd = ret;
> +
> +	init_waitqueue_head(&client->wait_queue);
> +	spin_lock_init(&client->lock);
> +	client->events = 0;
> +	client->dev = dev;
> +
> +	spin_lock(&dev->smi_lock);
> +	list_add_rcu(&client->list, &dev->smi_clients);
> +	spin_unlock(&dev->smi_lock);
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> new file mode 100644
> index 0000000..a9cb218
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
> +#define KFD_SMI_EVENTS_H_INCLUDED
> +
> +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +
> +#endif
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index b6be623..733c183 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,17 @@ struct kfd_ioctl_import_dmabuf_args {
>  	__u32 dmabuf_fd;	/* to KFD */
>  };
>  
> +/*
> + * KFD SMI(System Management Interface) events
> + */
> +/* Event type (defined by bitmask) */
> +#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +
> +struct kfd_ioctl_smi_events_args {
> +	__u32 gpuid;	/* to KFD */
> +	__u32 anon_fd;	/* from KFD */
> +};
> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +557,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>  		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>  
> +#define AMDKFD_IOC_SMI_EVENTS			\
> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>  #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x1F
> +#define AMDKFD_COMMAND_END		0x20
>  
>  #endif
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-04-01 13:10   ` Amber Lin
@ 2020-04-01 20:54     ` Felix Kuehling
  0 siblings, 0 replies; 11+ messages in thread
From: Felix Kuehling @ 2020-04-01 20:54 UTC (permalink / raw)
  To: Amber Lin, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 21155 bytes --]

Am 2020-04-01 um 9:10 a.m. schrieb Amber Lin:
>
> Thanks Felix for the review. I have a better understanding of how
> kfifo works now and have changed my code quite a bit. Couple of
> questions below inline regarding the gpu_id and data arguments.
>
Replies inline ...


> Thanks.
>
> Amber
>
> On 2020-03-26 4:53 p.m., Felix Kuehling wrote:
>>
>> Hi Amber,
>>
>> I see that this is based on the debugger event code. Jon and I are
>> just working through some issues with that code. The lessons from
>> that will need to be applied to this as well. But I think we can
>> define your API to simplify this a bit.
>>
>> The basic problem is, that we have one Fifo in the kfd_device, but
>> potentially multiple file descriptors referring to it. For the event
>> interface I think we can enforce only a single file descriptor per
>> device. If there is already one, your register call can fail. See
>> more comments inline.
>>
>> On 2020-03-17 13:57, Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system admin
>>> will use SMI (System Management Interface) tool to monitor/diagnostic what
>>> went wrong. This patch provides an event watch interface for the user
>>> space to register events they are interested. After the event is
>>> registered, the user can use annoymous file descriptor's pull function
>>
>> pull -> poll
>>
> Thank you for spotting the typo. I’ll change that.
>
>>> with wait-time specified to wait for the event to happen. Once the event
>>> happens, the user can use read() to retrieve information related to the
>>> event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>>>  include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>>>  9 files changed, 265 insertions(+), 2 deletions(-)
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>>>  		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>  		$(AMDKFD_PATH)/kfd_dbgdev.o \
>>>  		$(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -		$(AMDKFD_PATH)/kfd_crat.o
>>> +		$(AMDKFD_PATH)/kfd_crat.o \
>>> +		$(AMDKFD_PATH)/kfd_smi_events.o
>>>  
>>>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>  #include "kfd_events.h"
>>>  #include "cik_int.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>  					const uint32_t *ih_ring_entry,
>>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>>  		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>  		struct kfd_vm_fault_info info;
>>>  
>>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>>  		kfd_process_vm_fault(dev->dqm, pasid);
>>>  
>>>  		memset(&info, 0, sizeof(info));
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..8e92956 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>  #include "kfd_device_queue_manager.h"
>>>  #include "kfd_dbgmgr.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>>  static int kfd_open(struct inode *, struct file *);
>>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>>>  	return ret;
>>>  }
>>>  
>>> +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +				struct kfd_process *p, void *data)
>>> +{
>>> +	struct kfd_ioctl_smi_events_args *args = data;
>>> +	struct kfd_dev *dev;
>>> +	int ret = 0;
>>> +
>>> +	dev = kfd_device_by_id(args->gpu_id);
>>> +	if (!dev)
>>> +		return -EINVAL;
>>> +
>>> +	switch (args->op) {
>>> +	case KFD_SMI_EVENTS_REGISTER:
>>> +		ret = kfd_smi_event_register(dev, args->events);
>>> +		if (ret >= 0) {
>>> +			/* When the registration is successful, it returns the
>>> +			 * annoymous inode. Pass it to the user in data1
>>> +			 */
>>> +			args->data1 = ret;
>>> +			ret = 0;
>>
>> You could return the file descriptor as the return value. On success
>> it returns a positive fd. On failure it returns a negative error code.
>>
> I'm thinking to have a consistent return value regardless of the
> argument content. When args->op is not REGISTER, the return value as 0
> is success.

OK.


>>> +		}
>>> +		break;
>>> +	case KFD_SMI_EVENTS_UNREGISTER:
>>> +		kfd_smi_event_unregister(dev, args->events);
>>
>> Register seems to do two things: create a file descriptor, and
>> subscribe to specific events. This unregister function only affects
>> the subscription but not the file descriptor. I'd suggest a cleaner API:
>>
>>   * Register: creates the file descriptor
>>   * Subscribe: enables/disables subscription to specific events
>>
>> The unregistration is done implicitly by closing the file descriptor,
>> so there is no explicit ioctl API call for this.
>>
> Make sense. I've changed this.
>>
>>> +		break;
>>> +	default:
>>> +		ret = -EINVAL;
>>> +		break;
>>> +	}
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>>  {
>>>  	struct kfd_local_mem_info mem_info;
>>> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>>  
>>>  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>>  			kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> +			kfd_ioctl_smi_events, 0),
>>>  };
>>>  
>>>  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 7866cd06..450368c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>>  	kfd->device_info = device_info;
>>>  	kfd->pdev = pdev;
>>>  	kfd->init_complete = false;
>>> +	kfd->smi_events.events = 0;
>>>  	kfd->kfd2kgd = f2g;
>>>  	atomic_set(&kfd->compute_profile, 0);
>>>  
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>>  #include "kfd_events.h"
>>>  #include "soc15_int.h"
>>>  #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>>  					const uint32_t *ih_ring_entry,
>>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>>  		info.prot_read  = ring_id & 0x10;
>>>  		info.prot_write = ring_id & 0x20;
>>>  
>>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>>  		kfd_process_vm_fault(dev->dqm, pasid);
>>>  		kfd_signal_vm_fault_event(dev, pasid, &info);
>>>  	}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..fdb51de 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>>>  	uint32_t vmid_num_kfd;
>>>  };
>>>  
>>> +struct kfd_smi_events {
>>> +	uint64_t events;
>>> +	struct kfifo fifo;
>>> +	wait_queue_head_t wait_queue;
>>> +	uint32_t max_events;
>>> +};
>>> +
>>>  struct kfd_dev {
>>>  	struct kgd_dev *kgd;
>>>  
>>> @@ -309,6 +316,9 @@ struct kfd_dev {
>>>  
>>>  	/* Global GWS resource shared b/t processes*/
>>>  	void *gws;
>>> +
>>> +	/* if this device is in SMI events watch */
>>> +	struct kfd_smi_events smi_events;
>>>  };
>>>  
>>>  enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..ba9d036
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,143 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +static DEFINE_MUTEX(kfd_smi_mutex);
>>> +
>>> +struct mutex *kfd_get_smi_mutex(void)
>>> +{
>>> +	return &kfd_smi_mutex;
>>> +}
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> +	.owner = THIS_MODULE,
>>> +	.poll = kfd_smi_ev_poll,
>>> +	.read = kfd_smi_ev_read,
>>> +	.release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> +				struct poll_table_struct *wait)
>>> +{
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +	struct kfd_smi_events *ev = &dev->smi_events;
>>> +
>>> +	__poll_t mask = 0;
>>> +
>>> +	poll_wait(filep, &ev->wait_queue, wait);
>>> +	mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
>>> +
>>> +	return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>>> +			       size_t size, loff_t *offset)
>>> +{
>>> +	int ret, copied = 0;
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +
>>> +	ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
>>> +	if (ret || !copied) {
>>> +		pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
>>> +				ret, copied);
>>> +		return ret ? ret : -EAGAIN;
>>> +	}
>>> +
>>> +	return copied;
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>> +{
>>> +	struct kfd_dev *dev = filep->private_data;
>>> +
>>> +	kfifo_free(&dev->smi_events.fifo);
>>> +	return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
>>> +{
>>> +	struct kfd_smi_vmfault_fifo fifo_out;
>>> +	struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
>>> +	struct amdgpu_task_info task_info;
>>> +
>>> +	if (!kdev->smi_events.events)
>>> +		return;
>>
>> This condition is redundant.
>>
> Removed it. It was from my original plan to handle multiple events in
> one function but after realizing not do-able and changing the function
> name, I forgot to remove this check.
>>
>>> +
>>> +	if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
>>> +		return;
>>> +
>>> +	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>> +	amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> +
>>> +	fifo_out.group = 0;
>>> +	fifo_out.event = KFD_SMI_EV_VMFAULT;
>>> +	fifo_out.gpu_id = kdev->id;
>>> +	fifo_out.pid = task_info.pid;
>>> +	strcpy(fifo_out.task_name, task_info.task_name);
>>> +	kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
>>> +	wake_up_all(&kdev->smi_events.wait_queue);
>>> +}
>>> +
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +	mutex_lock(kfd_get_smi_mutex());
>>> +	dev->smi_events.events &= ~events;
>>> +	mutex_unlock(kfd_get_smi_mutex());
>>> +}
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +	int ret;
>>> +
>>> +	mutex_lock(kfd_get_smi_mutex());
>>> +	dev->smi_events.events |= events;
>>> +	mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +	/* We use the lower 32 bits for now. Each bit represents one event. If
>>> +	 * featured events are increased to more than 32, we'll use the upper
>>> +	 * bits as groups so the total number of events can be up to 32*32.
>>> +	 */
>>> +	dev->smi_events.max_events = 32;
>>
>> I don't understand the explanation above. It seems to refer to the
>> event subscription mechanism. But you use this as the size of the
>> fifo. That's the size in bytes. Your struct kfd_smi_vmfault_fifo is
>> bigger than that, so even a single entry will overflow your FIFO queue.
>>
> I’m afraid we’ll eventually need to support more than 64 events, so I
> want to reserve the upper 32 bits to serve as “group”. Once we reach
> 32 events, 0x00000000_80000000 will be the last event in group 0, and
> 0x00000001_00000001 is the first event in group 1. This way we can
> expand the 64 bits to support more than 64 events.
>
It seems you can only use one group at a time. I don't see how you can
enable events in different groups at the same time using this method. At
least internally you'll need a bitmask that's wide enough to represent
all event types. In the subscribe/unsubscribe API you can use different
ways to encode the events in a more compact way because you can make
multiple calls to enable/disable events.


> This seems too complicated to explain in code. I'll just use the
> events parameter as event IDs. Once the featured events grows to more
> than 64, we can use "data" in the argument to express group when the
> op is ENABLE/DISABLE events.
>
>>> +	ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
>>> +			 GFP_KERNEL);
>>> +	if (ret) {
>>> +		pr_err("fail to allocate kfifo\n");
>>> +		return ret;
>>> +	}
>>> +	init_waitqueue_head(&dev->smi_events.wait_queue);
>>> +
>>> +	return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>>> +				(void *)dev, 0);
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..2e320d3
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,41 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>>> +
>>> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
>>> + * can read the first 8 bytes to determine the next read length.
>>
>> Note about terminology: FIFO refers to a fifo queue. I think you're
>> using it to describe an entry or record in the FIFO.
>>
>> I don't know why you need to split group and event? The event mask is
>> only 64-bit, so there can be no more than 64 events.
>>
> I've removed group
>>
>>> + */
>>> +struct kfd_smi_vmfault_fifo {
>>> +	uint32_t group;
>>> +	uint32_t event;
>>> +	unsigned int gpu_id;
>>
>> The gpu_id is redundant because the file descriptor used to read the
>> events is associated with a kfd_dev (GPU). If you want to have a
>> single fifo, you should change the register API to not require a
>> gpu_id and the fifo should be global, not a member of the kfd_dev.
>>
> True
>>
>>> +	pid_t pid;
>>> +	char task_name[TASK_COMM_LEN];
>>> +};
>>
>> This needs to be part of the user API because usermode needs to parse
>> this structure. So it should be defined in kfd_ioctl.h. In your
>> proposed API I think the size of the FIFO entry is implied by the
>> event type. It would be cleaner to separate the FIFO entries into a
>> header and payload. The header would be the same for all events and
>> doesn't need to be duplicated in each event structure. It would
>> contain the event type and size (to allow variable size events). The
>> payload would depend on the event type.
>>
> I'll change this
>>
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>>> index 4f66764..6ce7c69 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>>>  	__u32 dmabuf_fd;	/* to KFD */
>>>  };
>>>  
>>> +/*
>>> + * KFD SMI(System Management Interface) events
>>> + */
>>> +enum kfd_smi_events_op {
>>> +	KFD_SMI_EVENTS_REGISTER = 1,
>>> +	KFD_SMI_EVENTS_UNREGISTER
>>> +};
>>> +
>>> +/* Event ID (mask) */
>>> +#define KFD_SMI_EV_VMFAULT     0x00000001
>>> +
>>> +struct kfd_ioctl_smi_events_args {
>>> +	__u32 op;       /* to KFD */
>>> +	/* upper 32 bits: group. lower 32 bits: event IDs */
>>> +	__u64 events;   /* to KFD */
>>> +	__u32 gpu_id;   /* to KFD */
>>> +	pid_t pid;      /* to KFD */
>>> +	__u32 data1;    /* from KFD */
>>> +	__u32 data2;
>>> +	__u32 data3;
>>
>> This looks like you copied it from the debug API.  pid, data1-3 are
>> not used by your API. I think gpu_id should not be used either if you
>> want the event FIFO to be global.
>>
>> Regards,
>>   Felix
>>
> gpu_id is for the user space to tell KFD which device it wants to
> watch events. I'll use separate fd/fifo for each device. I want to
> reserve some extra parameters in case I need to feedback extra
> information for possible future events. Further thinking of it, that
> will be handled in the anonymous fd's output, not here. I'll remove
> data 2 and 3 and rename data1 as data. I want to use it for anon_fd in
> REGISTRATION and for group in DISABLE/ENABLE when we need to support
> more than 64 events.

OK.

Regards,
  Felix

>>> +};
>>> +
>>>  /* Register offset inside the remapped mmio page
>>>   */
>>>  enum kfd_mmio_remap {
>>> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>>>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>>>  		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>>  
>>> +#define AMDKFD_IOC_SMI_EVENTS			\
>>> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>> +
>>>  #define AMDKFD_COMMAND_START		0x01
>>> -#define AMDKFD_COMMAND_END		0x1F
>>> +#define AMDKFD_COMMAND_END		0x20
>>>  
>>>  #endif

[-- Attachment #1.2: Type: text/html, Size: 27286 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-26 20:53 ` Felix Kuehling
@ 2020-04-01 13:10   ` Amber Lin
  2020-04-01 20:54     ` Felix Kuehling
  0 siblings, 1 reply; 11+ messages in thread
From: Amber Lin @ 2020-04-01 13:10 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 20219 bytes --]

Thanks Felix for the review. I have a better understanding of how kfifo 
works now and have changed my code quite a bit. Couple of questions 
below inline regarding the gpu_id and data arguments.

Thanks.

Amber

On 2020-03-26 4:53 p.m., Felix Kuehling wrote:
>
> Hi Amber,
>
> I see that this is based on the debugger event code. Jon and I are 
> just working through some issues with that code. The lessons from that 
> will need to be applied to this as well. But I think we can define 
> your API to simplify this a bit.
>
> The basic problem is, that we have one Fifo in the kfd_device, but 
> potentially multiple file descriptors referring to it. For the event 
> interface I think we can enforce only a single file descriptor per 
> device. If there is already one, your register call can fail. See more 
> comments inline.
>
> On 2020-03-17 13:57, Amber Lin wrote:
>> When the compute is malfunctioning or performance drops, the system admin
>> will use SMI (System Management Interface) tool to monitor/diagnostic what
>> went wrong. This patch provides an event watch interface for the user
>> space to register events they are interested. After the event is
>> registered, the user can use annoymous file descriptor's pull function
>
> pull -> poll
>
Thank you for spotting the typo. I’ll change that.

>> with wait-time specified to wait for the event to happen. Once the event
>> happens, the user can use read() to retrieve information related to the
>> event.
>>
>> VM fault event is done in this patch.
>>
>> Signed-off-by: Amber Lin<Amber.Lin@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>>   include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>>   9 files changed, 265 insertions(+), 2 deletions(-)
>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
>> index 6147462..cc98b4a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>> @@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>>   		$(AMDKFD_PATH)/kfd_dbgdev.o \
>>   		$(AMDKFD_PATH)/kfd_dbgmgr.o \
>> -		$(AMDKFD_PATH)/kfd_crat.o
>> +		$(AMDKFD_PATH)/kfd_crat.o \
>> +		$(AMDKFD_PATH)/kfd_smi_events.o
>>   
>>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> index 9f59ba9..24b4717 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> @@ -24,6 +24,7 @@
>>   #include "kfd_events.h"
>>   #include "cik_int.h"
>>   #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>   
>>   static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>   					const uint32_t *ih_ring_entry,
>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>   		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>   		struct kfd_vm_fault_info info;
>>   
>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>   		kfd_process_vm_fault(dev->dqm, pasid);
>>   
>>   		memset(&info, 0, sizeof(info));
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index f8fa03a..8e92956 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -39,6 +39,7 @@
>>   #include "kfd_device_queue_manager.h"
>>   #include "kfd_dbgmgr.h"
>>   #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>   
>>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>   static int kfd_open(struct inode *, struct file *);
>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>>   	return ret;
>>   }
>>   
>> +/* Handle requests for watching SMI events */
>> +static int kfd_ioctl_smi_events(struct file *filep,
>> +				struct kfd_process *p, void *data)
>> +{
>> +	struct kfd_ioctl_smi_events_args *args = data;
>> +	struct kfd_dev *dev;
>> +	int ret = 0;
>> +
>> +	dev = kfd_device_by_id(args->gpu_id);
>> +	if (!dev)
>> +		return -EINVAL;
>> +
>> +	switch (args->op) {
>> +	case KFD_SMI_EVENTS_REGISTER:
>> +		ret = kfd_smi_event_register(dev, args->events);
>> +		if (ret >= 0) {
>> +			/* When the registration is successful, it returns the
>> +			 * annoymous inode. Pass it to the user in data1
>> +			 */
>> +			args->data1 = ret;
>> +			ret = 0;
>
> You could return the file descriptor as the return value. On success 
> it returns a positive fd. On failure it returns a negative error code.
>
I'm thinking to have a consistent return value regardless of the 
argument content. When args->op is not REGISTER, the return value as 0 
is success.
>
>> +		}
>> +		break;
>> +	case KFD_SMI_EVENTS_UNREGISTER:
>> +		kfd_smi_event_unregister(dev, args->events);
>
> Register seems to do two things: create a file descriptor, and 
> subscribe to specific events. This unregister function only affects 
> the subscription but not the file descriptor. I'd suggest a cleaner API:
>
>   * Register: creates the file descriptor
>   * Subscribe: enables/disables subscription to specific events
>
> The unregistration is done implicitly by closing the file descriptor, 
> so there is no explicit ioctl API call for this.
>
Make sense. I've changed this.
>
>> +		break;
>> +	default:
>> +		ret = -EINVAL;
>> +		break;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>>   bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>   {
>>   	struct kfd_local_mem_info mem_info;
>> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>   
>>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>   			kfd_ioctl_alloc_queue_gws, 0),
>> +
>> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>> +			kfd_ioctl_smi_events, 0),
>>   };
>>   
>>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 7866cd06..450368c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>   	kfd->device_info = device_info;
>>   	kfd->pdev = pdev;
>>   	kfd->init_complete = false;
>> +	kfd->smi_events.events = 0;
>>   	kfd->kfd2kgd = f2g;
>>   	atomic_set(&kfd->compute_profile, 0);
>>   
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> index e05d75e..151e83e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> @@ -24,6 +24,7 @@
>>   #include "kfd_events.h"
>>   #include "soc15_int.h"
>>   #include "kfd_device_queue_manager.h"
>> +#include "kfd_smi_events.h"
>>   
>>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>   					const uint32_t *ih_ring_entry,
>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>   		info.prot_read  = ring_id & 0x10;
>>   		info.prot_write = ring_id & 0x20;
>>   
>> +		kfd_smi_event_update_vmfault(dev, pasid);
>>   		kfd_process_vm_fault(dev->dqm, pasid);
>>   		kfd_signal_vm_fault_event(dev, pasid, &info);
>>   	}
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 43b888b..fdb51de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>>   	uint32_t vmid_num_kfd;
>>   };
>>   
>> +struct kfd_smi_events {
>> +	uint64_t events;
>> +	struct kfifo fifo;
>> +	wait_queue_head_t wait_queue;
>> +	uint32_t max_events;
>> +};
>> +
>>   struct kfd_dev {
>>   	struct kgd_dev *kgd;
>>   
>> @@ -309,6 +316,9 @@ struct kfd_dev {
>>   
>>   	/* Global GWS resource shared b/t processes*/
>>   	void *gws;
>> +
>> +	/* if this device is in SMI events watch */
>> +	struct kfd_smi_events smi_events;
>>   };
>>   
>>   enum kfd_mempool {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> new file mode 100644
>> index 0000000..ba9d036
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -0,0 +1,143 @@
>> +/*
>> + * Copyright 2020 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include <linux/poll.h>
>> +#include <linux/wait.h>
>> +#include <linux/anon_inodes.h>
>> +#include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu_vm.h"
>> +#include "kfd_priv.h"
>> +#include "kfd_smi_events.h"
>> +
>> +static DEFINE_MUTEX(kfd_smi_mutex);
>> +
>> +struct mutex *kfd_get_smi_mutex(void)
>> +{
>> +	return &kfd_smi_mutex;
>> +}
>> +
>> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>> +
>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>> +
>> +static const struct file_operations kfd_smi_ev_fops = {
>> +	.owner = THIS_MODULE,
>> +	.poll = kfd_smi_ev_poll,
>> +	.read = kfd_smi_ev_read,
>> +	.release = kfd_smi_ev_release
>> +};
>> +
>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>> +				struct poll_table_struct *wait)
>> +{
>> +	struct kfd_dev *dev = filep->private_data;
>> +	struct kfd_smi_events *ev = &dev->smi_events;
>> +
>> +	__poll_t mask = 0;
>> +
>> +	poll_wait(filep, &ev->wait_queue, wait);
>> +	mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
>> +
>> +	return mask;
>> +}
>> +
>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>> +			       size_t size, loff_t *offset)
>> +{
>> +	int ret, copied = 0;
>> +	struct kfd_dev *dev = filep->private_data;
>> +
>> +	ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
>> +	if (ret || !copied) {
>> +		pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
>> +				ret, copied);
>> +		return ret ? ret : -EAGAIN;
>> +	}
>> +
>> +	return copied;
>> +}
>> +
>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>> +{
>> +	struct kfd_dev *dev = filep->private_data;
>> +
>> +	kfifo_free(&dev->smi_events.fifo);
>> +	return 0;
>> +}
>> +
>> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
>> +{
>> +	struct kfd_smi_vmfault_fifo fifo_out;
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
>> +	struct amdgpu_task_info task_info;
>> +
>> +	if (!kdev->smi_events.events)
>> +		return;
>
> This condition is redundant.
>
Removed it. It was from my original plan to handle multiple events in 
one function but after realizing not do-able and changing the function 
name, I forgot to remove this check.
>
>> +
>> +	if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
>> +		return;
>> +
>> +	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> +	amdgpu_vm_get_task_info(adev, pasid, &task_info);
>> +
>> +	fifo_out.group = 0;
>> +	fifo_out.event = KFD_SMI_EV_VMFAULT;
>> +	fifo_out.gpu_id = kdev->id;
>> +	fifo_out.pid = task_info.pid;
>> +	strcpy(fifo_out.task_name, task_info.task_name);
>> +	kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
>> +	wake_up_all(&kdev->smi_events.wait_queue);
>> +}
>> +
>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
>> +{
>> +	mutex_lock(kfd_get_smi_mutex());
>> +	dev->smi_events.events &= ~events;
>> +	mutex_unlock(kfd_get_smi_mutex());
>> +}
>> +
>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
>> +{
>> +	int ret;
>> +
>> +	mutex_lock(kfd_get_smi_mutex());
>> +	dev->smi_events.events |= events;
>> +	mutex_unlock(kfd_get_smi_mutex());
>> +
>> +	/* We use the lower 32 bits for now. Each bit represents one event. If
>> +	 * featured events are increased to more than 32, we'll use the upper
>> +	 * bits as groups so the total number of events can be up to 32*32.
>> +	 */
>> +	dev->smi_events.max_events = 32;
>
> I don't understand the explanation above. It seems to refer to the 
> event subscription mechanism. But you use this as the size of the 
> fifo. That's the size in bytes. Your struct kfd_smi_vmfault_fifo is 
> bigger than that, so even a single entry will overflow your FIFO queue.
>
I’m afraid we’ll eventually need to support more than 64 events, so I 
want to reserve the upper 32 bits to serve as “group”. Once we reach 32 
events, 0x00000000_80000000 will be the last event in group 0, and 
0x00000001_00000001 is the first event in group 1. This way we can 
expand the 64 bits to support more than 64 events.

This seems too complicated to explain in code. I'll just use the events 
parameter as event IDs. Once the featured events grows to more than 64, 
we can use "data" in the argument to express group when the op is 
ENABLE/DISABLE events.

>> +	ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
>> +			 GFP_KERNEL);
>> +	if (ret) {
>> +		pr_err("fail to allocate kfifo\n");
>> +		return ret;
>> +	}
>> +	init_waitqueue_head(&dev->smi_events.wait_queue);
>> +
>> +	return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>> +				(void *)dev, 0);
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> new file mode 100644
>> index 0000000..2e320d3
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -0,0 +1,41 @@
>> +/*
>> + * Copyright 2020 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>> +#define KFD_SMI_EVENTS_H_INCLUDED
>> +
>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>> +
>> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
>> + * can read the first 8 bytes to determine the next read length.
>
> Note about terminology: FIFO refers to a fifo queue. I think you're 
> using it to describe an entry or record in the FIFO.
>
> I don't know why you need to split group and event? The event mask is 
> only 64-bit, so there can be no more than 64 events.
>
I've removed group
>
>> + */
>> +struct kfd_smi_vmfault_fifo {
>> +	uint32_t group;
>> +	uint32_t event;
>> +	unsigned int gpu_id;
>
> The gpu_id is redundant because the file descriptor used to read the 
> events is associated with a kfd_dev (GPU). If you want to have a 
> single fifo, you should change the register API to not require a 
> gpu_id and the fifo should be global, not a member of the kfd_dev.
>
True
>
>> +	pid_t pid;
>> +	char task_name[TASK_COMM_LEN];
>> +};
>
> This needs to be part of the user API because usermode needs to parse 
> this structure. So it should be defined in kfd_ioctl.h. In your 
> proposed API I think the size of the FIFO entry is implied by the 
> event type. It would be cleaner to separate the FIFO entries into a 
> header and payload. The header would be the same for all events and 
> doesn't need to be duplicated in each event structure. It would 
> contain the event type and size (to allow variable size events). The 
> payload would depend on the event type.
>
I'll change this
>
>> +
>> +#endif
>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>> index 4f66764..6ce7c69 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>>   	__u32 dmabuf_fd;	/* to KFD */
>>   };
>>   
>> +/*
>> + * KFD SMI(System Management Interface) events
>> + */
>> +enum kfd_smi_events_op {
>> +	KFD_SMI_EVENTS_REGISTER = 1,
>> +	KFD_SMI_EVENTS_UNREGISTER
>> +};
>> +
>> +/* Event ID (mask) */
>> +#define KFD_SMI_EV_VMFAULT     0x00000001
>> +
>> +struct kfd_ioctl_smi_events_args {
>> +	__u32 op;       /* to KFD */
>> +	/* upper 32 bits: group. lower 32 bits: event IDs */
>> +	__u64 events;   /* to KFD */
>> +	__u32 gpu_id;   /* to KFD */
>> +	pid_t pid;      /* to KFD */
>> +	__u32 data1;    /* from KFD */
>> +	__u32 data2;
>> +	__u32 data3;
>
> This looks like you copied it from the debug API.  pid, data1-3 are 
> not used by your API. I think gpu_id should not be used either if you 
> want the event FIFO to be global.
>
> Regards,
>   Felix
>
gpu_id is for the user space to tell KFD which device it wants to watch 
events. I'll use separate fd/fifo for each device. I want to reserve 
some extra parameters in case I need to feedback extra information for 
possible future events. Further thinking of it, that will be handled in 
the anonymous fd's output, not here. I'll remove data 2 and 3 and rename 
data1 as data. I want to use it for anon_fd in REGISTRATION and for 
group in DISABLE/ENABLE when we need to support more than 64 events.
>
>> +};
>> +
>>   /* Register offset inside the remapped mmio page
>>    */
>>   enum kfd_mmio_remap {
>> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>>   		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>   
>> +#define AMDKFD_IOC_SMI_EVENTS			\
>> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>> +
>>   #define AMDKFD_COMMAND_START		0x01
>> -#define AMDKFD_COMMAND_END		0x1F
>> +#define AMDKFD_COMMAND_END		0x20
>>   
>>   #endif

[-- Attachment #1.2: Type: text/html, Size: 25708 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-17 17:57 Amber Lin
  2020-03-17 19:03 ` Alex Deucher
@ 2020-03-26 20:53 ` Felix Kuehling
  2020-04-01 13:10   ` Amber Lin
  1 sibling, 1 reply; 11+ messages in thread
From: Felix Kuehling @ 2020-03-26 20:53 UTC (permalink / raw)
  To: Amber Lin, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 17877 bytes --]

Hi Amber,

I see that this is based on the debugger event code. Jon and I are just 
working through some issues with that code. The lessons from that will 
need to be applied to this as well. But I think we can define your API 
to simplify this a bit.

The basic problem is, that we have one Fifo in the kfd_device, but 
potentially multiple file descriptors referring to it. For the event 
interface I think we can enforce only a single file descriptor per 
device. If there is already one, your register call can fail. See more 
comments inline.

On 2020-03-17 13:57, Amber Lin wrote:
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnostic what
> went wrong. This patch provides an event watch interface for the user
> space to register events they are interested. After the event is
> registered, the user can use annoymous file descriptor's pull function

pull -> poll


> with wait-time specified to wait for the event to happen. Once the event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>   include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>   9 files changed, 265 insertions(+), 2 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>   		$(AMDKFD_PATH)/kfd_dbgdev.o \
>   		$(AMDKFD_PATH)/kfd_dbgmgr.o \
> -		$(AMDKFD_PATH)/kfd_crat.o
> +		$(AMDKFD_PATH)/kfd_crat.o \
> +		$(AMDKFD_PATH)/kfd_smi_events.o
>   
>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>   #include "kfd_events.h"
>   #include "cik_int.h"
>   #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>   
>   static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>   					const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>   		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>   		struct kfd_vm_fault_info info;
>   
> +		kfd_smi_event_update_vmfault(dev, pasid);
>   		kfd_process_vm_fault(dev->dqm, pasid);
>   
>   		memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>   #include "kfd_device_queue_manager.h"
>   #include "kfd_dbgmgr.h"
>   #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>   
>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>   static int kfd_open(struct inode *, struct file *);
> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>   	return ret;
>   }
>   
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> +				struct kfd_process *p, void *data)
> +{
> +	struct kfd_ioctl_smi_events_args *args = data;
> +	struct kfd_dev *dev;
> +	int ret = 0;
> +
> +	dev = kfd_device_by_id(args->gpu_id);
> +	if (!dev)
> +		return -EINVAL;
> +
> +	switch (args->op) {
> +	case KFD_SMI_EVENTS_REGISTER:
> +		ret = kfd_smi_event_register(dev, args->events);
> +		if (ret >= 0) {
> +			/* When the registration is successful, it returns the
> +			 * annoymous inode. Pass it to the user in data1
> +			 */
> +			args->data1 = ret;
> +			ret = 0;

You could return the file descriptor as the return value. On success it 
returns a positive fd. On failure it returns a negative error code.


> +		}
> +		break;
> +	case KFD_SMI_EVENTS_UNREGISTER:
> +		kfd_smi_event_unregister(dev, args->events);

Register seems to do two things: create a file descriptor, and subscribe 
to specific events. This unregister function only affects the 
subscription but not the file descriptor. I'd suggest a cleaner API:

  * Register: creates the file descriptor
  * Subscribe: enables/disables subscription to specific events

The unregistration is done implicitly by closing the file descriptor, so 
there is no explicit ioctl API call for this.


> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>   bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>   {
>   	struct kfd_local_mem_info mem_info;
> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>   
>   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>   			kfd_ioctl_alloc_queue_gws, 0),
> +
> +	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> +			kfd_ioctl_smi_events, 0),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 7866cd06..450368c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>   	kfd->device_info = device_info;
>   	kfd->pdev = pdev;
>   	kfd->init_complete = false;
> +	kfd->smi_events.events = 0;
>   	kfd->kfd2kgd = f2g;
>   	atomic_set(&kfd->compute_profile, 0);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..151e83e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>   #include "kfd_events.h"
>   #include "soc15_int.h"
>   #include "kfd_device_queue_manager.h"
> +#include "kfd_smi_events.h"
>   
>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>   					const uint32_t *ih_ring_entry,
> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>   		info.prot_read  = ring_id & 0x10;
>   		info.prot_write = ring_id & 0x20;
>   
> +		kfd_smi_event_update_vmfault(dev, pasid);
>   		kfd_process_vm_fault(dev->dqm, pasid);
>   		kfd_signal_vm_fault_event(dev, pasid, &info);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 43b888b..fdb51de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>   	uint32_t vmid_num_kfd;
>   };
>   
> +struct kfd_smi_events {
> +	uint64_t events;
> +	struct kfifo fifo;
> +	wait_queue_head_t wait_queue;
> +	uint32_t max_events;
> +};
> +
>   struct kfd_dev {
>   	struct kgd_dev *kgd;
>   
> @@ -309,6 +316,9 @@ struct kfd_dev {
>   
>   	/* Global GWS resource shared b/t processes*/
>   	void *gws;
> +
> +	/* if this device is in SMI events watch */
> +	struct kfd_smi_events smi_events;
>   };
>   
>   enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> new file mode 100644
> index 0000000..ba9d036
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -0,0 +1,143 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/poll.h>
> +#include <linux/wait.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_vm.h"
> +#include "kfd_priv.h"
> +#include "kfd_smi_events.h"
> +
> +static DEFINE_MUTEX(kfd_smi_mutex);
> +
> +struct mutex *kfd_get_smi_mutex(void)
> +{
> +	return &kfd_smi_mutex;
> +}
> +
> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
> +static int kfd_smi_ev_release(struct inode *, struct file *);
> +
> +static const char kfd_smi_name[] = "kfd_smi_ev";
> +
> +static const struct file_operations kfd_smi_ev_fops = {
> +	.owner = THIS_MODULE,
> +	.poll = kfd_smi_ev_poll,
> +	.read = kfd_smi_ev_read,
> +	.release = kfd_smi_ev_release
> +};
> +
> +static __poll_t kfd_smi_ev_poll(struct file *filep,
> +				struct poll_table_struct *wait)
> +{
> +	struct kfd_dev *dev = filep->private_data;
> +	struct kfd_smi_events *ev = &dev->smi_events;
> +
> +	__poll_t mask = 0;
> +
> +	poll_wait(filep, &ev->wait_queue, wait);
> +	mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
> +
> +	return mask;
> +}
> +
> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
> +			       size_t size, loff_t *offset)
> +{
> +	int ret, copied = 0;
> +	struct kfd_dev *dev = filep->private_data;
> +
> +	ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
> +	if (ret || !copied) {
> +		pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
> +				ret, copied);
> +		return ret ? ret : -EAGAIN;
> +	}
> +
> +	return copied;
> +}
> +
> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> +{
> +	struct kfd_dev *dev = filep->private_data;
> +
> +	kfifo_free(&dev->smi_events.fifo);
> +	return 0;
> +}
> +
> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
> +{
> +	struct kfd_smi_vmfault_fifo fifo_out;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
> +	struct amdgpu_task_info task_info;
> +
> +	if (!kdev->smi_events.events)
> +		return;

This condition is redundant.


> +
> +	if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
> +		return;
> +
> +	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +	amdgpu_vm_get_task_info(adev, pasid, &task_info);
> +
> +	fifo_out.group = 0;
> +	fifo_out.event = KFD_SMI_EV_VMFAULT;
> +	fifo_out.gpu_id = kdev->id;
> +	fifo_out.pid = task_info.pid;
> +	strcpy(fifo_out.task_name, task_info.task_name);
> +	kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
> +	wake_up_all(&kdev->smi_events.wait_queue);
> +}
> +
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
> +{
> +	mutex_lock(kfd_get_smi_mutex());
> +	dev->smi_events.events &= ~events;
> +	mutex_unlock(kfd_get_smi_mutex());
> +}
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
> +{
> +	int ret;
> +
> +	mutex_lock(kfd_get_smi_mutex());
> +	dev->smi_events.events |= events;
> +	mutex_unlock(kfd_get_smi_mutex());
> +
> +	/* We use the lower 32 bits for now. Each bit represents one event. If
> +	 * featured events are increased to more than 32, we'll use the upper
> +	 * bits as groups so the total number of events can be up to 32*32.
> +	 */
> +	dev->smi_events.max_events = 32;

I don't understand the explanation above. It seems to refer to the event 
subscription mechanism. But you use this as the size of the fifo. That's 
the size in bytes. Your struct kfd_smi_vmfault_fifo is bigger than that, 
so even a single entry will overflow your FIFO queue.


> +	ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
> +			 GFP_KERNEL);
> +	if (ret) {
> +		pr_err("fail to allocate kfifo\n");
> +		return ret;
> +	}
> +	init_waitqueue_head(&dev->smi_events.wait_queue);
> +
> +	return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
> +				(void *)dev, 0);
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> new file mode 100644
> index 0000000..2e320d3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -0,0 +1,41 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
> +#define KFD_SMI_EVENTS_H_INCLUDED
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +
> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
> + * can read the first 8 bytes to determine the next read length.

Note about terminology: FIFO refers to a fifo queue. I think you're 
using it to describe an entry or record in the FIFO.

I don't know why you need to split group and event? The event mask is 
only 64-bit, so there can be no more than 64 events.

> + */
> +struct kfd_smi_vmfault_fifo {
> +	uint32_t group;
> +	uint32_t event;
> +	unsigned int gpu_id;

The gpu_id is redundant because the file descriptor used to read the 
events is associated with a kfd_dev (GPU). If you want to have a single 
fifo, you should change the register API to not require a gpu_id and the 
fifo should be global, not a member of the kfd_dev.


> +	pid_t pid;
> +	char task_name[TASK_COMM_LEN];
> +};

This needs to be part of the user API because usermode needs to parse 
this structure. So it should be defined in kfd_ioctl.h. In your proposed 
API I think the size of the FIFO entry is implied by the event type. It 
would be cleaner to separate the FIFO entries into a header and payload. 
The header would be the same for all events and doesn't need to be 
duplicated in each event structure. It would contain the event type and 
size (to allow variable size events). The payload would depend on the 
event type.


> +
> +#endif
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 4f66764..6ce7c69 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>   	__u32 dmabuf_fd;	/* to KFD */
>   };
>   
> +/*
> + * KFD SMI(System Management Interface) events
> + */
> +enum kfd_smi_events_op {
> +	KFD_SMI_EVENTS_REGISTER = 1,
> +	KFD_SMI_EVENTS_UNREGISTER
> +};
> +
> +/* Event ID (mask) */
> +#define KFD_SMI_EV_VMFAULT     0x00000001
> +
> +struct kfd_ioctl_smi_events_args {
> +	__u32 op;       /* to KFD */
> +	/* upper 32 bits: group. lower 32 bits: event IDs */
> +	__u64 events;   /* to KFD */
> +	__u32 gpu_id;   /* to KFD */
> +	pid_t pid;      /* to KFD */
> +	__u32 data1;    /* from KFD */
> +	__u32 data2;
> +	__u32 data3;

This looks like you copied it from the debug API.  pid, data1-3 are not 
used by your API. I think gpu_id should not be used either if you want 
the event FIFO to be global.

Regards,
   Felix


> +};
> +
>   /* Register offset inside the remapped mmio page
>    */
>   enum kfd_mmio_remap {
> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>   		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>   
> +#define AMDKFD_IOC_SMI_EVENTS			\
> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>   #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x1F
> +#define AMDKFD_COMMAND_END		0x20
>   
>   #endif

[-- Attachment #1.2: Type: text/html, Size: 21251 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-23 18:19   ` Amber Lin
@ 2020-03-24 15:58     ` Amber Lin
  0 siblings, 0 replies; 11+ messages in thread
From: Amber Lin @ 2020-03-24 15:58 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list

Sorry for the messed-up link. This is the link (rocm-smi-lib) which 
makes use of the interface
https://github.com/RadeonOpenCompute/rocm_smi_lib

On 2020-03-23 2:19 p.m., Amber Lin wrote:
> Somehow my reply didn't seem to reach the mailing list...
>
> Hi Alex,
>
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FRadeonOpenCompute%2Frocm_smi_lib&amp;data=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423&amp;sdata=I%2BVkN3VKYFUiZ0xGW0Yst70rcqrMRXUTcd995RgfRa4%3D&amp;reserved=0 
> will use this interface. Those functions will be added to this library:
>
> /* Get a handler for watching events */
> rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
> /* Register events for the device using the handler from init */ 
> rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
>                                 rsmi_event_handle_t *handle);
> /* Wait for events. If one of the events happens, a success is 
> returned with
>  * with details in data.
>  */
> rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
> timeout_ms,
>                                 rsmi_event_data_t *data);
> /* Stop watching events */
> rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);
>
> I add the ioctl to /dev/kfd with a debate if it should be in 
> /dev/dri/card* or /dev/dri/renderD* instead. The first event to report 
> is VM fault in this patch. Other events like RAS errors, PCIe errors, 
> GPU reset… etc will be added for the system admin to diagnose the 
> system health. I see this as a system feature so I use /dev/kfd. I’ll 
> like to hear if people think differently. Thanks.
>
> Thanks.
>
> Amber
>
> On 2020-03-17 3:03 p.m., Alex Deucher wrote:
>> On Tue, Mar 17, 2020 at 1:57 PM Amber Lin <Amber.Lin@amd.com> wrote:
>>> When the compute is malfunctioning or performance drops, the system 
>>> admin
>>> will use SMI (System Management Interface) tool to 
>>> monitor/diagnostic what
>>> went wrong. This patch provides an event watch interface for the user
>>> space to register events they are interested. After the event is
>>> registered, the user can use annoymous file descriptor's pull function
>>> with wait-time specified to wait for the event to happen. Once the 
>>> event
>>> happens, the user can use read() to retrieve information related to the
>>> event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> Can you provide a link to the userspace tools that make use of this 
>> interface?
>>
>> Thanks,
>>
>> Alex
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 
>>> +++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>>>   include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>>>   9 files changed, 265 insertions(+), 2 deletions(-)
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>>>                  $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>                  $(AMDKFD_PATH)/kfd_dbgdev.o \
>>>                  $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -               $(AMDKFD_PATH)/kfd_crat.o
>>> +               $(AMDKFD_PATH)/kfd_crat.o \
>>> +               $(AMDKFD_PATH)/kfd_smi_events.o
>>>
>>>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "cik_int.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>
>>>   static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>                                          const uint32_t *ih_ring_entry,
>>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
>>> kfd_dev *dev,
>>>                  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>                  struct kfd_vm_fault_info info;
>>>
>>> +               kfd_smi_event_update_vmfault(dev, pasid);
>>>                  kfd_process_vm_fault(dev->dqm, pasid);
>>>
>>>                  memset(&info, 0, sizeof(info));
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..8e92956 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>   #include "kfd_device_queue_manager.h"
>>>   #include "kfd_dbgmgr.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>
>>>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>>   static int kfd_open(struct inode *, struct file *);
>>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file 
>>> *filep, struct kfd_process *p,
>>>          return ret;
>>>   }
>>>
>>> +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +                               struct kfd_process *p, void *data)
>>> +{
>>> +       struct kfd_ioctl_smi_events_args *args = data;
>>> +       struct kfd_dev *dev;
>>> +       int ret = 0;
>>> +
>>> +       dev = kfd_device_by_id(args->gpu_id);
>>> +       if (!dev)
>>> +               return -EINVAL;
>>> +
>>> +       switch (args->op) {
>>> +       case KFD_SMI_EVENTS_REGISTER:
>>> +               ret = kfd_smi_event_register(dev, args->events);
>>> +               if (ret >= 0) {
>>> +                       /* When the registration is successful, it 
>>> returns the
>>> +                        * annoymous inode. Pass it to the user in 
>>> data1
>>> +                        */
>>> +                       args->data1 = ret;
>>> +                       ret = 0;
>>> +               }
>>> +               break;
>>> +       case KFD_SMI_EVENTS_UNREGISTER:
>>> +               kfd_smi_event_unregister(dev, args->events);
>>> +               break;
>>> +       default:
>>> +               ret = -EINVAL;
>>> +               break;
>>> +       }
>>> +
>>> +       return ret;
>>> +}
>>> +
>>>   bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>>   {
>>>          struct kfd_local_mem_info mem_info;
>>> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc 
>>> amdkfd_ioctls[] = {
>>>
>>>          AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>>                          kfd_ioctl_alloc_queue_gws, 0),
>>> +
>>> +       AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>>> +                       kfd_ioctl_smi_events, 0),
>>>   };
>>>
>>>   #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 7866cd06..450368c 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>>          kfd->device_info = device_info;
>>>          kfd->pdev = pdev;
>>>          kfd->init_complete = false;
>>> +       kfd->smi_events.events = 0;
>>>          kfd->kfd2kgd = f2g;
>>>          atomic_set(&kfd->compute_profile, 0);
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> index e05d75e..151e83e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "soc15_int.h"
>>>   #include "kfd_device_queue_manager.h"
>>> +#include "kfd_smi_events.h"
>>>
>>>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>>                                          const uint32_t *ih_ring_entry,
>>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev 
>>> *dev,
>>>                  info.prot_read  = ring_id & 0x10;
>>>                  info.prot_write = ring_id & 0x20;
>>>
>>> +               kfd_smi_event_update_vmfault(dev, pasid);
>>>                  kfd_process_vm_fault(dev->dqm, pasid);
>>>                  kfd_signal_vm_fault_event(dev, pasid, &info);
>>>          }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 43b888b..fdb51de 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>>>          uint32_t vmid_num_kfd;
>>>   };
>>>
>>> +struct kfd_smi_events {
>>> +       uint64_t events;
>>> +       struct kfifo fifo;
>>> +       wait_queue_head_t wait_queue;
>>> +       uint32_t max_events;
>>> +};
>>> +
>>>   struct kfd_dev {
>>>          struct kgd_dev *kgd;
>>>
>>> @@ -309,6 +316,9 @@ struct kfd_dev {
>>>
>>>          /* Global GWS resource shared b/t processes*/
>>>          void *gws;
>>> +
>>> +       /* if this device is in SMI events watch */
>>> +       struct kfd_smi_events smi_events;
>>>   };
>>>
>>>   enum kfd_mempool {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> new file mode 100644
>>> index 0000000..ba9d036
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>> @@ -0,0 +1,143 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>> + * copy of this software and associated documentation files (the 
>>> "Software"),
>>> + * to deal in the Software without restriction, including without 
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to 
>>> whom the
>>> + * Software is furnished to do so, subject to the following 
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be 
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include <linux/poll.h>
>>> +#include <linux/wait.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <uapi/linux/kfd_ioctl.h>
>>> +#include "amdgpu_vm.h"
>>> +#include "kfd_priv.h"
>>> +#include "kfd_smi_events.h"
>>> +
>>> +static DEFINE_MUTEX(kfd_smi_mutex);
>>> +
>>> +struct mutex *kfd_get_smi_mutex(void)
>>> +{
>>> +       return &kfd_smi_mutex;
>>> +}
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *, struct 
>>> poll_table_struct *);
>>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, 
>>> size_t, loff_t *);
>>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>>> +
>>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>>> +
>>> +static const struct file_operations kfd_smi_ev_fops = {
>>> +       .owner = THIS_MODULE,
>>> +       .poll = kfd_smi_ev_poll,
>>> +       .read = kfd_smi_ev_read,
>>> +       .release = kfd_smi_ev_release
>>> +};
>>> +
>>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>>> +                               struct poll_table_struct *wait)
>>> +{
>>> +       struct kfd_dev *dev = filep->private_data;
>>> +       struct kfd_smi_events *ev = &dev->smi_events;
>>> +
>>> +       __poll_t mask = 0;
>>> +
>>> +       poll_wait(filep, &ev->wait_queue, wait);
>>> +       mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : 
>>> mask;
>>> +
>>> +       return mask;
>>> +}
>>> +
>>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>>> +                              size_t size, loff_t *offset)
>>> +{
>>> +       int ret, copied = 0;
>>> +       struct kfd_dev *dev = filep->private_data;
>>> +
>>> +       ret = kfifo_to_user(&dev->smi_events.fifo, user, size, 
>>> &copied);
>>> +       if (ret || !copied) {
>>> +               pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
>>> +                               ret, copied);
>>> +               return ret ? ret : -EAGAIN;
>>> +       }
>>> +
>>> +       return copied;
>>> +}
>>> +
>>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>> +{
>>> +       struct kfd_dev *dev = filep->private_data;
>>> +
>>> +       kfifo_free(&dev->smi_events.fifo);
>>> +       return 0;
>>> +}
>>> +
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t 
>>> pasid)
>>> +{
>>> +       struct kfd_smi_vmfault_fifo fifo_out;
>>> +       struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
>>> +       struct amdgpu_task_info task_info;
>>> +
>>> +       if (!kdev->smi_events.events)
>>> +               return;
>>> +
>>> +       if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
>>> +               return;
>>> +
>>> +       memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>> +       amdgpu_vm_get_task_info(adev, pasid, &task_info);
>>> +
>>> +       fifo_out.group = 0;
>>> +       fifo_out.event = KFD_SMI_EV_VMFAULT;
>>> +       fifo_out.gpu_id = kdev->id;
>>> +       fifo_out.pid = task_info.pid;
>>> +       strcpy(fifo_out.task_name, task_info.task_name);
>>> +       kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
>>> +       wake_up_all(&kdev->smi_events.wait_queue);
>>> +}
>>> +
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +       mutex_lock(kfd_get_smi_mutex());
>>> +       dev->smi_events.events &= ~events;
>>> +       mutex_unlock(kfd_get_smi_mutex());
>>> +}
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
>>> +{
>>> +       int ret;
>>> +
>>> +       mutex_lock(kfd_get_smi_mutex());
>>> +       dev->smi_events.events |= events;
>>> +       mutex_unlock(kfd_get_smi_mutex());
>>> +
>>> +       /* We use the lower 32 bits for now. Each bit represents one 
>>> event. If
>>> +        * featured events are increased to more than 32, we'll use 
>>> the upper
>>> +        * bits as groups so the total number of events can be up to 
>>> 32*32.
>>> +        */
>>> +       dev->smi_events.max_events = 32;
>>> +       ret = kfifo_alloc(&dev->smi_events.fifo, 
>>> dev->smi_events.max_events,
>>> +                        GFP_KERNEL);
>>> +       if (ret) {
>>> +               pr_err("fail to allocate kfifo\n");
>>> +               return ret;
>>> +       }
>>> + init_waitqueue_head(&dev->smi_events.wait_queue);
>>> +
>>> +       return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>>> +                               (void *)dev, 0);
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> new file mode 100644
>>> index 0000000..2e320d3
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>> @@ -0,0 +1,41 @@
>>> +/*
>>> + * Copyright 2020 Advanced Micro Devices, Inc.
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>> + * copy of this software and associated documentation files (the 
>>> "Software"),
>>> + * to deal in the Software without restriction, including without 
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to 
>>> whom the
>>> + * Software is furnished to do so, subject to the following 
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be 
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>> OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>> USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>>> +#define KFD_SMI_EVENTS_H_INCLUDED
>>> +
>>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
>>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
>>> pasid);
>>> +
>>> +/* All FIFO must start with "uint32_t group" and "uint32_t event" 
>>> so the user
>>> + * can read the first 8 bytes to determine the next read length.
>>> + */
>>> +struct kfd_smi_vmfault_fifo {
>>> +       uint32_t group;
>>> +       uint32_t event;
>>> +       unsigned int gpu_id;
>>> +       pid_t pid;
>>> +       char task_name[TASK_COMM_LEN];
>>> +};
>>> +
>>> +#endif
>>> diff --git a/include/uapi/linux/kfd_ioctl.h 
>>> b/include/uapi/linux/kfd_ioctl.h
>>> index 4f66764..6ce7c69 100644
>>> --- a/include/uapi/linux/kfd_ioctl.h
>>> +++ b/include/uapi/linux/kfd_ioctl.h
>>> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>>>          __u32 dmabuf_fd;        /* to KFD */
>>>   };
>>>
>>> +/*
>>> + * KFD SMI(System Management Interface) events
>>> + */
>>> +enum kfd_smi_events_op {
>>> +       KFD_SMI_EVENTS_REGISTER = 1,
>>> +       KFD_SMI_EVENTS_UNREGISTER
>>> +};
>>> +
>>> +/* Event ID (mask) */
>>> +#define KFD_SMI_EV_VMFAULT     0x00000001
>>> +
>>> +struct kfd_ioctl_smi_events_args {
>>> +       __u32 op;       /* to KFD */
>>> +       /* upper 32 bits: group. lower 32 bits: event IDs */
>>> +       __u64 events;   /* to KFD */
>>> +       __u32 gpu_id;   /* to KFD */
>>> +       pid_t pid;      /* to KFD */
>>> +       __u32 data1;    /* from KFD */
>>> +       __u32 data2;
>>> +       __u32 data3;
>>> +};
>>> +
>>>   /* Register offset inside the remapped mmio page
>>>    */
>>>   enum kfd_mmio_remap {
>>> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>>>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS             \
>>>                  AMDKFD_IOWR(0x1E, struct 
>>> kfd_ioctl_alloc_queue_gws_args)
>>>
>>> +#define AMDKFD_IOC_SMI_EVENTS                  \
>>> +               AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>> +
>>>   #define AMDKFD_COMMAND_START           0x01
>>> -#define AMDKFD_COMMAND_END             0x1F
>>> +#define AMDKFD_COMMAND_END             0x20
>>>
>>>   #endif
>>> -- 
>>> 2.7.4
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423&amp;sdata=U34cf9CypZMAY7us1cbtR7F7VAfP1U1OSVM6L6LYTY4%3D&amp;reserved=0 
>>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423&amp;sdata=U34cf9CypZMAY7us1cbtR7F7VAfP1U1OSVM6L6LYTY4%3D&amp;reserved=0 
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-17 19:03 ` Alex Deucher
  2020-03-18  2:03   ` Lin, Amber
  2020-03-19 18:04   ` Lin, Amber
@ 2020-03-23 18:19   ` Amber Lin
  2020-03-24 15:58     ` Amber Lin
  2 siblings, 1 reply; 11+ messages in thread
From: Amber Lin @ 2020-03-23 18:19 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list

Somehow my reply didn't seem to reach the mailing list...

Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this 
interface. Those functions will be added to this library:

/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */ 
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
                                 rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned with
  * with details in data.
  */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
timeout_ms,
                                 rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I add the ioctl to /dev/kfd with a debate if it should be in 
/dev/dri/card* or /dev/dri/renderD* instead. The first event to report 
is VM fault in this patch. Other events like RAS errors, PCIe errors, 
GPU reset… etc will be added for the system admin to diagnose the system 
health. I see this as a system feature so I use /dev/kfd. I’ll like to 
hear if people think differently. Thanks.

Thanks.

Amber

On 2020-03-17 3:03 p.m., Alex Deucher wrote:
> On Tue, Mar 17, 2020 at 1:57 PM Amber Lin <Amber.Lin@amd.com> wrote:
>> When the compute is malfunctioning or performance drops, the system admin
>> will use SMI (System Management Interface) tool to monitor/diagnostic what
>> went wrong. This patch provides an event watch interface for the user
>> space to register events they are interested. After the event is
>> registered, the user can use annoymous file descriptor's pull function
>> with wait-time specified to wait for the event to happen. Once the event
>> happens, the user can use read() to retrieve information related to the
>> event.
>>
>> VM fault event is done in this patch.
>>
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> Can you provide a link to the userspace tools that make use of this interface?
>
> Thanks,
>
> Alex
>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>>   include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>>   9 files changed, 265 insertions(+), 2 deletions(-)
>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
>> index 6147462..cc98b4a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>>                  $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>                  $(AMDKFD_PATH)/kfd_dbgdev.o \
>>                  $(AMDKFD_PATH)/kfd_dbgmgr.o \
>> -               $(AMDKFD_PATH)/kfd_crat.o
>> +               $(AMDKFD_PATH)/kfd_crat.o \
>> +               $(AMDKFD_PATH)/kfd_smi_events.o
>>
>>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> index 9f59ba9..24b4717 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> @@ -24,6 +24,7 @@
>>   #include "kfd_events.h"
>>   #include "cik_int.h"
>>   #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>
>>   static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>                                          const uint32_t *ih_ring_entry,
>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>                  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>                  struct kfd_vm_fault_info info;
>>
>> +               kfd_smi_event_update_vmfault(dev, pasid);
>>                  kfd_process_vm_fault(dev->dqm, pasid);
>>
>>                  memset(&info, 0, sizeof(info));
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index f8fa03a..8e92956 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -39,6 +39,7 @@
>>   #include "kfd_device_queue_manager.h"
>>   #include "kfd_dbgmgr.h"
>>   #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>
>>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>   static int kfd_open(struct inode *, struct file *);
>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>>          return ret;
>>   }
>>
>> +/* Handle requests for watching SMI events */
>> +static int kfd_ioctl_smi_events(struct file *filep,
>> +                               struct kfd_process *p, void *data)
>> +{
>> +       struct kfd_ioctl_smi_events_args *args = data;
>> +       struct kfd_dev *dev;
>> +       int ret = 0;
>> +
>> +       dev = kfd_device_by_id(args->gpu_id);
>> +       if (!dev)
>> +               return -EINVAL;
>> +
>> +       switch (args->op) {
>> +       case KFD_SMI_EVENTS_REGISTER:
>> +               ret = kfd_smi_event_register(dev, args->events);
>> +               if (ret >= 0) {
>> +                       /* When the registration is successful, it returns the
>> +                        * annoymous inode. Pass it to the user in data1
>> +                        */
>> +                       args->data1 = ret;
>> +                       ret = 0;
>> +               }
>> +               break;
>> +       case KFD_SMI_EVENTS_UNREGISTER:
>> +               kfd_smi_event_unregister(dev, args->events);
>> +               break;
>> +       default:
>> +               ret = -EINVAL;
>> +               break;
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>   bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>>   {
>>          struct kfd_local_mem_info mem_info;
>> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>>
>>          AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>>                          kfd_ioctl_alloc_queue_gws, 0),
>> +
>> +       AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
>> +                       kfd_ioctl_smi_events, 0),
>>   };
>>
>>   #define AMDKFD_CORE_IOCTL_COUNT        ARRAY_SIZE(amdkfd_ioctls)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 7866cd06..450368c 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>>          kfd->device_info = device_info;
>>          kfd->pdev = pdev;
>>          kfd->init_complete = false;
>> +       kfd->smi_events.events = 0;
>>          kfd->kfd2kgd = f2g;
>>          atomic_set(&kfd->compute_profile, 0);
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> index e05d75e..151e83e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> @@ -24,6 +24,7 @@
>>   #include "kfd_events.h"
>>   #include "soc15_int.h"
>>   #include "kfd_device_queue_manager.h"
>> +#include "kfd_smi_events.h"
>>
>>   static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>>                                          const uint32_t *ih_ring_entry,
>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>>                  info.prot_read  = ring_id & 0x10;
>>                  info.prot_write = ring_id & 0x20;
>>
>> +               kfd_smi_event_update_vmfault(dev, pasid);
>>                  kfd_process_vm_fault(dev->dqm, pasid);
>>                  kfd_signal_vm_fault_event(dev, pasid, &info);
>>          }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 43b888b..fdb51de 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>>          uint32_t vmid_num_kfd;
>>   };
>>
>> +struct kfd_smi_events {
>> +       uint64_t events;
>> +       struct kfifo fifo;
>> +       wait_queue_head_t wait_queue;
>> +       uint32_t max_events;
>> +};
>> +
>>   struct kfd_dev {
>>          struct kgd_dev *kgd;
>>
>> @@ -309,6 +316,9 @@ struct kfd_dev {
>>
>>          /* Global GWS resource shared b/t processes*/
>>          void *gws;
>> +
>> +       /* if this device is in SMI events watch */
>> +       struct kfd_smi_events smi_events;
>>   };
>>
>>   enum kfd_mempool {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> new file mode 100644
>> index 0000000..ba9d036
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -0,0 +1,143 @@
>> +/*
>> + * Copyright 2020 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include <linux/poll.h>
>> +#include <linux/wait.h>
>> +#include <linux/anon_inodes.h>
>> +#include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu_vm.h"
>> +#include "kfd_priv.h"
>> +#include "kfd_smi_events.h"
>> +
>> +static DEFINE_MUTEX(kfd_smi_mutex);
>> +
>> +struct mutex *kfd_get_smi_mutex(void)
>> +{
>> +       return &kfd_smi_mutex;
>> +}
>> +
>> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
>> +static int kfd_smi_ev_release(struct inode *, struct file *);
>> +
>> +static const char kfd_smi_name[] = "kfd_smi_ev";
>> +
>> +static const struct file_operations kfd_smi_ev_fops = {
>> +       .owner = THIS_MODULE,
>> +       .poll = kfd_smi_ev_poll,
>> +       .read = kfd_smi_ev_read,
>> +       .release = kfd_smi_ev_release
>> +};
>> +
>> +static __poll_t kfd_smi_ev_poll(struct file *filep,
>> +                               struct poll_table_struct *wait)
>> +{
>> +       struct kfd_dev *dev = filep->private_data;
>> +       struct kfd_smi_events *ev = &dev->smi_events;
>> +
>> +       __poll_t mask = 0;
>> +
>> +       poll_wait(filep, &ev->wait_queue, wait);
>> +       mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
>> +
>> +       return mask;
>> +}
>> +
>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
>> +                              size_t size, loff_t *offset)
>> +{
>> +       int ret, copied = 0;
>> +       struct kfd_dev *dev = filep->private_data;
>> +
>> +       ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
>> +       if (ret || !copied) {
>> +               pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
>> +                               ret, copied);
>> +               return ret ? ret : -EAGAIN;
>> +       }
>> +
>> +       return copied;
>> +}
>> +
>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>> +{
>> +       struct kfd_dev *dev = filep->private_data;
>> +
>> +       kfifo_free(&dev->smi_events.fifo);
>> +       return 0;
>> +}
>> +
>> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
>> +{
>> +       struct kfd_smi_vmfault_fifo fifo_out;
>> +       struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
>> +       struct amdgpu_task_info task_info;
>> +
>> +       if (!kdev->smi_events.events)
>> +               return;
>> +
>> +       if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
>> +               return;
>> +
>> +       memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> +       amdgpu_vm_get_task_info(adev, pasid, &task_info);
>> +
>> +       fifo_out.group = 0;
>> +       fifo_out.event = KFD_SMI_EV_VMFAULT;
>> +       fifo_out.gpu_id = kdev->id;
>> +       fifo_out.pid = task_info.pid;
>> +       strcpy(fifo_out.task_name, task_info.task_name);
>> +       kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
>> +       wake_up_all(&kdev->smi_events.wait_queue);
>> +}
>> +
>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
>> +{
>> +       mutex_lock(kfd_get_smi_mutex());
>> +       dev->smi_events.events &= ~events;
>> +       mutex_unlock(kfd_get_smi_mutex());
>> +}
>> +
>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
>> +{
>> +       int ret;
>> +
>> +       mutex_lock(kfd_get_smi_mutex());
>> +       dev->smi_events.events |= events;
>> +       mutex_unlock(kfd_get_smi_mutex());
>> +
>> +       /* We use the lower 32 bits for now. Each bit represents one event. If
>> +        * featured events are increased to more than 32, we'll use the upper
>> +        * bits as groups so the total number of events can be up to 32*32.
>> +        */
>> +       dev->smi_events.max_events = 32;
>> +       ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
>> +                        GFP_KERNEL);
>> +       if (ret) {
>> +               pr_err("fail to allocate kfifo\n");
>> +               return ret;
>> +       }
>> +       init_waitqueue_head(&dev->smi_events.wait_queue);
>> +
>> +       return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
>> +                               (void *)dev, 0);
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> new file mode 100644
>> index 0000000..2e320d3
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -0,0 +1,41 @@
>> +/*
>> + * Copyright 2020 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
>> +#define KFD_SMI_EVENTS_H_INCLUDED
>> +
>> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
>> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>> +
>> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
>> + * can read the first 8 bytes to determine the next read length.
>> + */
>> +struct kfd_smi_vmfault_fifo {
>> +       uint32_t group;
>> +       uint32_t event;
>> +       unsigned int gpu_id;
>> +       pid_t pid;
>> +       char task_name[TASK_COMM_LEN];
>> +};
>> +
>> +#endif
>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>> index 4f66764..6ce7c69 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>>          __u32 dmabuf_fd;        /* to KFD */
>>   };
>>
>> +/*
>> + * KFD SMI(System Management Interface) events
>> + */
>> +enum kfd_smi_events_op {
>> +       KFD_SMI_EVENTS_REGISTER = 1,
>> +       KFD_SMI_EVENTS_UNREGISTER
>> +};
>> +
>> +/* Event ID (mask) */
>> +#define KFD_SMI_EV_VMFAULT     0x00000001
>> +
>> +struct kfd_ioctl_smi_events_args {
>> +       __u32 op;       /* to KFD */
>> +       /* upper 32 bits: group. lower 32 bits: event IDs */
>> +       __u64 events;   /* to KFD */
>> +       __u32 gpu_id;   /* to KFD */
>> +       pid_t pid;      /* to KFD */
>> +       __u32 data1;    /* from KFD */
>> +       __u32 data2;
>> +       __u32 data3;
>> +};
>> +
>>   /* Register offset inside the remapped mmio page
>>    */
>>   enum kfd_mmio_remap {
>> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>>   #define AMDKFD_IOC_ALLOC_QUEUE_GWS             \
>>                  AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>
>> +#define AMDKFD_IOC_SMI_EVENTS                  \
>> +               AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>> +
>>   #define AMDKFD_COMMAND_START           0x01
>> -#define AMDKFD_COMMAND_END             0x1F
>> +#define AMDKFD_COMMAND_END             0x20
>>
>>   #endif
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CAmber.Lin%40amd.com%7C2f64059b66554d40898208d7caa5e374%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637200686149934820&amp;sdata=vBpnnq1xhSxEMjEWffwSLVIcSymSemDQTiEoYXj5lEE%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-17 19:03 ` Alex Deucher
  2020-03-18  2:03   ` Lin, Amber
@ 2020-03-19 18:04   ` Lin, Amber
  2020-03-23 18:19   ` Amber Lin
  2 siblings, 0 replies; 11+ messages in thread
From: Lin, Amber @ 2020-03-19 18:04 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list

[AMD Public Use]

Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. Those functions will be added to this library:

/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
                                rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned with
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,
                                rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I add the ioctl to /dev/kfd with a debate if it should be in /dev/dri/card* or /dev/dri/renderD* instead. The first event to report is VM fault in this patch. Other events like RAS errors, PCIe errors, GPU reset… etc will be added for the system admin to diagnose the system health. I see this as a system feature so I use /dev/kfd. I’ll like to hear if people think differently. Thanks.

Regards,
Amber

-----Original Message-----
From: Alex Deucher <alexdeucher@gmail.com> 
Sent: Tuesday, March 17, 2020 3:03 PM
To: Lin, Amber <Amber.Lin@amd.com>
Cc: amd-gfx list <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] drm/amdkfd: Provide SMI events watch

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When the compute is malfunctioning or performance drops, the system 
> admin will use SMI (System Management Interface) tool to 
> monitor/diagnostic what went wrong. This patch provides an event watch 
> interface for the user space to register events they are interested. 
> After the event is registered, the user can use annoymous file 
> descriptor's pull function with wait-time specified to wait for the 
> event to happen. Once the event happens, the user can use read() to 
> retrieve information related to the event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>  include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>  9 files changed, 265 insertions(+), 2 deletions(-)  create mode 
> 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>                 $(AMDKFD_PATH)/kfd_int_process_v9.o \
>                 $(AMDKFD_PATH)/kfd_dbgdev.o \
>                 $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -               $(AMDKFD_PATH)/kfd_crat.o
> +               $(AMDKFD_PATH)/kfd_crat.o \
> +               $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git 
> a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry, 
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>                 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>                 struct kfd_vm_fault_info info;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>
>                 memset(&info, 0, sizeof(info)); diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);  
> static int kfd_open(struct inode *, struct file *); @@ -1243,6 
> +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>         return ret;
>  }
>
> +/* Handle requests for watching SMI events */ static int 
> +kfd_ioctl_smi_events(struct file *filep,
> +                               struct kfd_process *p, void *data) {
> +       struct kfd_ioctl_smi_events_args *args = data;
> +       struct kfd_dev *dev;
> +       int ret = 0;
> +
> +       dev = kfd_device_by_id(args->gpu_id);
> +       if (!dev)
> +               return -EINVAL;
> +
> +       switch (args->op) {
> +       case KFD_SMI_EVENTS_REGISTER:
> +               ret = kfd_smi_event_register(dev, args->events);
> +               if (ret >= 0) {
> +                       /* When the registration is successful, it returns the
> +                        * annoymous inode. Pass it to the user in data1
> +                        */
> +                       args->data1 = ret;
> +                       ret = 0;
> +               }
> +               break;
> +       case KFD_SMI_EVENTS_UNREGISTER:
> +               kfd_smi_event_unregister(dev, args->events);
> +               break;
> +       default:
> +               ret = -EINVAL;
> +               break;
> +       }
> +
> +       return ret;
> +}
> +
>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)  {
>         struct kfd_local_mem_info mem_info; @@ -1827,6 +1862,9 @@ 
> static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>
>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>                         kfd_ioctl_alloc_queue_gws, 0),
> +
> +       AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> +                       kfd_ioctl_smi_events, 0),
>  };
>
>  #define AMDKFD_CORE_IOCTL_COUNT        ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 7866cd06..450368c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>         kfd->device_info = device_info;
>         kfd->pdev = pdev;
>         kfd->init_complete = false;
> +       kfd->smi_events.events = 0;
>         kfd->kfd2kgd = f2g;
>         atomic_set(&kfd->compute_profile, 0);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..151e83e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
> +#include "kfd_smi_events.h"
>
>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry, 
> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>                 info.prot_read  = ring_id & 0x10;
>                 info.prot_write = ring_id & 0x20;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>                 kfd_signal_vm_fault_event(dev, pasid, &info);
>         }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 43b888b..fdb51de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>         uint32_t vmid_num_kfd;
>  };
>
> +struct kfd_smi_events {
> +       uint64_t events;
> +       struct kfifo fifo;
> +       wait_queue_head_t wait_queue;
> +       uint32_t max_events;
> +};
> +
>  struct kfd_dev {
>         struct kgd_dev *kgd;
>
> @@ -309,6 +316,9 @@ struct kfd_dev {
>
>         /* Global GWS resource shared b/t processes*/
>         void *gws;
> +
> +       /* if this device is in SMI events watch */
> +       struct kfd_smi_events smi_events;
>  };
>
>  enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> new file mode 100644
> index 0000000..ba9d036
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -0,0 +1,143 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the 
> +"Software"),
> + * to deal in the Software without restriction, including without 
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/poll.h>
> +#include <linux/wait.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_vm.h"
> +#include "kfd_priv.h"
> +#include "kfd_smi_events.h"
> +
> +static DEFINE_MUTEX(kfd_smi_mutex);
> +
> +struct mutex *kfd_get_smi_mutex(void) {
> +       return &kfd_smi_mutex;
> +}
> +
> +static __poll_t kfd_smi_ev_poll(struct file *, struct 
> +poll_table_struct *); static ssize_t kfd_smi_ev_read(struct file *, 
> +char __user *, size_t, loff_t *); static int 
> +kfd_smi_ev_release(struct inode *, struct file *);
> +
> +static const char kfd_smi_name[] = "kfd_smi_ev";
> +
> +static const struct file_operations kfd_smi_ev_fops = {
> +       .owner = THIS_MODULE,
> +       .poll = kfd_smi_ev_poll,
> +       .read = kfd_smi_ev_read,
> +       .release = kfd_smi_ev_release
> +};
> +
> +static __poll_t kfd_smi_ev_poll(struct file *filep,
> +                               struct poll_table_struct *wait) {
> +       struct kfd_dev *dev = filep->private_data;
> +       struct kfd_smi_events *ev = &dev->smi_events;
> +
> +       __poll_t mask = 0;
> +
> +       poll_wait(filep, &ev->wait_queue, wait);
> +       mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : 
> + mask;
> +
> +       return mask;
> +}
> +
> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
> +                              size_t size, loff_t *offset) {
> +       int ret, copied = 0;
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
> +       if (ret || !copied) {
> +               pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
> +                               ret, copied);
> +               return ret ? ret : -EAGAIN;
> +       }
> +
> +       return copied;
> +}
> +
> +static int kfd_smi_ev_release(struct inode *inode, struct file 
> +*filep) {
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       kfifo_free(&dev->smi_events.fifo);
> +       return 0;
> +}
> +
> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t 
> +pasid) {
> +       struct kfd_smi_vmfault_fifo fifo_out;
> +       struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
> +       struct amdgpu_task_info task_info;
> +
> +       if (!kdev->smi_events.events)
> +               return;
> +
> +       if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
> +               return;
> +
> +       memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +       amdgpu_vm_get_task_info(adev, pasid, &task_info);
> +
> +       fifo_out.group = 0;
> +       fifo_out.event = KFD_SMI_EV_VMFAULT;
> +       fifo_out.gpu_id = kdev->id;
> +       fifo_out.pid = task_info.pid;
> +       strcpy(fifo_out.task_name, task_info.task_name);
> +       kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
> +       wake_up_all(&kdev->smi_events.wait_queue);
> +}
> +
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events) {
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events &= ~events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +}
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events) {
> +       int ret;
> +
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events |= events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +
> +       /* We use the lower 32 bits for now. Each bit represents one event. If
> +        * featured events are increased to more than 32, we'll use the upper
> +        * bits as groups so the total number of events can be up to 32*32.
> +        */
> +       dev->smi_events.max_events = 32;
> +       ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
> +                        GFP_KERNEL);
> +       if (ret) {
> +               pr_err("fail to allocate kfifo\n");
> +               return ret;
> +       }
> +       init_waitqueue_head(&dev->smi_events.wait_queue);
> +
> +       return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
> +                               (void *)dev, 0); }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> new file mode 100644
> index 0000000..2e320d3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -0,0 +1,41 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person 
> +obtaining a
> + * copy of this software and associated documentation files (the 
> +"Software"),
> + * to deal in the Software without restriction, including without 
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute, 
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom 
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be 
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
> +#define KFD_SMI_EVENTS_H_INCLUDED
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events); 
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events); 
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
> +pasid);
> +
> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so 
> +the user
> + * can read the first 8 bytes to determine the next read length.
> + */
> +struct kfd_smi_vmfault_fifo {
> +       uint32_t group;
> +       uint32_t event;
> +       unsigned int gpu_id;
> +       pid_t pid;
> +       char task_name[TASK_COMM_LEN]; };
> +
> +#endif
> diff --git a/include/uapi/linux/kfd_ioctl.h 
> b/include/uapi/linux/kfd_ioctl.h index 4f66764..6ce7c69 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>         __u32 dmabuf_fd;        /* to KFD */
>  };
>
> +/*
> + * KFD SMI(System Management Interface) events  */ enum 
> +kfd_smi_events_op {
> +       KFD_SMI_EVENTS_REGISTER = 1,
> +       KFD_SMI_EVENTS_UNREGISTER
> +};
> +
> +/* Event ID (mask) */
> +#define KFD_SMI_EV_VMFAULT     0x00000001
> +
> +struct kfd_ioctl_smi_events_args {
> +       __u32 op;       /* to KFD */
> +       /* upper 32 bits: group. lower 32 bits: event IDs */
> +       __u64 events;   /* to KFD */
> +       __u32 gpu_id;   /* to KFD */
> +       pid_t pid;      /* to KFD */
> +       __u32 data1;    /* from KFD */
> +       __u32 data2;
> +       __u32 data3;
> +};
> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS             \
>                 AMDKFD_IOWR(0x1E, struct 
> kfd_ioctl_alloc_queue_gws_args)
>
> +#define AMDKFD_IOC_SMI_EVENTS                  \
> +               AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>  #define AMDKFD_COMMAND_START           0x01
> -#define AMDKFD_COMMAND_END             0x1F
> +#define AMDKFD_COMMAND_END             0x20
>
>  #endif
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CAm
> ber.Lin%40amd.com%7C2f64059b66554d40898208d7caa5e374%7C3dd8961fe4884e6
> 08e11a82d994e183d%7C0%7C0%7C637200686149934820&amp;sdata=vBpnnq1xhSxEM
> jEWffwSLVIcSymSemDQTiEoYXj5lEE%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-17 19:03 ` Alex Deucher
@ 2020-03-18  2:03   ` Lin, Amber
  2020-03-19 18:04   ` Lin, Amber
  2020-03-23 18:19   ` Amber Lin
  2 siblings, 0 replies; 11+ messages in thread
From: Lin, Amber @ 2020-03-18  2:03 UTC (permalink / raw)
  To: Alex Deucher; +Cc: amd-gfx list


[-- Attachment #1.1: Type: text/plain, Size: 18650 bytes --]

[AMD Official Use Only - Internal Distribution Only]



________________________________
From: Alex Deucher <alexdeucher@gmail.com>
Sent: Tuesday, March 17, 2020 3:03 PM
To: Lin, Amber <Amber.Lin@amd.com>
Cc: amd-gfx list <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] drm/amdkfd: Provide SMI events watch

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnostic what
> went wrong. This patch provides an event watch interface for the user
> space to register events they are interested. After the event is
> registered, the user can use annoymous file descriptor's pull function
> with wait-time specified to wait for the event to happen. Once the event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex
=====
Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. Those functions will be added to this library:

/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
                                rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned with
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,
                                rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

Regards,
Amber

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>  include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>  9 files changed, 265 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>                 $(AMDKFD_PATH)/kfd_int_process_v9.o \
>                 $(AMDKFD_PATH)/kfd_dbgdev.o \
>                 $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -               $(AMDKFD_PATH)/kfd_crat.o
> +               $(AMDKFD_PATH)/kfd_crat.o \
> +               $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>                 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>                 struct kfd_vm_fault_info info;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>
>                 memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>         return ret;
>  }
>
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> +                               struct kfd_process *p, void *data)
> +{
> +       struct kfd_ioctl_smi_events_args *args = data;
> +       struct kfd_dev *dev;
> +       int ret = 0;
> +
> +       dev = kfd_device_by_id(args->gpu_id);
> +       if (!dev)
> +               return -EINVAL;
> +
> +       switch (args->op) {
> +       case KFD_SMI_EVENTS_REGISTER:
> +               ret = kfd_smi_event_register(dev, args->events);
> +               if (ret >= 0) {
> +                       /* When the registration is successful, it returns the
> +                        * annoymous inode. Pass it to the user in data1
> +                        */
> +                       args->data1 = ret;
> +                       ret = 0;
> +               }
> +               break;
> +       case KFD_SMI_EVENTS_UNREGISTER:
> +               kfd_smi_event_unregister(dev, args->events);
> +               break;
> +       default:
> +               ret = -EINVAL;
> +               break;
> +       }
> +
> +       return ret;
> +}
> +
>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>  {
>         struct kfd_local_mem_info mem_info;
> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>
>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>                         kfd_ioctl_alloc_queue_gws, 0),
> +
> +       AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> +                       kfd_ioctl_smi_events, 0),
>  };
>
>  #define AMDKFD_CORE_IOCTL_COUNT        ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 7866cd06..450368c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>         kfd->device_info = device_info;
>         kfd->pdev = pdev;
>         kfd->init_complete = false;
> +       kfd->smi_events.events = 0;
>         kfd->kfd2kgd = f2g;
>         atomic_set(&kfd->compute_profile, 0);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..151e83e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
> +#include "kfd_smi_events.h"
>
>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry,
> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>                 info.prot_read  = ring_id & 0x10;
>                 info.prot_write = ring_id & 0x20;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>                 kfd_signal_vm_fault_event(dev, pasid, &info);
>         }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 43b888b..fdb51de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>         uint32_t vmid_num_kfd;
>  };
>
> +struct kfd_smi_events {
> +       uint64_t events;
> +       struct kfifo fifo;
> +       wait_queue_head_t wait_queue;
> +       uint32_t max_events;
> +};
> +
>  struct kfd_dev {
>         struct kgd_dev *kgd;
>
> @@ -309,6 +316,9 @@ struct kfd_dev {
>
>         /* Global GWS resource shared b/t processes*/
>         void *gws;
> +
> +       /* if this device is in SMI events watch */
> +       struct kfd_smi_events smi_events;
>  };
>
>  enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> new file mode 100644
> index 0000000..ba9d036
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -0,0 +1,143 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/poll.h>
> +#include <linux/wait.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_vm.h"
> +#include "kfd_priv.h"
> +#include "kfd_smi_events.h"
> +
> +static DEFINE_MUTEX(kfd_smi_mutex);
> +
> +struct mutex *kfd_get_smi_mutex(void)
> +{
> +       return &kfd_smi_mutex;
> +}
> +
> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
> +static int kfd_smi_ev_release(struct inode *, struct file *);
> +
> +static const char kfd_smi_name[] = "kfd_smi_ev";
> +
> +static const struct file_operations kfd_smi_ev_fops = {
> +       .owner = THIS_MODULE,
> +       .poll = kfd_smi_ev_poll,
> +       .read = kfd_smi_ev_read,
> +       .release = kfd_smi_ev_release
> +};
> +
> +static __poll_t kfd_smi_ev_poll(struct file *filep,
> +                               struct poll_table_struct *wait)
> +{
> +       struct kfd_dev *dev = filep->private_data;
> +       struct kfd_smi_events *ev = &dev->smi_events;
> +
> +       __poll_t mask = 0;
> +
> +       poll_wait(filep, &ev->wait_queue, wait);
> +       mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
> +
> +       return mask;
> +}
> +
> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
> +                              size_t size, loff_t *offset)
> +{
> +       int ret, copied = 0;
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
> +       if (ret || !copied) {
> +               pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
> +                               ret, copied);
> +               return ret ? ret : -EAGAIN;
> +       }
> +
> +       return copied;
> +}
> +
> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> +{
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       kfifo_free(&dev->smi_events.fifo);
> +       return 0;
> +}
> +
> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
> +{
> +       struct kfd_smi_vmfault_fifo fifo_out;
> +       struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
> +       struct amdgpu_task_info task_info;
> +
> +       if (!kdev->smi_events.events)
> +               return;
> +
> +       if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
> +               return;
> +
> +       memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +       amdgpu_vm_get_task_info(adev, pasid, &task_info);
> +
> +       fifo_out.group = 0;
> +       fifo_out.event = KFD_SMI_EV_VMFAULT;
> +       fifo_out.gpu_id = kdev->id;
> +       fifo_out.pid = task_info.pid;
> +       strcpy(fifo_out.task_name, task_info.task_name);
> +       kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
> +       wake_up_all(&kdev->smi_events.wait_queue);
> +}
> +
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
> +{
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events &= ~events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +}
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
> +{
> +       int ret;
> +
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events |= events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +
> +       /* We use the lower 32 bits for now. Each bit represents one event. If
> +        * featured events are increased to more than 32, we'll use the upper
> +        * bits as groups so the total number of events can be up to 32*32.
> +        */
> +       dev->smi_events.max_events = 32;
> +       ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
> +                        GFP_KERNEL);
> +       if (ret) {
> +               pr_err("fail to allocate kfifo\n");
> +               return ret;
> +       }
> +       init_waitqueue_head(&dev->smi_events.wait_queue);
> +
> +       return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
> +                               (void *)dev, 0);
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> new file mode 100644
> index 0000000..2e320d3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -0,0 +1,41 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
> +#define KFD_SMI_EVENTS_H_INCLUDED
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +
> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
> + * can read the first 8 bytes to determine the next read length.
> + */
> +struct kfd_smi_vmfault_fifo {
> +       uint32_t group;
> +       uint32_t event;
> +       unsigned int gpu_id;
> +       pid_t pid;
> +       char task_name[TASK_COMM_LEN];
> +};
> +
> +#endif
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 4f66764..6ce7c69 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>         __u32 dmabuf_fd;        /* to KFD */
>  };
>
> +/*
> + * KFD SMI(System Management Interface) events
> + */
> +enum kfd_smi_events_op {
> +       KFD_SMI_EVENTS_REGISTER = 1,
> +       KFD_SMI_EVENTS_UNREGISTER
> +};
> +
> +/* Event ID (mask) */
> +#define KFD_SMI_EV_VMFAULT     0x00000001
> +
> +struct kfd_ioctl_smi_events_args {
> +       __u32 op;       /* to KFD */
> +       /* upper 32 bits: group. lower 32 bits: event IDs */
> +       __u64 events;   /* to KFD */
> +       __u32 gpu_id;   /* to KFD */
> +       pid_t pid;      /* to KFD */
> +       __u32 data1;    /* from KFD */
> +       __u32 data2;
> +       __u32 data3;
> +};
> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS             \
>                 AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>
> +#define AMDKFD_IOC_SMI_EVENTS                  \
> +               AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>  #define AMDKFD_COMMAND_START           0x01
> -#define AMDKFD_COMMAND_END             0x1F
> +#define AMDKFD_COMMAND_END             0x20
>
>  #endif
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7CAmber.Lin%40amd.com%7C2f64059b66554d40898208d7caa5e374%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637200686149934820&amp;sdata=vBpnnq1xhSxEMjEWffwSLVIcSymSemDQTiEoYXj5lEE%3D&amp;reserved=0

[-- Attachment #1.2: Type: text/html, Size: 34274 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] drm/amdkfd: Provide SMI events watch
  2020-03-17 17:57 Amber Lin
@ 2020-03-17 19:03 ` Alex Deucher
  2020-03-18  2:03   ` Lin, Amber
                     ` (2 more replies)
  2020-03-26 20:53 ` Felix Kuehling
  1 sibling, 3 replies; 11+ messages in thread
From: Alex Deucher @ 2020-03-17 19:03 UTC (permalink / raw)
  To: Amber Lin; +Cc: amd-gfx list

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin <Amber.Lin@amd.com> wrote:
>
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnostic what
> went wrong. This patch provides an event watch interface for the user
> space to register events they are interested. After the event is
> registered, the user can use annoymous file descriptor's pull function
> with wait-time specified to wait for the event to happen. Once the event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
>  include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
>  9 files changed, 265 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>                 $(AMDKFD_PATH)/kfd_int_process_v9.o \
>                 $(AMDKFD_PATH)/kfd_dbgdev.o \
>                 $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -               $(AMDKFD_PATH)/kfd_crat.o
> +               $(AMDKFD_PATH)/kfd_crat.o \
> +               $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>                 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>                 struct kfd_vm_fault_info info;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>
>                 memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
>         return ret;
>  }
>
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> +                               struct kfd_process *p, void *data)
> +{
> +       struct kfd_ioctl_smi_events_args *args = data;
> +       struct kfd_dev *dev;
> +       int ret = 0;
> +
> +       dev = kfd_device_by_id(args->gpu_id);
> +       if (!dev)
> +               return -EINVAL;
> +
> +       switch (args->op) {
> +       case KFD_SMI_EVENTS_REGISTER:
> +               ret = kfd_smi_event_register(dev, args->events);
> +               if (ret >= 0) {
> +                       /* When the registration is successful, it returns the
> +                        * annoymous inode. Pass it to the user in data1
> +                        */
> +                       args->data1 = ret;
> +                       ret = 0;
> +               }
> +               break;
> +       case KFD_SMI_EVENTS_UNREGISTER:
> +               kfd_smi_event_unregister(dev, args->events);
> +               break;
> +       default:
> +               ret = -EINVAL;
> +               break;
> +       }
> +
> +       return ret;
> +}
> +
>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>  {
>         struct kfd_local_mem_info mem_info;
> @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
>
>         AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>                         kfd_ioctl_alloc_queue_gws, 0),
> +
> +       AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> +                       kfd_ioctl_smi_events, 0),
>  };
>
>  #define AMDKFD_CORE_IOCTL_COUNT        ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 7866cd06..450368c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
>         kfd->device_info = device_info;
>         kfd->pdev = pdev;
>         kfd->init_complete = false;
> +       kfd->smi_events.events = 0;
>         kfd->kfd2kgd = f2g;
>         atomic_set(&kfd->compute_profile, 0);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index e05d75e..151e83e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "soc15_int.h"
>  #include "kfd_device_queue_manager.h"
> +#include "kfd_smi_events.h"
>
>  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
>                                         const uint32_t *ih_ring_entry,
> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>                 info.prot_read  = ring_id & 0x10;
>                 info.prot_write = ring_id & 0x20;
>
> +               kfd_smi_event_update_vmfault(dev, pasid);
>                 kfd_process_vm_fault(dev->dqm, pasid);
>                 kfd_signal_vm_fault_event(dev, pasid, &info);
>         }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 43b888b..fdb51de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -223,6 +223,13 @@ struct kfd_vmid_info {
>         uint32_t vmid_num_kfd;
>  };
>
> +struct kfd_smi_events {
> +       uint64_t events;
> +       struct kfifo fifo;
> +       wait_queue_head_t wait_queue;
> +       uint32_t max_events;
> +};
> +
>  struct kfd_dev {
>         struct kgd_dev *kgd;
>
> @@ -309,6 +316,9 @@ struct kfd_dev {
>
>         /* Global GWS resource shared b/t processes*/
>         void *gws;
> +
> +       /* if this device is in SMI events watch */
> +       struct kfd_smi_events smi_events;
>  };
>
>  enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> new file mode 100644
> index 0000000..ba9d036
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -0,0 +1,143 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/poll.h>
> +#include <linux/wait.h>
> +#include <linux/anon_inodes.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu_vm.h"
> +#include "kfd_priv.h"
> +#include "kfd_smi_events.h"
> +
> +static DEFINE_MUTEX(kfd_smi_mutex);
> +
> +struct mutex *kfd_get_smi_mutex(void)
> +{
> +       return &kfd_smi_mutex;
> +}
> +
> +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
> +static int kfd_smi_ev_release(struct inode *, struct file *);
> +
> +static const char kfd_smi_name[] = "kfd_smi_ev";
> +
> +static const struct file_operations kfd_smi_ev_fops = {
> +       .owner = THIS_MODULE,
> +       .poll = kfd_smi_ev_poll,
> +       .read = kfd_smi_ev_read,
> +       .release = kfd_smi_ev_release
> +};
> +
> +static __poll_t kfd_smi_ev_poll(struct file *filep,
> +                               struct poll_table_struct *wait)
> +{
> +       struct kfd_dev *dev = filep->private_data;
> +       struct kfd_smi_events *ev = &dev->smi_events;
> +
> +       __poll_t mask = 0;
> +
> +       poll_wait(filep, &ev->wait_queue, wait);
> +       mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
> +
> +       return mask;
> +}
> +
> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
> +                              size_t size, loff_t *offset)
> +{
> +       int ret, copied = 0;
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
> +       if (ret || !copied) {
> +               pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
> +                               ret, copied);
> +               return ret ? ret : -EAGAIN;
> +       }
> +
> +       return copied;
> +}
> +
> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
> +{
> +       struct kfd_dev *dev = filep->private_data;
> +
> +       kfifo_free(&dev->smi_events.fifo);
> +       return 0;
> +}
> +
> +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
> +{
> +       struct kfd_smi_vmfault_fifo fifo_out;
> +       struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
> +       struct amdgpu_task_info task_info;
> +
> +       if (!kdev->smi_events.events)
> +               return;
> +
> +       if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
> +               return;
> +
> +       memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +       amdgpu_vm_get_task_info(adev, pasid, &task_info);
> +
> +       fifo_out.group = 0;
> +       fifo_out.event = KFD_SMI_EV_VMFAULT;
> +       fifo_out.gpu_id = kdev->id;
> +       fifo_out.pid = task_info.pid;
> +       strcpy(fifo_out.task_name, task_info.task_name);
> +       kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
> +       wake_up_all(&kdev->smi_events.wait_queue);
> +}
> +
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
> +{
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events &= ~events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +}
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
> +{
> +       int ret;
> +
> +       mutex_lock(kfd_get_smi_mutex());
> +       dev->smi_events.events |= events;
> +       mutex_unlock(kfd_get_smi_mutex());
> +
> +       /* We use the lower 32 bits for now. Each bit represents one event. If
> +        * featured events are increased to more than 32, we'll use the upper
> +        * bits as groups so the total number of events can be up to 32*32.
> +        */
> +       dev->smi_events.max_events = 32;
> +       ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
> +                        GFP_KERNEL);
> +       if (ret) {
> +               pr_err("fail to allocate kfifo\n");
> +               return ret;
> +       }
> +       init_waitqueue_head(&dev->smi_events.wait_queue);
> +
> +       return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
> +                               (void *)dev, 0);
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> new file mode 100644
> index 0000000..2e320d3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -0,0 +1,41 @@
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_SMI_EVENTS_H_INCLUDED
> +#define KFD_SMI_EVENTS_H_INCLUDED
> +
> +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +
> +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
> + * can read the first 8 bytes to determine the next read length.
> + */
> +struct kfd_smi_vmfault_fifo {
> +       uint32_t group;
> +       uint32_t event;
> +       unsigned int gpu_id;
> +       pid_t pid;
> +       char task_name[TASK_COMM_LEN];
> +};
> +
> +#endif
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 4f66764..6ce7c69 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
>         __u32 dmabuf_fd;        /* to KFD */
>  };
>
> +/*
> + * KFD SMI(System Management Interface) events
> + */
> +enum kfd_smi_events_op {
> +       KFD_SMI_EVENTS_REGISTER = 1,
> +       KFD_SMI_EVENTS_UNREGISTER
> +};
> +
> +/* Event ID (mask) */
> +#define KFD_SMI_EV_VMFAULT     0x00000001
> +
> +struct kfd_ioctl_smi_events_args {
> +       __u32 op;       /* to KFD */
> +       /* upper 32 bits: group. lower 32 bits: event IDs */
> +       __u64 events;   /* to KFD */
> +       __u32 gpu_id;   /* to KFD */
> +       pid_t pid;      /* to KFD */
> +       __u32 data1;    /* from KFD */
> +       __u32 data2;
> +       __u32 data3;
> +};
> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +568,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS             \
>                 AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>
> +#define AMDKFD_IOC_SMI_EVENTS                  \
> +               AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>  #define AMDKFD_COMMAND_START           0x01
> -#define AMDKFD_COMMAND_END             0x1F
> +#define AMDKFD_COMMAND_END             0x20
>
>  #endif
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] drm/amdkfd: Provide SMI events watch
@ 2020-03-17 17:57 Amber Lin
  2020-03-17 19:03 ` Alex Deucher
  2020-03-26 20:53 ` Felix Kuehling
  0 siblings, 2 replies; 11+ messages in thread
From: Amber Lin @ 2020-03-17 17:57 UTC (permalink / raw)
  To: amd-gfx; +Cc: Amber Lin

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's pull function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile              |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  38 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c          |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  10 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 143 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  41 +++++++
 include/uapi/linux/kfd_ioctl.h                   |  27 ++++-
 9 files changed, 265 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_dbgdev.o \
 		$(AMDKFD_PATH)/kfd_dbgmgr.o \
-		$(AMDKFD_PATH)/kfd_crat.o
+		$(AMDKFD_PATH)/kfd_crat.o \
+		$(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 		struct kfd_vm_fault_info info;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 
 		memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
 	return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_smi_events_args *args = data;
+	struct kfd_dev *dev;
+	int ret = 0;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (!dev)
+		return -EINVAL;
+
+	switch (args->op) {
+	case KFD_SMI_EVENTS_REGISTER:
+		ret = kfd_smi_event_register(dev, args->events);
+		if (ret >= 0) {
+			/* When the registration is successful, it returns the
+			 * annoymous inode. Pass it to the user in data1
+			 */
+			args->data1 = ret;
+			ret = 0;
+		}
+		break;
+	case KFD_SMI_EVENTS_UNREGISTER:
+		kfd_smi_event_unregister(dev, args->events);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
 	struct kfd_local_mem_info mem_info;
@@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
 			kfd_ioctl_alloc_queue_gws, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+			kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 7866cd06..450368c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 	kfd->device_info = device_info;
 	kfd->pdev = pdev;
 	kfd->init_complete = false;
+	kfd->smi_events.events = 0;
 	kfd->kfd2kgd = f2g;
 	atomic_set(&kfd->compute_profile, 0);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e05d75e..151e83e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_smi_events.h"
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 43b888b..fdb51de 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -223,6 +223,13 @@ struct kfd_vmid_info {
 	uint32_t vmid_num_kfd;
 };
 
+struct kfd_smi_events {
+	uint64_t events;
+	struct kfifo fifo;
+	wait_queue_head_t wait_queue;
+	uint32_t max_events;
+};
+
 struct kfd_dev {
 	struct kgd_dev *kgd;
 
@@ -309,6 +316,9 @@ struct kfd_dev {
 
 	/* Global GWS resource shared b/t processes*/
 	void *gws;
+
+	/* if this device is in SMI events watch */
+	struct kfd_smi_events smi_events;
 };
 
 enum kfd_mempool {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
new file mode 100644
index 0000000..ba9d036
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_vm.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
+
+static DEFINE_MUTEX(kfd_smi_mutex);
+
+struct mutex *kfd_get_smi_mutex(void)
+{
+	return &kfd_smi_mutex;
+}
+
+static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
+static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
+static int kfd_smi_ev_release(struct inode *, struct file *);
+
+static const char kfd_smi_name[] = "kfd_smi_ev";
+
+static const struct file_operations kfd_smi_ev_fops = {
+	.owner = THIS_MODULE,
+	.poll = kfd_smi_ev_poll,
+	.read = kfd_smi_ev_read,
+	.release = kfd_smi_ev_release
+};
+
+static __poll_t kfd_smi_ev_poll(struct file *filep,
+				struct poll_table_struct *wait)
+{
+	struct kfd_dev *dev = filep->private_data;
+	struct kfd_smi_events *ev = &dev->smi_events;
+
+	__poll_t mask = 0;
+
+	poll_wait(filep, &ev->wait_queue, wait);
+	mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask;
+
+	return mask;
+}
+
+static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
+			       size_t size, loff_t *offset)
+{
+	int ret, copied = 0;
+	struct kfd_dev *dev = filep->private_data;
+
+	ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
+	if (ret || !copied) {
+		pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n",
+				ret, copied);
+		return ret ? ret : -EAGAIN;
+	}
+
+	return copied;
+}
+
+static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
+{
+	struct kfd_dev *dev = filep->private_data;
+
+	kfifo_free(&dev->smi_events.fifo);
+	return 0;
+}
+
+void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid)
+{
+	struct kfd_smi_vmfault_fifo fifo_out;
+	struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd;
+	struct amdgpu_task_info task_info;
+
+	if (!kdev->smi_events.events)
+		return;
+
+	if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT))
+		return;
+
+	memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+	amdgpu_vm_get_task_info(adev, pasid, &task_info);
+
+	fifo_out.group = 0;
+	fifo_out.event = KFD_SMI_EV_VMFAULT;
+	fifo_out.gpu_id = kdev->id;
+	fifo_out.pid = task_info.pid;
+	strcpy(fifo_out.task_name, task_info.task_name);
+	kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out));
+	wake_up_all(&kdev->smi_events.wait_queue);
+}
+
+void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events)
+{
+	mutex_lock(kfd_get_smi_mutex());
+	dev->smi_events.events &= ~events;
+	mutex_unlock(kfd_get_smi_mutex());
+}
+
+int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events)
+{
+	int ret;
+
+	mutex_lock(kfd_get_smi_mutex());
+	dev->smi_events.events |= events;
+	mutex_unlock(kfd_get_smi_mutex());
+
+	/* We use the lower 32 bits for now. Each bit represents one event. If
+	 * featured events are increased to more than 32, we'll use the upper
+	 * bits as groups so the total number of events can be up to 32*32.
+	 */
+	dev->smi_events.max_events = 32;
+	ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events,
+			 GFP_KERNEL);
+	if (ret) {
+		pr_err("fail to allocate kfifo\n");
+		return ret;
+	}
+	init_waitqueue_head(&dev->smi_events.wait_queue);
+
+	return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
+				(void *)dev, 0);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
new file mode 100644
index 0000000..2e320d3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_SMI_EVENTS_H_INCLUDED
+#define KFD_SMI_EVENTS_H_INCLUDED
+
+int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events);
+void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events);
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+
+/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user
+ * can read the first 8 bytes to determine the next read length.
+ */
+struct kfd_smi_vmfault_fifo {
+	uint32_t group;
+	uint32_t event;
+	unsigned int gpu_id;
+	pid_t pid;
+	char task_name[TASK_COMM_LEN];
+};
+
+#endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 4f66764..6ce7c69 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/*
+ * KFD SMI(System Management Interface) events
+ */
+enum kfd_smi_events_op {
+	KFD_SMI_EVENTS_REGISTER = 1,
+	KFD_SMI_EVENTS_UNREGISTER
+};
+
+/* Event ID (mask) */
+#define KFD_SMI_EV_VMFAULT     0x00000001
+
+struct kfd_ioctl_smi_events_args {
+	__u32 op;       /* to KFD */
+	/* upper 32 bits: group. lower 32 bits: event IDs */
+	__u64 events;   /* to KFD */
+	__u32 gpu_id;   /* to KFD */
+	pid_t pid;      /* to KFD */
+	__u32 data1;    /* from KFD */
+	__u32 data2;
+	__u32 data3;
+};
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -546,7 +568,10 @@ enum kfd_mmio_remap {
 #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
 		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
 
+#define AMDKFD_IOC_SMI_EVENTS			\
+		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1F
+#define AMDKFD_COMMAND_END		0x20
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-05-14  0:52 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-13 19:41 [PATCH] drm/amdkfd: Provide SMI events watch Amber Lin
2020-05-14  0:52 ` Felix Kuehling
  -- strict thread matches above, loose matches on Subject: below --
2020-03-17 17:57 Amber Lin
2020-03-17 19:03 ` Alex Deucher
2020-03-18  2:03   ` Lin, Amber
2020-03-19 18:04   ` Lin, Amber
2020-03-23 18:19   ` Amber Lin
2020-03-24 15:58     ` Amber Lin
2020-03-26 20:53 ` Felix Kuehling
2020-04-01 13:10   ` Amber Lin
2020-04-01 20:54     ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.