All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4] drm/amdkfd: Provide SMI events watch
@ 2020-04-14 21:30 Amber Lin
  2020-04-14 23:04 ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Amber Lin @ 2020-04-14 21:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: Amber Lin

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register devices and subscribe events they are interested. After
registered, the user can use annoymous file descriptor's poll function
with wait-time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
    - correct kfifo usage
    - move event message API to kfd_ioctl.h
v3: send the event msg in text than in binary
v4: support multiple clients

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile              |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  43 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c          |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 235 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  33 ++++
 include/uapi/linux/kfd_ioctl.h                   |  35 +++-
 9 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_dbgdev.o \
 		$(AMDKFD_PATH)/kfd_dbgmgr.o \
+		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 		struct kfd_vm_fault_info info;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 
 		memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..f13fde59 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 	return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_smi_events_args *args = data;
+	uint32_t *ids_array;
+	int ret = 0;
+
+	switch (args->op) {
+	case KFD_SMI_EVENTS_REGISTER:
+		ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t),
+					  GFP_KERNEL);
+		if (!ids_array)
+			return -ENOMEM;
+		if (copy_from_user(ids_array,
+				  (void __user *)args->gpuids_array_ptr,
+				  args->num_gpuids * sizeof(uint32_t))) {
+			kfree(ids_array);
+			return -EFAULT;
+		}
+
+		ret = kfd_smi_event_register(args->num_gpuids, ids_array,
+					     &args->anon_fd, &args->client_id);
+		if (ret)
+			kfree(ids_array);
+
+		return ret;
+
+	case KFD_SMI_EVENTS_ENABLE:
+		/* subscribe events */
+		return kfd_smi_event_enable(args->client_id, args->events);
+	case KFD_SMI_EVENTS_DISABLE:
+		/* unsubscribe events */
+		return kfd_smi_event_disable(args->client_id, args->events);
+	}
+
+	return -EINVAL;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1867,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
 			kfd_ioctl_alloc_queue_gws, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+			kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e05d75e..151e83e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_smi_events.h"
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index f4b7f7e..1aa96a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -62,6 +62,7 @@ static int kfd_init(void)
 	kfd_procfs_init();
 
 	kfd_debugfs_init();
+	kfd_smi_event_init();
 
 	return 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 43b888b..3cdff5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1087,4 +1087,7 @@ static inline void kfd_debugfs_fini(void) {}
 
 #endif
 
+/* SMI events */
+void kfd_smi_event_init(void);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
new file mode 100644
index 0000000..3e94f94
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_vm.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
+
+struct kfd_smi_client {
+	uint32_t id;
+	struct list_head list;
+	struct kfifo fifo;
+	wait_queue_head_t wait_queue;
+	/* events enabled */
+	uint64_t enabled;
+	/* devices in watching */
+	uint32_t num_devs;
+	uint32_t *dev_ids;
+};
+static LIST_HEAD(clients);
+static spinlock_t clients_lock;
+static uint32_t client_id;
+
+static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
+static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
+static int kfd_smi_ev_release(struct inode *, struct file *);
+
+static const char kfd_smi_name[] = "kfd_smi_ev";
+
+static const struct file_operations kfd_smi_ev_fops = {
+	.owner = THIS_MODULE,
+	.poll = kfd_smi_ev_poll,
+	.read = kfd_smi_ev_read,
+	.release = kfd_smi_ev_release
+};
+
+static __poll_t kfd_smi_ev_poll(struct file *filep,
+				struct poll_table_struct *wait)
+{
+	__poll_t mask;
+	struct kfd_smi_client *client = filep->private_data;
+
+	poll_wait(filep, &client->wait_queue, wait);
+
+	rcu_read_lock();
+	mask = kfifo_is_empty(&client->fifo) ? 0: POLLIN | POLLRDNORM;
+	rcu_read_unlock();
+
+	return mask;
+}
+
+static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
+			       size_t size, loff_t *offset)
+{
+	int ret, copied = 0;
+	struct kfd_smi_client *client = filep->private_data;
+
+	ret = kfifo_to_user(&client->fifo, user, size, &copied);
+	if (ret || !copied) {
+		pr_debug("smi-events: fail to send msg (%i) (%i)\n",
+			ret, copied);
+		return ret ? ret : -EAGAIN;
+	}
+
+	return copied;
+}
+
+static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
+{
+	struct kfd_smi_client *client = filep->private_data;
+
+	spin_lock(&clients_lock);
+	list_del_rcu(&client->list);
+	spin_unlock(&clients_lock);
+
+	synchronize_rcu();
+	kfifo_free(&client->fifo);
+	kfree(client->dev_ids);
+	kfree(client);
+
+	return 0;
+}
+
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+	struct amdgpu_task_info task_info;
+	/* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */
+	/* 16 bytes event + 1 byte space + 8 bytes gpuid + 1 byte space +
+	 * 25 bytes msg + 1 byte \n = 52
+	 */
+	char fifo_in[52];
+	struct kfd_smi_client *client;
+	int i;
+
+	if (list_empty(&clients))
+		return;
+
+	amdgpu_vm_get_task_info(adev, pasid, &task_info);
+	snprintf(fifo_in, 52, "%x %x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
+		dev->id, task_info.pid,task_info.task_name);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(client, &clients, list) {
+		if (!(client->enabled & KFD_SMI_EVENT_VMFAULT))
+			continue;
+		for (i = 0; i < client->num_devs; i++) {
+			if (client->dev_ids[i] != dev->id)
+				continue;
+			if (kfifo_avail(&client->fifo) < 52) {
+				pr_err("smi_event(vmfault): no space left\n");
+				rcu_read_unlock();
+				return;
+			}
+			kfifo_in(&client->fifo, fifo_in, sizeof(fifo_in));
+			wake_up_all(&client->wait_queue);
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+}
+
+static struct kfd_smi_client *get_client_by_id(uint32_t id)
+{
+	struct kfd_smi_client *client;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(client, &clients, list) {
+		if (client->id == id) {
+			rcu_read_unlock();
+			return client;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static int kfd_smi_event_update_events(uint32_t id, uint64_t events,
+				       bool is_enable)
+{
+	struct kfd_smi_client *client = get_client_by_id(id);
+
+	if (!client)
+		return -EINVAL;
+
+	if (is_enable)
+		client->enabled |= events;
+	else
+		client->enabled &= ~events;
+
+	return 0;
+}
+
+int kfd_smi_event_enable(uint32_t id, uint64_t events)
+{
+	return kfd_smi_event_update_events(id, events, true);
+}
+
+int kfd_smi_event_disable(uint32_t id, uint64_t events)
+{
+	return kfd_smi_event_update_events(id, events, false);
+}
+
+int kfd_smi_event_register(uint32_t num_devs, uint32_t *dev_ids, uint32_t *fd,
+			   uint32_t *id)
+{
+	struct kfd_smi_client *client;
+	int ret;
+
+	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&client->list);
+
+	ret = kfifo_alloc(&client->fifo, KFD_SMI_MAX_EVENT_MSG * 16,
+			 GFP_KERNEL);
+	if (ret) {
+		kfree(client);
+		return ret;
+	}
+
+	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
+			       0);
+	if (ret < 0) {
+		kfifo_free(&client->fifo);
+		kfree(client);
+		*fd = 0;
+		return ret;
+	}
+	*fd = ret;
+
+	init_waitqueue_head(&client->wait_queue);
+	client->enabled = 0;
+	client->num_devs = num_devs;
+	client->dev_ids = dev_ids;
+	/* client id is used to identify the client in enable/disable_events */
+	*id = client->id = ++client_id;
+
+	spin_lock(&clients_lock);
+	list_add_rcu(&client->list, &clients);
+	spin_unlock(&clients_lock);
+
+	return 0;
+}
+
+void kfd_smi_event_init(void)
+{
+	INIT_LIST_HEAD(&clients);
+	spin_lock_init(&clients_lock);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
new file mode 100644
index 0000000..a4f9e92
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_SMI_EVENTS_H_INCLUDED
+#define KFD_SMI_EVENTS_H_INCLUDED
+
+void kfd_smi_event_init(void);
+int kfd_smi_event_register(uint32_t num_devs, uint32_t *dev_ids, uint32_t *fd,
+			   uint32_t *id);
+int kfd_smi_event_enable(uint32_t id, uint64_t events);
+int kfd_smi_event_disable(uint32_t id, uint64_t events);
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+
+#endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 4f66764..8146437 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/*
+ * KFD SMI(System Management Interface) events
+ */
+enum kfd_smi_events_op {
+	KFD_SMI_EVENTS_REGISTER = 1,
+	KFD_SMI_EVENTS_ENABLE,
+	KFD_SMI_EVENTS_DISABLE
+};
+
+/* Event type (defined by bitmask) */
+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
+
+struct kfd_ioctl_smi_events_args {
+	__u32 op;		/* to KFD */
+	__u64 events;		/* to KFD */
+	__u64 gpuids_array_ptr;	/* to KFD */
+	__u32 num_gpuids;	/* to KFD */
+	__u32 anon_fd;		/* from KFD */
+	__u32 client_id;	/* to/from KFD */
+};
+
+/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
+ *    (hex)gpuid(8) + space(1) =  26 bytes
+ * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
+ *    When a new event msg uses more memory, change the calculation here.
+ * 3. End with \n(1)
+ * 26 + 25 + 1 = 52
+ */
+#define KFD_SMI_MAX_EVENT_MSG 52
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -546,7 +576,10 @@ enum kfd_mmio_remap {
 #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
 		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
 
+#define AMDKFD_IOC_SMI_EVENTS			\
+		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1F
+#define AMDKFD_COMMAND_END		0x20
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-14 21:30 [PATCH v4] drm/amdkfd: Provide SMI events watch Amber Lin
@ 2020-04-14 23:04 ` Felix Kuehling
  2020-04-15  0:09   ` Lin, Amber
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2020-04-14 23:04 UTC (permalink / raw)
  To: Amber Lin, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 4294 bytes --]

Hi Amber,

Some general remarks about the multi-client support. You added a global
client id that's separate from the file descriptor. That's problematic
for two reasons:

 1. A process could change a different process' event mask
 2. The FD should already be unique per process, no need to invent
    another ID

If we want to allow one process to register for events multiple times
(multiple FDs per process), then the list of clients should be per
process. Each process should only be allowed to change the event masks
of its own clients. The client could be identified by its FD. No need
for another client ID.

But you could also simplify it further by allowing only one event client
per process. Then you don't need the client ID lookup at all. Just have
a single event client in the kfd_process.

Another approach would be to make enable/disable functions of the event
FD, rather than the KFD FD ioctl. It could be an ioctl of the event FD,
or even simpler, you could use the write file-operation to write an
event mask (of arbitrary length if you want to enable growth in the
future). That way everything would be neatly encapsulated in the event
FD private data.

Two more comments inline ...


Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnostic what
> went wrong. This patch provides an event watch interface for the user
> space to register devices and subscribe events they are interested. After
> registered, the user can use annoymous file descriptor's poll function
> with wait-time specified and wait for events to happen. Once an event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>     - correct kfifo usage
>     - move event message API to kfd_ioctl.h
> v3: send the event msg in text than in binary
> v4: support multiple clients
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>

[snip]

> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 4f66764..8146437 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
>  	__u32 dmabuf_fd;	/* to KFD */
>  };
>  
> +/*
> + * KFD SMI(System Management Interface) events
> + */
> +enum kfd_smi_events_op {
> +	KFD_SMI_EVENTS_REGISTER = 1,
> +	KFD_SMI_EVENTS_ENABLE,
> +	KFD_SMI_EVENTS_DISABLE
> +};
> +
> +/* Event type (defined by bitmask) */
> +#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +
> +struct kfd_ioctl_smi_events_args {
> +	__u32 op;		/* to KFD */
> +	__u64 events;		/* to KFD */

The binary layout of the ioctl args structure should be the same on
32/64-bit. That means the 64-bit members should be 64-bit aligned. The
best way to ensure this is to put all the 64-bit members first.


> +	__u64 gpuids_array_ptr;	/* to KFD */
> +	__u32 num_gpuids;	/* to KFD */
> +	__u32 anon_fd;		/* from KFD */
> +	__u32 client_id;	/* to/from KFD */
> +};
> +
> +/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
> + *    (hex)gpuid(8) + space(1) =  26 bytes
> + * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
> + *    When a new event msg uses more memory, change the calculation here.
> + * 3. End with \n(1)
> + * 26 + 25 + 1 = 52
> + */
> +#define KFD_SMI_MAX_EVENT_MSG 52

If you define the maximum message length here, clients may start
depending on it, and it gets harder to change it later. I'd not define
this in the API header. It's not necessary to write correct clients. And
if used badly, it may encourage writing incorrect clients that break
with longer messages in the future.

Regards,
  Felix


> +
>  /* Register offset inside the remapped mmio page
>   */
>  enum kfd_mmio_remap {
> @@ -546,7 +576,10 @@ enum kfd_mmio_remap {
>  #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
>  		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>  
> +#define AMDKFD_IOC_SMI_EVENTS			\
> +		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
> +
>  #define AMDKFD_COMMAND_START		0x01
> -#define AMDKFD_COMMAND_END		0x1F
> +#define AMDKFD_COMMAND_END		0x20
>  
>  #endif

[-- Attachment #1.2: Type: text/html, Size: 5577 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-14 23:04 ` Felix Kuehling
@ 2020-04-15  0:09   ` Lin, Amber
  2020-04-15  2:40     ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Lin, Amber @ 2020-04-15  0:09 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 5906 bytes --]

[AMD Official Use Only - Internal Distribution Only]

Hi Felix,

That was my assumption too that each registration will get different file descriptor, but it turns out not. When I started two process and both register gpu0 and gpu1, they both got fd=15. If I have process A register gpu0+gpu1, and process B only register gpu0, process A gets fd=15 and process B gets fd=9. That's why I added client ID.

By multiple clients, I mean multiple processes. The ask is users want to have multiple tools and those different tools can use rsmi lib to watch events at the same time. Due to the reason above that two processes can actually get the same fd and I need to add client ID to distinguish the registration, I don't see the point of limiting one registration per process unless I use pid to distinguish the client instead, which was in my consideration too when I was writing the code. But second thought is why adding this restriction when client ID can allow the tool to watch different events on different devices if they want to. Maybe client ID is a bad term and it misleads you. I should call it register ID.

Regards,
Amber

From: Kuehling, Felix <Felix.Kuehling@amd.com>
Sent: Tuesday, April 14, 2020 7:04 PM
To: Lin, Amber <Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch


Hi Amber,

Some general remarks about the multi-client support. You added a global client id that's separate from the file descriptor. That's problematic for two reasons:

  1.  A process could change a different process' event mask
  2.  The FD should already be unique per process, no need to invent another ID

If we want to allow one process to register for events multiple times (multiple FDs per process), then the list of clients should be per process. Each process should only be allowed to change the event masks of its own clients. The client could be identified by its FD. No need for another client ID.

But you could also simplify it further by allowing only one event client per process. Then you don't need the client ID lookup at all. Just have a single event client in the kfd_process.

Another approach would be to make enable/disable functions of the event FD, rather than the KFD FD ioctl. It could be an ioctl of the event FD, or even simpler, you could use the write file-operation to write an event mask (of arbitrary length if you want to enable growth in the future). That way everything would be neatly encapsulated in the event FD private data.

Two more comments inline ...


Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:

When the compute is malfunctioning or performance drops, the system admin

will use SMI (System Management Interface) tool to monitor/diagnostic what

went wrong. This patch provides an event watch interface for the user

space to register devices and subscribe events they are interested. After

registered, the user can use annoymous file descriptor's poll function

with wait-time specified and wait for events to happen. Once an event

happens, the user can use read() to retrieve information related to the

event.



VM fault event is done in this patch.



v2: - remove UNREGISTER and add event ENABLE/DISABLE

    - correct kfifo usage

    - move event message API to kfd_ioctl.h

v3: send the event msg in text than in binary

v4: support multiple clients



Signed-off-by: Amber Lin <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>

[snip]

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h

index 4f66764..8146437 100644

--- a/include/uapi/linux/kfd_ioctl.h

+++ b/include/uapi/linux/kfd_ioctl.h

@@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {

  __u32 dmabuf_fd;       /* to KFD */

 };



+/*

+ * KFD SMI(System Management Interface) events

+ */

+enum kfd_smi_events_op {

+ KFD_SMI_EVENTS_REGISTER = 1,

+ KFD_SMI_EVENTS_ENABLE,

+ KFD_SMI_EVENTS_DISABLE

+};

+

+/* Event type (defined by bitmask) */

+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001

+

+struct kfd_ioctl_smi_events_args {

+ __u32 op;              /* to KFD */

+ __u64 events;          /* to KFD */

The binary layout of the ioctl args structure should be the same on 32/64-bit. That means the 64-bit members should be 64-bit aligned. The best way to ensure this is to put all the 64-bit members first.




+ __u64 gpuids_array_ptr;        /* to KFD */

+ __u32 num_gpuids;      /* to KFD */

+ __u32 anon_fd;         /* from KFD */

+ __u32 client_id;       /* to/from KFD */

+};

+

+/* 1. All messages must start with (hex)uint64_event(16) + space(1) +

+ *    (hex)gpuid(8) + space(1) =  26 bytes

+ * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25

+ *    When a new event msg uses more memory, change the calculation here.

+ * 3. End with \n(1)

+ * 26 + 25 + 1 = 52

+ */

+#define KFD_SMI_MAX_EVENT_MSG 52

If you define the maximum message length here, clients may start depending on it, and it gets harder to change it later. I'd not define this in the API header. It's not necessary to write correct clients. And if used badly, it may encourage writing incorrect clients that break with longer messages in the future.

Regards,
  Felix





+

 /* Register offset inside the remapped mmio page

  */

 enum kfd_mmio_remap {

@@ -546,7 +576,10 @@ enum kfd_mmio_remap {

 #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \

         AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)



+#define AMDKFD_IOC_SMI_EVENTS                 \

+        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)

+

 #define AMDKFD_COMMAND_START           0x01

-#define AMDKFD_COMMAND_END             0x1F

+#define AMDKFD_COMMAND_END             0x20



 #endif

[-- Attachment #1.2: Type: text/html, Size: 13510 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-15  0:09   ` Lin, Amber
@ 2020-04-15  2:40     ` Felix Kuehling
  2020-04-15  3:03       ` Deucher, Alexander
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2020-04-15  2:40 UTC (permalink / raw)
  To: Lin, Amber, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7518 bytes --]

Hi Amber,

I understand that different processes can get the same FD. My statement
about FD being unique is relative to one process.

The main problem with the global client ID is, that it allows process A
to change the event mask of process B just by specifying process B's
client ID. That can lead to denial of service attacks where process A
can cause events not to be delivered to B or can flood process B with
frequent events that it's not prepared to handle.

Therefore you must make the lookup of the client from the client ID not
from a global list, but from a per-process list. That way process A can
only change event masks of process A clients, and not those of any other
process.

But if the client list is process-specific, you can use the FD as a
unique identifier of the client within the process, so you don't need a
separate client ID.

Regards,
  Felix

Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:
>
> [AMD Official Use Only - Internal Distribution Only]
>
>  
>
> Hi Felix,
>
>  
>
> That was my assumption too that each registration will get different
> file descriptor, but it turns out not. When I started two process and
> both register gpu0 and gpu1, they both got fd=15. If I have process A
> register gpu0+gpu1, and process B only register gpu0, process A gets
> fd=15 and process B gets fd=9. That’s why I added client ID.
>
>  
>
> By multiple clients, I mean multiple processes. The ask is users want
> to have multiple tools and those different tools can use rsmi lib to
> watch events at the same time. Due to the reason above that two
> processes can actually get the same fd and I need to add client ID to
> distinguish the registration, I don’t see the point of limiting one
> registration per process unless I use pid to distinguish the client
> instead, which was in my consideration too when I was writing the
> code. But second thought is why adding this restriction when client ID
> can allow the tool to watch different events on different devices if
> they want to. Maybe client ID is a bad term and it misleads you. I
> should call it register ID.
>
>  
>
> Regards,
>
> Amber
>
>  
>
> *From:* Kuehling, Felix <Felix.Kuehling@amd.com>
> *Sent:* Tuesday, April 14, 2020 7:04 PM
> *To:* Lin, Amber <Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org
> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>
>  
>
> Hi Amber,
>
> Some general remarks about the multi-client support. You added a
> global client id that's separate from the file descriptor. That's
> problematic for two reasons:
>
>  1. A process could change a different process' event mask
>  2. The FD should already be unique per process, no need to invent
>     another ID
>
> If we want to allow one process to register for events multiple times
> (multiple FDs per process), then the list of clients should be per
> process. Each process should only be allowed to change the event masks
> of its own clients. The client could be identified by its FD. No need
> for another client ID.
>
> But you could also simplify it further by allowing only one event
> client per process. Then you don't need the client ID lookup at all.
> Just have a single event client in the kfd_process.
>
> Another approach would be to make enable/disable functions of the
> event FD, rather than the KFD FD ioctl. It could be an ioctl of the
> event FD, or even simpler, you could use the write file-operation to
> write an event mask (of arbitrary length if you want to enable growth
> in the future). That way everything would be neatly encapsulated in
> the event FD private data.
>
> Two more comments inline ...
>
>  
>
> Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:
>
>     When the compute is malfunctioning or performance drops, the system admin
>
>     will use SMI (System Management Interface) tool to monitor/diagnostic what
>
>     went wrong. This patch provides an event watch interface for the user
>
>     space to register devices and subscribe events they are interested. After
>
>     registered, the user can use annoymous file descriptor's poll function
>
>     with wait-time specified and wait for events to happen. Once an event
>
>     happens, the user can use read() to retrieve information related to the
>
>     event.
>
>      
>
>     VM fault event is done in this patch.
>
>      
>
>     v2: - remove UNREGISTER and add event ENABLE/DISABLE
>
>         - correct kfifo usage
>
>         - move event message API to kfd_ioctl.h
>
>     v3: send the event msg in text than in binary
>
>     v4: support multiple clients
>
>      
>
>     Signed-off-by: Amber Lin <Amber.Lin@amd.com> <mailto:Amber.Lin@amd.com>
>
> [snip]
>
>     diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>
>     index 4f66764..8146437 100644
>
>     --- a/include/uapi/linux/kfd_ioctl.h
>
>     +++ b/include/uapi/linux/kfd_ioctl.h
>
>     @@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
>
>       __u32 dmabuf_fd;       /* to KFD */
>
>      };
>
>      
>
>     +/*
>
>     + * KFD SMI(System Management Interface) events
>
>     + */
>
>     +enum kfd_smi_events_op {
>
>     + KFD_SMI_EVENTS_REGISTER = 1,
>
>     + KFD_SMI_EVENTS_ENABLE,
>
>     + KFD_SMI_EVENTS_DISABLE
>
>     +};
>
>     +
>
>     +/* Event type (defined by bitmask) */
>
>     +#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>
>     +
>
>     +struct kfd_ioctl_smi_events_args {
>
>     + __u32 op;              /* to KFD */
>
>     + __u64 events;          /* to KFD */
>
> The binary layout of the ioctl args structure should be the same on
> 32/64-bit. That means the 64-bit members should be 64-bit aligned. The
> best way to ensure this is to put all the 64-bit members first.
>
>  
>
>      
>
>     + __u64 gpuids_array_ptr;        /* to KFD */
>
>     + __u32 num_gpuids;      /* to KFD */
>
>     + __u32 anon_fd;         /* from KFD */
>
>     + __u32 client_id;       /* to/from KFD */
>
>     +};
>
>     +
>
>     +/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
>
>     + *    (hex)gpuid(8) + space(1) =  26 bytes
>
>     + * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
>
>     + *    When a new event msg uses more memory, change the calculation here.
>
>     + * 3. End with \n(1)
>
>     + * 26 + 25 + 1 = 52
>
>     + */
>
>     +#define KFD_SMI_MAX_EVENT_MSG 52
>
> If you define the maximum message length here, clients may start
> depending on it, and it gets harder to change it later. I'd not define
> this in the API header. It's not necessary to write correct clients.
> And if used badly, it may encourage writing incorrect clients that
> break with longer messages in the future.
>
> Regards,
>   Felix
>
>  
>
>      
>
>     +
>
>      /* Register offset inside the remapped mmio page
>
>       */
>
>      enum kfd_mmio_remap {
>
>     @@ -546,7 +576,10 @@ enum kfd_mmio_remap {
>
>      #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \
>
>              AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>
>      
>
>     +#define AMDKFD_IOC_SMI_EVENTS                 \
>
>     +        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>
>     +
>
>      #define AMDKFD_COMMAND_START           0x01
>
>     -#define AMDKFD_COMMAND_END             0x1F
>
>     +#define AMDKFD_COMMAND_END             0x20
>
>      
>
>      #endif
>

[-- Attachment #1.2: Type: text/html, Size: 16244 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-15  2:40     ` Felix Kuehling
@ 2020-04-15  3:03       ` Deucher, Alexander
  2020-04-15 13:36         ` Amber Lin
  0 siblings, 1 reply; 9+ messages in thread
From: Deucher, Alexander @ 2020-04-15  3:03 UTC (permalink / raw)
  To: Kuehling, Felix, Lin, Amber, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7463 bytes --]

[AMD Public Use]

Some good advice on getting ioctls right:
https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html

Alex

________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Felix Kuehling <felix.kuehling@amd.com>
Sent: Tuesday, April 14, 2020 10:40 PM
To: Lin, Amber <Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch


Hi Amber,

I understand that different processes can get the same FD. My statement about FD being unique is relative to one process.

The main problem with the global client ID is, that it allows process A to change the event mask of process B just by specifying process B's client ID. That can lead to denial of service attacks where process A can cause events not to be delivered to B or can flood process B with frequent events that it's not prepared to handle.

Therefore you must make the lookup of the client from the client ID not from a global list, but from a per-process list. That way process A can only change event masks of process A clients, and not those of any other process.

But if the client list is process-specific, you can use the FD as a unique identifier of the client within the process, so you don't need a separate client ID.

Regards,
  Felix

Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:

[AMD Official Use Only - Internal Distribution Only]



Hi Felix,



That was my assumption too that each registration will get different file descriptor, but it turns out not. When I started two process and both register gpu0 and gpu1, they both got fd=15. If I have process A register gpu0+gpu1, and process B only register gpu0, process A gets fd=15 and process B gets fd=9. That’s why I added client ID.



By multiple clients, I mean multiple processes. The ask is users want to have multiple tools and those different tools can use rsmi lib to watch events at the same time. Due to the reason above that two processes can actually get the same fd and I need to add client ID to distinguish the registration, I don’t see the point of limiting one registration per process unless I use pid to distinguish the client instead, which was in my consideration too when I was writing the code. But second thought is why adding this restriction when client ID can allow the tool to watch different events on different devices if they want to. Maybe client ID is a bad term and it misleads you. I should call it register ID.



Regards,

Amber



From: Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>
Sent: Tuesday, April 14, 2020 7:04 PM
To: Lin, Amber <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch



Hi Amber,

Some general remarks about the multi-client support. You added a global client id that's separate from the file descriptor. That's problematic for two reasons:

  1.  A process could change a different process' event mask
  2.  The FD should already be unique per process, no need to invent another ID

If we want to allow one process to register for events multiple times (multiple FDs per process), then the list of clients should be per process. Each process should only be allowed to change the event masks of its own clients. The client could be identified by its FD. No need for another client ID.

But you could also simplify it further by allowing only one event client per process. Then you don't need the client ID lookup at all. Just have a single event client in the kfd_process.

Another approach would be to make enable/disable functions of the event FD, rather than the KFD FD ioctl. It could be an ioctl of the event FD, or even simpler, you could use the write file-operation to write an event mask (of arbitrary length if you want to enable growth in the future). That way everything would be neatly encapsulated in the event FD private data.

Two more comments inline ...



Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:

When the compute is malfunctioning or performance drops, the system admin

will use SMI (System Management Interface) tool to monitor/diagnostic what

went wrong. This patch provides an event watch interface for the user

space to register devices and subscribe events they are interested. After

registered, the user can use annoymous file descriptor's poll function

with wait-time specified and wait for events to happen. Once an event

happens, the user can use read() to retrieve information related to the

event.



VM fault event is done in this patch.



v2: - remove UNREGISTER and add event ENABLE/DISABLE

    - correct kfifo usage

    - move event message API to kfd_ioctl.h

v3: send the event msg in text than in binary

v4: support multiple clients



Signed-off-by: Amber Lin <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>

[snip]

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h

index 4f66764..8146437 100644

--- a/include/uapi/linux/kfd_ioctl.h

+++ b/include/uapi/linux/kfd_ioctl.h

@@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {

  __u32 dmabuf_fd;       /* to KFD */

 };



+/*

+ * KFD SMI(System Management Interface) events

+ */

+enum kfd_smi_events_op {

+ KFD_SMI_EVENTS_REGISTER = 1,

+ KFD_SMI_EVENTS_ENABLE,

+ KFD_SMI_EVENTS_DISABLE

+};

+

+/* Event type (defined by bitmask) */

+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001

+

+struct kfd_ioctl_smi_events_args {

+ __u32 op;              /* to KFD */

+ __u64 events;          /* to KFD */

The binary layout of the ioctl args structure should be the same on 32/64-bit. That means the 64-bit members should be 64-bit aligned. The best way to ensure this is to put all the 64-bit members first.





+ __u64 gpuids_array_ptr;        /* to KFD */

+ __u32 num_gpuids;      /* to KFD */

+ __u32 anon_fd;         /* from KFD */

+ __u32 client_id;       /* to/from KFD */

+};

+

+/* 1. All messages must start with (hex)uint64_event(16) + space(1) +

+ *    (hex)gpuid(8) + space(1) =  26 bytes

+ * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25

+ *    When a new event msg uses more memory, change the calculation here.

+ * 3. End with \n(1)

+ * 26 + 25 + 1 = 52

+ */

+#define KFD_SMI_MAX_EVENT_MSG 52

If you define the maximum message length here, clients may start depending on it, and it gets harder to change it later. I'd not define this in the API header. It's not necessary to write correct clients. And if used badly, it may encourage writing incorrect clients that break with longer messages in the future.

Regards,
  Felix





+

 /* Register offset inside the remapped mmio page

  */

 enum kfd_mmio_remap {

@@ -546,7 +576,10 @@ enum kfd_mmio_remap {

 #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \

         AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)



+#define AMDKFD_IOC_SMI_EVENTS                 \

+        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)

+

 #define AMDKFD_COMMAND_START           0x01

-#define AMDKFD_COMMAND_END             0x1F

+#define AMDKFD_COMMAND_END             0x20



 #endif

[-- Attachment #1.2: Type: text/html, Size: 13054 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-15  3:03       ` Deucher, Alexander
@ 2020-04-15 13:36         ` Amber Lin
  2020-04-15 13:48           ` Deucher, Alexander
  0 siblings, 1 reply; 9+ messages in thread
From: Amber Lin @ 2020-04-15 13:36 UTC (permalink / raw)
  To: Deucher, Alexander, Kuehling, Felix, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8933 bytes --]

Thank you Felix. Now I understand the problem of global client ID is 
leaking a hole for potential attackers. I didn't take that into 
consideration. I'll change that following your advice below.

Hi Alex,

Thank you for the link. It's helpful. I have a question regarding the 
versioning. One topic in the article talks about how the userspace can 
figure out if the new ioctl is supported in a given kernel. Is it 
correct that with dkms driver, we use the driver version coming from 
AMDGPU_VERSION in amdgpu_drv.c, and in upstream kernel we use the kernel 
version?

Thanks.

Amber

On 2020-04-14 11:03 p.m., Deucher, Alexander wrote:
>
> [AMD Public Use]
>
>
> Some good advice on getting ioctls right:
> https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html
>
> Alex
>
> ------------------------------------------------------------------------
> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of 
> Felix Kuehling <felix.kuehling@amd.com>
> *Sent:* Tuesday, April 14, 2020 10:40 PM
> *To:* Lin, Amber <Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>
> Hi Amber,
>
> I understand that different processes can get the same FD. My 
> statement about FD being unique is relative to one process.
>
> The main problem with the global client ID is, that it allows process 
> A to change the event mask of process B just by specifying process B's 
> client ID. That can lead to denial of service attacks where process A 
> can cause events not to be delivered to B or can flood process B with 
> frequent events that it's not prepared to handle.
>
> Therefore you must make the lookup of the client from the client ID 
> not from a global list, but from a per-process list. That way process 
> A can only change event masks of process A clients, and not those of 
> any other process.
>
> But if the client list is process-specific, you can use the FD as a 
> unique identifier of the client within the process, so you don't need 
> a separate client ID.
>
> Regards,
>   Felix
>
> Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:
>>
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi Felix,
>>
>> That was my assumption too that each registration will get different 
>> file descriptor, but it turns out not. When I started two process and 
>> both register gpu0 and gpu1, they both got fd=15. If I have process A 
>> register gpu0+gpu1, and process B only register gpu0, process A gets 
>> fd=15 and process B gets fd=9. That’s why I added client ID.
>>
>> By multiple clients, I mean multiple processes. The ask is users want 
>> to have multiple tools and those different tools can use rsmi lib to 
>> watch events at the same time. Due to the reason above that two 
>> processes can actually get the same fd and I need to add client ID to 
>> distinguish the registration, I don’t see the point of limiting one 
>> registration per process unless I use pid to distinguish the client 
>> instead, which was in my consideration too when I was writing the 
>> code. But second thought is why adding this restriction when client 
>> ID can allow the tool to watch different events on different devices 
>> if they want to. Maybe client ID is a bad term and it misleads you. I 
>> should call it register ID.
>>
>> Regards,
>>
>> Amber
>>
>> *From:* Kuehling, Felix <Felix.Kuehling@amd.com> 
>> <mailto:Felix.Kuehling@amd.com>
>> *Sent:* Tuesday, April 14, 2020 7:04 PM
>> *To:* Lin, Amber <Amber.Lin@amd.com> <mailto:Amber.Lin@amd.com>; 
>> amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>>
>> Hi Amber,
>>
>> Some general remarks about the multi-client support. You added a 
>> global client id that's separate from the file descriptor. That's 
>> problematic for two reasons:
>>
>>  1. A process could change a different process' event mask
>>  2. The FD should already be unique per process, no need to invent
>>     another ID
>>
>> If we want to allow one process to register for events multiple times 
>> (multiple FDs per process), then the list of clients should be per 
>> process. Each process should only be allowed to change the event 
>> masks of its own clients. The client could be identified by its FD. 
>> No need for another client ID.
>>
>> But you could also simplify it further by allowing only one event 
>> client per process. Then you don't need the client ID lookup at all. 
>> Just have a single event client in the kfd_process.
>>
>> Another approach would be to make enable/disable functions of the 
>> event FD, rather than the KFD FD ioctl. It could be an ioctl of the 
>> event FD, or even simpler, you could use the write file-operation to 
>> write an event mask (of arbitrary length if you want to enable growth 
>> in the future). That way everything would be neatly encapsulated in 
>> the event FD private data.
>>
>> Two more comments inline ...
>>
>> Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:
>>
>>     When the compute is malfunctioning or performance drops, the system admin
>>
>>     will use SMI (System Management Interface) tool to monitor/diagnostic what
>>
>>     went wrong. This patch provides an event watch interface for the user
>>
>>     space to register devices and subscribe events they are interested. After
>>
>>     registered, the user can use annoymous file descriptor's poll function
>>
>>     with wait-time specified and wait for events to happen. Once an event
>>
>>     happens, the user can use read() to retrieve information related to the
>>
>>     event.
>>
>>       
>>
>>     VM fault event is done in this patch.
>>
>>       
>>
>>     v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>
>>          - correct kfifo usage
>>
>>          - move event message API to kfd_ioctl.h
>>
>>     v3: send the event msg in text than in binary
>>
>>     v4: support multiple clients
>>
>>       
>>
>>     Signed-off-by: Amber Lin<Amber.Lin@amd.com>  <mailto:Amber.Lin@amd.com>
>>
>> [snip]
>>
>>     diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>>
>>     index 4f66764..8146437 100644
>>
>>     --- a/include/uapi/linux/kfd_ioctl.h
>>
>>     +++ b/include/uapi/linux/kfd_ioctl.h
>>
>>     @@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
>>
>>        __u32 dmabuf_fd;       /* to KFD */
>>
>>       };
>>
>>       
>>
>>     +/*
>>
>>     + * KFD SMI(System Management Interface) events
>>
>>     + */
>>
>>     +enum kfd_smi_events_op {
>>
>>     + KFD_SMI_EVENTS_REGISTER = 1,
>>
>>     + KFD_SMI_EVENTS_ENABLE,
>>
>>     + KFD_SMI_EVENTS_DISABLE
>>
>>     +};
>>
>>     +
>>
>>     +/* Event type (defined by bitmask) */
>>
>>     +#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>>
>>     +
>>
>>     +struct kfd_ioctl_smi_events_args {
>>
>>     + __u32 op;              /* to KFD */
>>
>>     + __u64 events;          /* to KFD */
>>
>> The binary layout of the ioctl args structure should be the same on 
>> 32/64-bit. That means the 64-bit members should be 64-bit aligned. 
>> The best way to ensure this is to put all the 64-bit members first.
>>
>>       
>>
>>     + __u64 gpuids_array_ptr;        /* to KFD */
>>
>>     + __u32 num_gpuids;      /* to KFD */
>>
>>     + __u32 anon_fd;         /* from KFD */
>>
>>     + __u32 client_id;       /* to/from KFD */
>>
>>     +};
>>
>>     +
>>
>>     +/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
>>
>>     + *    (hex)gpuid(8) + space(1) =  26 bytes
>>
>>     + * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
>>
>>     + *    When a new event msg uses more memory, change the calculation here.
>>
>>     + * 3. End with \n(1)
>>
>>     + * 26 + 25 + 1 = 52
>>
>>     + */
>>
>>     +#define KFD_SMI_MAX_EVENT_MSG 52
>>
>> If you define the maximum message length here, clients may start 
>> depending on it, and it gets harder to change it later. I'd not 
>> define this in the API header. It's not necessary to write correct 
>> clients. And if used badly, it may encourage writing incorrect 
>> clients that break with longer messages in the future.
>>
>> Regards,
>>   Felix
>>
>>       
>>
>>     +
>>
>>       /* Register offset inside the remapped mmio page
>>
>>        */
>>
>>       enum kfd_mmio_remap {
>>
>>     @@ -546,7 +576,10 @@ enum kfd_mmio_remap {
>>
>>       #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \
>>
>>               AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>
>>       
>>
>>     +#define AMDKFD_IOC_SMI_EVENTS                 \
>>
>>     +        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>
>>     +
>>
>>       #define AMDKFD_COMMAND_START           0x01
>>
>>     -#define AMDKFD_COMMAND_END             0x1F
>>
>>     +#define AMDKFD_COMMAND_END             0x20
>>
>>       
>>
>>       #endif
>>


[-- Attachment #1.2: Type: text/html, Size: 17856 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-15 13:36         ` Amber Lin
@ 2020-04-15 13:48           ` Deucher, Alexander
  2020-04-15 15:26             ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Deucher, Alexander @ 2020-04-15 13:48 UTC (permalink / raw)
  To: Lin, Amber, Kuehling, Felix, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 8785 bytes --]

[AMD Public Use]

We use the drm major/minor in all cases.  Bump  KMS_DRIVER_MINOR in amdgpu_drv.c and add a note about what was added in the comment.

Alex
________________________________
From: Lin, Amber <Amber.Lin@amd.com>
Sent: Wednesday, April 15, 2020 9:36 AM
To: Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix <Felix.Kuehling@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch

Thank you Felix. Now I understand the problem of global client ID is leaking a hole for potential attackers. I didn't take that into consideration. I'll change that following your advice below.

Hi Alex,

Thank you for the link. It's helpful. I have a question regarding the versioning. One topic in the article talks about how the userspace can figure out if the new ioctl is supported in a given kernel. Is it correct that with dkms driver, we use the driver version coming from AMDGPU_VERSION in amdgpu_drv.c, and in upstream kernel we use the kernel version?

Thanks.

Amber

On 2020-04-14 11:03 p.m., Deucher, Alexander wrote:

[AMD Public Use]

Some good advice on getting ioctls right:
https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html

Alex

________________________________
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org><mailto:amd-gfx-bounces@lists.freedesktop.org> on behalf of Felix Kuehling <felix.kuehling@amd.com><mailto:felix.kuehling@amd.com>
Sent: Tuesday, April 14, 2020 10:40 PM
To: Lin, Amber <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> <amd-gfx@lists.freedesktop.org><mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch


Hi Amber,

I understand that different processes can get the same FD. My statement about FD being unique is relative to one process.

The main problem with the global client ID is, that it allows process A to change the event mask of process B just by specifying process B's client ID. That can lead to denial of service attacks where process A can cause events not to be delivered to B or can flood process B with frequent events that it's not prepared to handle.

Therefore you must make the lookup of the client from the client ID not from a global list, but from a per-process list. That way process A can only change event masks of process A clients, and not those of any other process.

But if the client list is process-specific, you can use the FD as a unique identifier of the client within the process, so you don't need a separate client ID.

Regards,
  Felix

Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:

[AMD Official Use Only - Internal Distribution Only]



Hi Felix,



That was my assumption too that each registration will get different file descriptor, but it turns out not. When I started two process and both register gpu0 and gpu1, they both got fd=15. If I have process A register gpu0+gpu1, and process B only register gpu0, process A gets fd=15 and process B gets fd=9. That’s why I added client ID.



By multiple clients, I mean multiple processes. The ask is users want to have multiple tools and those different tools can use rsmi lib to watch events at the same time. Due to the reason above that two processes can actually get the same fd and I need to add client ID to distinguish the registration, I don’t see the point of limiting one registration per process unless I use pid to distinguish the client instead, which was in my consideration too when I was writing the code. But second thought is why adding this restriction when client ID can allow the tool to watch different events on different devices if they want to. Maybe client ID is a bad term and it misleads you. I should call it register ID.



Regards,

Amber



From: Kuehling, Felix <Felix.Kuehling@amd.com><mailto:Felix.Kuehling@amd.com>
Sent: Tuesday, April 14, 2020 7:04 PM
To: Lin, Amber <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH v4] drm/amdkfd: Provide SMI events watch



Hi Amber,

Some general remarks about the multi-client support. You added a global client id that's separate from the file descriptor. That's problematic for two reasons:

  1.  A process could change a different process' event mask
  2.  The FD should already be unique per process, no need to invent another ID

If we want to allow one process to register for events multiple times (multiple FDs per process), then the list of clients should be per process. Each process should only be allowed to change the event masks of its own clients. The client could be identified by its FD. No need for another client ID.

But you could also simplify it further by allowing only one event client per process. Then you don't need the client ID lookup at all. Just have a single event client in the kfd_process.

Another approach would be to make enable/disable functions of the event FD, rather than the KFD FD ioctl. It could be an ioctl of the event FD, or even simpler, you could use the write file-operation to write an event mask (of arbitrary length if you want to enable growth in the future). That way everything would be neatly encapsulated in the event FD private data.

Two more comments inline ...



Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:

When the compute is malfunctioning or performance drops, the system admin

will use SMI (System Management Interface) tool to monitor/diagnostic what

went wrong. This patch provides an event watch interface for the user

space to register devices and subscribe events they are interested. After

registered, the user can use annoymous file descriptor's poll function

with wait-time specified and wait for events to happen. Once an event

happens, the user can use read() to retrieve information related to the

event.



VM fault event is done in this patch.



v2: - remove UNREGISTER and add event ENABLE/DISABLE

    - correct kfifo usage

    - move event message API to kfd_ioctl.h

v3: send the event msg in text than in binary

v4: support multiple clients



Signed-off-by: Amber Lin <Amber.Lin@amd.com><mailto:Amber.Lin@amd.com>

[snip]

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h

index 4f66764..8146437 100644

--- a/include/uapi/linux/kfd_ioctl.h

+++ b/include/uapi/linux/kfd_ioctl.h

@@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {

  __u32 dmabuf_fd;       /* to KFD */

 };



+/*

+ * KFD SMI(System Management Interface) events

+ */

+enum kfd_smi_events_op {

+ KFD_SMI_EVENTS_REGISTER = 1,

+ KFD_SMI_EVENTS_ENABLE,

+ KFD_SMI_EVENTS_DISABLE

+};

+

+/* Event type (defined by bitmask) */

+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001

+

+struct kfd_ioctl_smi_events_args {

+ __u32 op;              /* to KFD */

+ __u64 events;          /* to KFD */

The binary layout of the ioctl args structure should be the same on 32/64-bit. That means the 64-bit members should be 64-bit aligned. The best way to ensure this is to put all the 64-bit members first.





+ __u64 gpuids_array_ptr;        /* to KFD */

+ __u32 num_gpuids;      /* to KFD */

+ __u32 anon_fd;         /* from KFD */

+ __u32 client_id;       /* to/from KFD */

+};

+

+/* 1. All messages must start with (hex)uint64_event(16) + space(1) +

+ *    (hex)gpuid(8) + space(1) =  26 bytes

+ * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25

+ *    When a new event msg uses more memory, change the calculation here.

+ * 3. End with \n(1)

+ * 26 + 25 + 1 = 52

+ */

+#define KFD_SMI_MAX_EVENT_MSG 52

If you define the maximum message length here, clients may start depending on it, and it gets harder to change it later. I'd not define this in the API header. It's not necessary to write correct clients. And if used badly, it may encourage writing incorrect clients that break with longer messages in the future.

Regards,
  Felix





+

 /* Register offset inside the remapped mmio page

  */

 enum kfd_mmio_remap {

@@ -546,7 +576,10 @@ enum kfd_mmio_remap {

 #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \

         AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)



+#define AMDKFD_IOC_SMI_EVENTS                 \

+        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)

+

 #define AMDKFD_COMMAND_START           0x01

-#define AMDKFD_COMMAND_END             0x1F

+#define AMDKFD_COMMAND_END             0x20



 #endif


[-- Attachment #1.2: Type: text/html, Size: 15574 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
  2020-04-15 13:48           ` Deucher, Alexander
@ 2020-04-15 15:26             ` Felix Kuehling
  0 siblings, 0 replies; 9+ messages in thread
From: Felix Kuehling @ 2020-04-15 15:26 UTC (permalink / raw)
  To: Deucher, Alexander, Lin, Amber, amd-gfx


[-- Attachment #1.1: Type: text/plain, Size: 10715 bytes --]


Am 2020-04-15 um 9:48 a.m. schrieb Deucher, Alexander:
>
> [AMD Public Use]
>
>
> We use the drm major/minor in all cases.  Bump  KMS_DRIVER_MINOR in
> amdgpu_drv.c and add a note about what was added in the comment.

The KFD ioctl API has its own major and minor version defined in
include/uapi/linux/kfd_ioctl.h. Thunk clients can query that version
with hsaKmtGetVersion. We haven't been good at updating that version
when we make API changes. Currently two versions are in use:


Upstream: 1.1

DKMS: 1.2


I think this would be a good time to start a habit of bumping the KFD
ioctl minor version every time we change the upstream KFD ioctl API. We
should skip version 1.2, since that is in use by the DKMS driver. We
should also add a comment block above the version definition that
explains the changes in each future API version update.


Thanks,
  Felix


>
> Alex
> ------------------------------------------------------------------------
> *From:* Lin, Amber <Amber.Lin@amd.com>
> *Sent:* Wednesday, April 15, 2020 9:36 AM
> *To:* Deucher, Alexander <Alexander.Deucher@amd.com>; Kuehling, Felix
> <Felix.Kuehling@amd.com>; amd-gfx@lists.freedesktop.org
> <amd-gfx@lists.freedesktop.org>
> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>  
> Thank you Felix. Now I understand the problem of global client ID is
> leaking a hole for potential attackers. I didn't take that into
> consideration. I'll change that following your advice below.
>
> Hi Alex,
>
> Thank you for the link. It's helpful. I have a question regarding the
> versioning. One topic in the article talks about how the userspace can
> figure out if the new ioctl is supported in a given kernel. Is it
> correct that with dkms driver, we use the driver version coming from
> AMDGPU_VERSION in amdgpu_drv.c, and in upstream kernel we use the
> kernel version?
>
> Thanks.
>
> Amber
>
> On 2020-04-14 11:03 p.m., Deucher, Alexander wrote:
>>
>> [AMD Public Use]
>>
>>
>> Some good advice on getting ioctls right:
>> https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html
>>
>> Alex
>>
>> ------------------------------------------------------------------------
>> *From:* amd-gfx <amd-gfx-bounces@lists.freedesktop.org>
>> <mailto:amd-gfx-bounces@lists.freedesktop.org> on behalf of Felix
>> Kuehling <felix.kuehling@amd.com> <mailto:felix.kuehling@amd.com>
>> *Sent:* Tuesday, April 14, 2020 10:40 PM
>> *To:* Lin, Amber <Amber.Lin@amd.com> <mailto:Amber.Lin@amd.com>;
>> amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>> <amd-gfx@lists.freedesktop.org> <mailto:amd-gfx@lists.freedesktop.org>
>> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>>  
>>
>> Hi Amber,
>>
>> I understand that different processes can get the same FD. My
>> statement about FD being unique is relative to one process.
>>
>> The main problem with the global client ID is, that it allows process
>> A to change the event mask of process B just by specifying process
>> B's client ID. That can lead to denial of service attacks where
>> process A can cause events not to be delivered to B or can flood
>> process B with frequent events that it's not prepared to handle.
>>
>> Therefore you must make the lookup of the client from the client ID
>> not from a global list, but from a per-process list. That way process
>> A can only change event masks of process A clients, and not those of
>> any other process.
>>
>> But if the client list is process-specific, you can use the FD as a
>> unique identifier of the client within the process, so you don't need
>> a separate client ID.
>>
>> Regards,
>>   Felix
>>
>> Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:
>>>
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>  
>>>
>>> Hi Felix,
>>>
>>>  
>>>
>>> That was my assumption too that each registration will get different
>>> file descriptor, but it turns out not. When I started two process
>>> and both register gpu0 and gpu1, they both got fd=15. If I have
>>> process A register gpu0+gpu1, and process B only register gpu0,
>>> process A gets fd=15 and process B gets fd=9. That’s why I added
>>> client ID.
>>>
>>>  
>>>
>>> By multiple clients, I mean multiple processes. The ask is users
>>> want to have multiple tools and those different tools can use rsmi
>>> lib to watch events at the same time. Due to the reason above that
>>> two processes can actually get the same fd and I need to add client
>>> ID to distinguish the registration, I don’t see the point of
>>> limiting one registration per process unless I use pid to
>>> distinguish the client instead, which was in my consideration too
>>> when I was writing the code. But second thought is why adding this
>>> restriction when client ID can allow the tool to watch different
>>> events on different devices if they want to. Maybe client ID is a
>>> bad term and it misleads you. I should call it register ID.
>>>
>>>  
>>>
>>> Regards,
>>>
>>> Amber
>>>
>>>  
>>>
>>> *From:* Kuehling, Felix <Felix.Kuehling@amd.com>
>>> <mailto:Felix.Kuehling@amd.com>
>>> *Sent:* Tuesday, April 14, 2020 7:04 PM
>>> *To:* Lin, Amber <Amber.Lin@amd.com> <mailto:Amber.Lin@amd.com>;
>>> amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>
>>> *Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch
>>>
>>>  
>>>
>>> Hi Amber,
>>>
>>> Some general remarks about the multi-client support. You added a
>>> global client id that's separate from the file descriptor. That's
>>> problematic for two reasons:
>>>
>>>  1. A process could change a different process' event mask
>>>  2. The FD should already be unique per process, no need to invent
>>>     another ID
>>>
>>> If we want to allow one process to register for events multiple
>>> times (multiple FDs per process), then the list of clients should be
>>> per process. Each process should only be allowed to change the event
>>> masks of its own clients. The client could be identified by its FD.
>>> No need for another client ID.
>>>
>>> But you could also simplify it further by allowing only one event
>>> client per process. Then you don't need the client ID lookup at all.
>>> Just have a single event client in the kfd_process.
>>>
>>> Another approach would be to make enable/disable functions of the
>>> event FD, rather than the KFD FD ioctl. It could be an ioctl of the
>>> event FD, or even simpler, you could use the write file-operation to
>>> write an event mask (of arbitrary length if you want to enable
>>> growth in the future). That way everything would be neatly
>>> encapsulated in the event FD private data.
>>>
>>> Two more comments inline ...
>>>
>>>  
>>>
>>> Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:
>>>
>>>     When the compute is malfunctioning or performance drops, the system admin
>>>
>>>     will use SMI (System Management Interface) tool to monitor/diagnostic what
>>>
>>>     went wrong. This patch provides an event watch interface for the user
>>>
>>>     space to register devices and subscribe events they are interested. After
>>>
>>>     registered, the user can use annoymous file descriptor's poll function
>>>
>>>     with wait-time specified and wait for events to happen. Once an event
>>>
>>>     happens, the user can use read() to retrieve information related to the
>>>
>>>     event.
>>>
>>>      
>>>
>>>     VM fault event is done in this patch.
>>>
>>>      
>>>
>>>     v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>>
>>>         - correct kfifo usage
>>>
>>>         - move event message API to kfd_ioctl.h
>>>
>>>     v3: send the event msg in text than in binary
>>>
>>>     v4: support multiple clients
>>>
>>>      
>>>
>>>     Signed-off-by: Amber Lin <Amber.Lin@amd.com> <mailto:Amber.Lin@amd.com>
>>>
>>> [snip]
>>>
>>>     diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>>>
>>>     index 4f66764..8146437 100644
>>>
>>>     --- a/include/uapi/linux/kfd_ioctl.h
>>>
>>>     +++ b/include/uapi/linux/kfd_ioctl.h
>>>
>>>     @@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
>>>
>>>       __u32 dmabuf_fd;       /* to KFD */
>>>
>>>      };
>>>
>>>      
>>>
>>>     +/*
>>>
>>>     + * KFD SMI(System Management Interface) events
>>>
>>>     + */
>>>
>>>     +enum kfd_smi_events_op {
>>>
>>>     + KFD_SMI_EVENTS_REGISTER = 1,
>>>
>>>     + KFD_SMI_EVENTS_ENABLE,
>>>
>>>     + KFD_SMI_EVENTS_DISABLE
>>>
>>>     +};
>>>
>>>     +
>>>
>>>     +/* Event type (defined by bitmask) */
>>>
>>>     +#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>>>
>>>     +
>>>
>>>     +struct kfd_ioctl_smi_events_args {
>>>
>>>     + __u32 op;              /* to KFD */
>>>
>>>     + __u64 events;          /* to KFD */
>>>
>>> The binary layout of the ioctl args structure should be the same on
>>> 32/64-bit. That means the 64-bit members should be 64-bit aligned.
>>> The best way to ensure this is to put all the 64-bit members first.
>>>
>>>  
>>>
>>>      
>>>
>>>     + __u64 gpuids_array_ptr;        /* to KFD */
>>>
>>>     + __u32 num_gpuids;      /* to KFD */
>>>
>>>     + __u32 anon_fd;         /* from KFD */
>>>
>>>     + __u32 client_id;       /* to/from KFD */
>>>
>>>     +};
>>>
>>>     +
>>>
>>>     +/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
>>>
>>>     + *    (hex)gpuid(8) + space(1) =  26 bytes
>>>
>>>     + * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
>>>
>>>     + *    When a new event msg uses more memory, change the calculation here.
>>>
>>>     + * 3. End with \n(1)
>>>
>>>     + * 26 + 25 + 1 = 52
>>>
>>>     + */
>>>
>>>     +#define KFD_SMI_MAX_EVENT_MSG 52
>>>
>>> If you define the maximum message length here, clients may start
>>> depending on it, and it gets harder to change it later. I'd not
>>> define this in the API header. It's not necessary to write correct
>>> clients. And if used badly, it may encourage writing incorrect
>>> clients that break with longer messages in the future.
>>>
>>> Regards,
>>>   Felix
>>>
>>>  
>>>
>>>      
>>>
>>>     +
>>>
>>>      /* Register offset inside the remapped mmio page
>>>
>>>       */
>>>
>>>      enum kfd_mmio_remap {
>>>
>>>     @@ -546,7 +576,10 @@ enum kfd_mmio_remap {
>>>
>>>      #define AMDKFD_IOC_ALLOC_QUEUE_GWS            \
>>>
>>>              AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
>>>
>>>      
>>>
>>>     +#define AMDKFD_IOC_SMI_EVENTS                 \
>>>
>>>     +        AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
>>>
>>>     +
>>>
>>>      #define AMDKFD_COMMAND_START           0x01
>>>
>>>     -#define AMDKFD_COMMAND_END             0x1F
>>>
>>>     +#define AMDKFD_COMMAND_END             0x20
>>>
>>>      
>>>
>>>      #endif
>>>
>

[-- Attachment #1.2: Type: text/html, Size: 23403 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v4] drm/amdkfd: Provide SMI events watch
@ 2020-04-13  5:15 Amber Lin
  0 siblings, 0 replies; 9+ messages in thread
From: Amber Lin @ 2020-04-13  5:15 UTC (permalink / raw)
  To: amd-gfx; +Cc: Amber Lin

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register devices and subscribe events they are interested. After
registered, the user can use annoymous file descriptor's poll function
with wait-time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
    - correct kfifo usage
    - move event message API to kfd_ioctl.h
v3: send the event msg in text than in binary
v4: support multiple clients

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile              |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c         |  43 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c          |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c      | 235 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h      |  33 ++++
 include/uapi/linux/kfd_ioctl.h                   |  35 +++-
 9 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_dbgdev.o \
 		$(AMDKFD_PATH)/kfd_dbgmgr.o \
+		$(AMDKFD_PATH)/kfd_smi_events.o \
 		$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 		struct kfd_vm_fault_info info;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 
 		memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..f13fde59 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 	return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_smi_events_args *args = data;
+	uint32_t *ids_array;
+	int ret = 0;
+
+	switch (args->op) {
+	case KFD_SMI_EVENTS_REGISTER:
+		ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t),
+					  GFP_KERNEL);
+		if (!ids_array)
+			return -ENOMEM;
+		if (copy_from_user(ids_array,
+				  (void __user *)args->gpuids_array_ptr,
+				  args->num_gpuids * sizeof(uint32_t))) {
+			kfree(ids_array);
+			return -EFAULT;
+		}
+
+		ret = kfd_smi_event_register(args->num_gpuids, ids_array,
+					     &args->anon_fd, &args->client_id);
+		if (ret)
+			kfree(ids_array);
+
+		return ret;
+
+	case KFD_SMI_EVENTS_ENABLE:
+		/* subscribe events */
+		return kfd_smi_event_enable(args->client_id, args->events);
+	case KFD_SMI_EVENTS_DISABLE:
+		/* unsubscribe events */
+		return kfd_smi_event_disable(args->client_id, args->events);
+	}
+
+	return -EINVAL;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1867,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
 			kfd_ioctl_alloc_queue_gws, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+			kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e05d75e..151e83e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_smi_events.h"
 
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
+		kfd_smi_event_update_vmfault(dev, pasid);
 		kfd_process_vm_fault(dev->dqm, pasid);
 		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index f4b7f7e..1aa96a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -62,6 +62,7 @@ static int kfd_init(void)
 	kfd_procfs_init();
 
 	kfd_debugfs_init();
+	kfd_smi_event_init();
 
 	return 0;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 43b888b..3cdff5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1087,4 +1087,7 @@ static inline void kfd_debugfs_fini(void) {}
 
 #endif
 
+/* SMI events */
+void kfd_smi_event_init(void);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
new file mode 100644
index 0000000..6520a35
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_vm.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
+
+struct kfd_smi_client {
+	uint32_t id;
+	struct list_head list;
+	struct kfifo fifo;
+	wait_queue_head_t wait_queue;
+	/* events enabled */
+	uint64_t enabled;
+	/* devices in watching */
+	uint32_t num_devs;
+	uint32_t *dev_ids;
+};
+static LIST_HEAD(clients);
+static spinlock_t clients_lock;
+static uint32_t client_id;
+
+static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
+static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
+static int kfd_smi_ev_release(struct inode *, struct file *);
+
+static const char kfd_smi_name[] = "kfd_smi_ev";
+
+static const struct file_operations kfd_smi_ev_fops = {
+	.owner = THIS_MODULE,
+	.poll = kfd_smi_ev_poll,
+	.read = kfd_smi_ev_read,
+	.release = kfd_smi_ev_release
+};
+
+static __poll_t kfd_smi_ev_poll(struct file *filep,
+				struct poll_table_struct *wait)
+{
+	__poll_t mask;
+	struct kfd_smi_client *client = filep->private_data;
+
+	poll_wait(filep, &client->wait_queue, wait);
+
+	rcu_read_lock();
+	mask = kfifo_is_empty(&client->fifo) ? 0: POLLIN | POLLRDNORM;
+	rcu_read_unlock();
+
+	return mask;
+}
+
+static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
+			       size_t size, loff_t *offset)
+{
+	int ret, copied = 0;
+	struct kfd_smi_client *client = filep->private_data;
+
+	ret = kfifo_to_user(&client->fifo, user, size, &copied);
+	if (ret || !copied) {
+		pr_debug("smi-events: fail to send msg (%i) (%i)\n",
+			ret, copied);
+		return ret ? ret : -EAGAIN;
+	}
+
+	return copied;
+}
+
+static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
+{
+	struct kfd_smi_client *client = filep->private_data;
+
+	spin_lock(&clients_lock);
+	list_del_rcu(&client->list);
+	spin_unlock(&clients_lock);
+
+	synchronize_rcu();
+	kfifo_free(&client->fifo);
+	kfree(client->dev_ids);
+	kfree(client);
+
+	return 0;
+}
+
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+	struct amdgpu_task_info task_info;
+	/* VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25 */
+	/* 16 bytes event + 1 byte space + 8 bytes gpuid + 1 byte space +
+	 * 25 bytes msg + 1 byte \n = 52
+	 */
+	char fifo_in[52];
+	struct kfd_smi_client *client;
+	int i;
+
+	if (list_empty(&clients))
+		return;
+
+	amdgpu_vm_get_task_info(adev, pasid, &task_info);
+	snprintf(fifo_in, 52, "%x %x %x %s\n", KFD_SMI_EVENT_VMFAULT,
+		dev->id, task_info.pid,task_info.task_name);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(client, &clients, list) {
+		if (!(client->enabled & KFD_SMI_EVENT_VMFAULT))
+			continue;
+		for (i = 0; i < client->num_devs; i++) {
+			if (client->dev_ids[i] != dev->id)
+				continue;
+			if (kfifo_avail(&client->fifo) < 52) {
+				pr_err("smi_event(vmfault): no space left\n");
+				rcu_read_unlock();
+				return;
+			}
+			kfifo_in(&client->fifo, fifo_in, sizeof(fifo_in));
+			wake_up_all(&client->wait_queue);
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+}
+
+static struct kfd_smi_client *get_client_by_id(uint32_t id)
+{
+	struct kfd_smi_client *client;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(client, &clients, list) {
+		if (client->id == id) {
+			rcu_read_unlock();
+			return client;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static int kfd_smi_event_update_events(uint32_t id, uint64_t events,
+				       bool is_enable)
+{
+	struct kfd_smi_client *client = get_client_by_id(id);
+
+	if (!client)
+		return -EINVAL;
+
+	if (is_enable)
+		client->enabled |= events;
+	else
+		client->enabled &= ~events;
+
+	return 0;
+}
+
+int kfd_smi_event_enable(uint32_t id, uint64_t events)
+{
+	return kfd_smi_event_update_events(id, events, true);
+}
+
+int kfd_smi_event_disable(uint32_t id, uint64_t events)
+{
+	return kfd_smi_event_update_events(id, events, false);
+}
+
+int kfd_smi_event_register(uint32_t num_devs, uint32_t *dev_ids, uint32_t *fd,
+			   uint32_t *id)
+{
+	struct kfd_smi_client *client;
+	int ret;
+
+	client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&client->list);
+
+	ret = kfifo_alloc(&client->fifo, KFD_SMI_MAX_EVENT_MSG * 16,
+			 GFP_KERNEL);
+	if (ret) {
+		kfree(client);
+		return ret;
+	}
+
+	ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client,
+			       0);
+	if (ret < 0) {
+		kfifo_free(&client->fifo);
+		kfree(client);
+		*fd = 0;
+		return ret;
+	}
+	*fd = ret;
+
+	init_waitqueue_head(&client->wait_queue);
+	client->enabled = 0;
+	client->num_devs = num_devs;
+	client->dev_ids = dev_ids;
+	/* client id is used to identify the client in enable/disable_events */
+	*id = client->id = ++client_id;
+
+	spin_lock(&clients_lock);
+	list_add_rcu(&client->list, &clients);
+	spin_unlock(&clients_lock);
+
+	return 0;
+}
+
+void kfd_smi_event_init(void)
+{
+	INIT_LIST_HEAD(&clients);
+	spin_lock_init(&clients_lock);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
new file mode 100644
index 0000000..a4f9e92
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_SMI_EVENTS_H_INCLUDED
+#define KFD_SMI_EVENTS_H_INCLUDED
+
+void kfd_smi_event_init(void);
+int kfd_smi_event_register(uint32_t num_devs, uint32_t *dev_ids, uint32_t *fd,
+			   uint32_t *id);
+int kfd_smi_event_enable(uint32_t id, uint64_t events);
+int kfd_smi_event_disable(uint32_t id, uint64_t events);
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+
+#endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 4f66764..8146437 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -442,6 +442,36 @@ struct kfd_ioctl_import_dmabuf_args {
 	__u32 dmabuf_fd;	/* to KFD */
 };
 
+/*
+ * KFD SMI(System Management Interface) events
+ */
+enum kfd_smi_events_op {
+	KFD_SMI_EVENTS_REGISTER = 1,
+	KFD_SMI_EVENTS_ENABLE,
+	KFD_SMI_EVENTS_DISABLE
+};
+
+/* Event type (defined by bitmask) */
+#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
+
+struct kfd_ioctl_smi_events_args {
+	__u32 op;		/* to KFD */
+	__u64 events;		/* to KFD */
+	__u64 gpuids_array_ptr;	/* to KFD */
+	__u32 num_gpuids;	/* to KFD */
+	__u32 anon_fd;		/* from KFD */
+	__u32 client_id;	/* to/from KFD */
+};
+
+/* 1. All messages must start with (hex)uint64_event(16) + space(1) +
+ *    (hex)gpuid(8) + space(1) =  26 bytes
+ * 2. VmFault msg = (hex)uint32_pid(8) + space(1) + task name(16) = 25
+ *    When a new event msg uses more memory, change the calculation here.
+ * 3. End with \n(1)
+ * 26 + 25 + 1 = 52
+ */
+#define KFD_SMI_MAX_EVENT_MSG 52
+
 /* Register offset inside the remapped mmio page
  */
 enum kfd_mmio_remap {
@@ -546,7 +576,10 @@ enum kfd_mmio_remap {
 #define AMDKFD_IOC_ALLOC_QUEUE_GWS		\
 		AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
 
+#define AMDKFD_IOC_SMI_EVENTS			\
+		AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x1F
+#define AMDKFD_COMMAND_END		0x20
 
 #endif
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-04-15 15:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-14 21:30 [PATCH v4] drm/amdkfd: Provide SMI events watch Amber Lin
2020-04-14 23:04 ` Felix Kuehling
2020-04-15  0:09   ` Lin, Amber
2020-04-15  2:40     ` Felix Kuehling
2020-04-15  3:03       ` Deucher, Alexander
2020-04-15 13:36         ` Amber Lin
2020-04-15 13:48           ` Deucher, Alexander
2020-04-15 15:26             ` Felix Kuehling
  -- strict thread matches above, loose matches on Subject: below --
2020-04-13  5:15 Amber Lin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.