[AMD Official Use Only - Internal Distribution Only] ________________________________ From: Alex Deucher Sent: Tuesday, March 17, 2020 3:03 PM To: Lin, Amber Cc: amd-gfx list Subject: Re: [PATCH] drm/amdkfd: Provide SMI events watch On Tue, Mar 17, 2020 at 1:57 PM Amber Lin wrote: > > When the compute is malfunctioning or performance drops, the system admin > will use the SMI (System Management Interface) tool to monitor/diagnose what > went wrong. This patch provides an event watch interface for the user > space to register events they are interested in. After the event is > registered, the user can use an anonymous file descriptor's poll function > with wait-time specified to wait for the event to happen. Once the event > happens, the user can use read() to retrieve information related to the > event. > > VM fault event is done in this patch. > > Signed-off-by: Amber Lin Can you provide a link to the userspace tools that make use of this interface? Thanks, Alex ===== Hi Alex, https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. These functions will be added to this library: /* Get a handler for watching events */ rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle); /* Register events for the device using the handler from init */ rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events, rsmi_event_handle_t *handle); /* Wait for events. If one of the events happens, a success is returned * with details in data. 
*/ rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms, rsmi_event_data_t *data); /* Stop watching events */ rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle); Regards, Amber > --- > drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- > drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 38 ++++++ > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 10 ++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 143 +++++++++++++++++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 41 +++++++ > include/uapi/linux/kfd_ioctl.h | 27 ++++- > 9 files changed, 265 insertions(+), 2 deletions(-) > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > > diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile > index 6147462..cc98b4a 100644 > --- a/drivers/gpu/drm/amd/amdkfd/Makefile > +++ b/drivers/gpu/drm/amd/amdkfd/Makefile > @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ > $(AMDKFD_PATH)/kfd_int_process_v9.o \ > $(AMDKFD_PATH)/kfd_dbgdev.o \ > $(AMDKFD_PATH)/kfd_dbgmgr.o \ > - $(AMDKFD_PATH)/kfd_crat.o > + $(AMDKFD_PATH)/kfd_crat.o \ > + $(AMDKFD_PATH)/kfd_smi_events.o > > ifneq ($(CONFIG_AMD_IOMMU_V2),) > AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o > diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > index 9f59ba9..24b4717 100644 > --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > @@ -24,6 +24,7 @@ > #include "kfd_events.h" > #include "cik_int.h" > #include "amdgpu_amdkfd.h" > +#include "kfd_smi_events.h" > > static bool cik_event_interrupt_isr(struct kfd_dev *dev, > const uint32_t *ih_ring_entry, > @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
kfd_dev *dev, > ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { > struct kfd_vm_fault_info info; > > + kfd_smi_event_update_vmfault(dev, pasid); > kfd_process_vm_fault(dev->dqm, pasid); > > memset(&info, 0, sizeof(info)); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index f8fa03a..8e92956 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -39,6 +39,7 @@ > #include "kfd_device_queue_manager.h" > #include "kfd_dbgmgr.h" > #include "amdgpu_amdkfd.h" > +#include "kfd_smi_events.h" > > static long kfd_ioctl(struct file *, unsigned int, unsigned long); > static int kfd_open(struct inode *, struct file *); > @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, > return ret; > } > > +/* Handle requests for watching SMI events */ > +static int kfd_ioctl_smi_events(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + struct kfd_ioctl_smi_events_args *args = data; > + struct kfd_dev *dev; > + int ret = 0; > + > + dev = kfd_device_by_id(args->gpu_id); > + if (!dev) > + return -EINVAL; > + > + switch (args->op) { > + case KFD_SMI_EVENTS_REGISTER: > + ret = kfd_smi_event_register(dev, args->events); > + if (ret >= 0) { > + /* When the registration is successful, it returns the > + * annoymous inode. 
Pass it to the user in data1 > + */ > + args->data1 = ret; > + ret = 0; > + } > + break; > + case KFD_SMI_EVENTS_UNREGISTER: > + kfd_smi_event_unregister(dev, args->events); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + > + return ret; > +} > + > bool kfd_dev_is_large_bar(struct kfd_dev *dev) > { > struct kfd_local_mem_info mem_info; > @@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { > > AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, > kfd_ioctl_alloc_queue_gws, 0), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, > + kfd_ioctl_smi_events, 0), > }; > > #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 7866cd06..450368c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, > kfd->device_info = device_info; > kfd->pdev = pdev; > kfd->init_complete = false; > + kfd->smi_events.events = 0; > kfd->kfd2kgd = f2g; > atomic_set(&kfd->compute_profile, 0); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > index e05d75e..151e83e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > @@ -24,6 +24,7 @@ > #include "kfd_events.h" > #include "soc15_int.h" > #include "kfd_device_queue_manager.h" > +#include "kfd_smi_events.h" > > static bool event_interrupt_isr_v9(struct kfd_dev *dev, > const uint32_t *ih_ring_entry, > @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, > info.prot_read = ring_id & 0x10; > info.prot_write = ring_id & 0x20; > > + kfd_smi_event_update_vmfault(dev, pasid); > kfd_process_vm_fault(dev->dqm, pasid); > kfd_signal_vm_fault_event(dev, pasid, &info); > } > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 43b888b..fdb51de 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -223,6 +223,13 @@ struct kfd_vmid_info { > uint32_t vmid_num_kfd; > }; > > +struct kfd_smi_events { > + uint64_t events; > + struct kfifo fifo; > + wait_queue_head_t wait_queue; > + uint32_t max_events; > +}; > + > struct kfd_dev { > struct kgd_dev *kgd; > > @@ -309,6 +316,9 @@ struct kfd_dev { > > /* Global GWS resource shared b/t processes*/ > void *gws; > + > + /* if this device is in SMI events watch */ > + struct kfd_smi_events smi_events; > }; > > enum kfd_mempool { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > new file mode 100644 > index 0000000..ba9d036 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -0,0 +1,143 @@ > +/* > + * Copyright 2020 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. > + */ > + > +#include > +#include > +#include > +#include > +#include "amdgpu_vm.h" > +#include "kfd_priv.h" > +#include "kfd_smi_events.h" > + > +static DEFINE_MUTEX(kfd_smi_mutex); > + > +struct mutex *kfd_get_smi_mutex(void) > +{ > + return &kfd_smi_mutex; > +} > + > +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); > +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); > +static int kfd_smi_ev_release(struct inode *, struct file *); > + > +static const char kfd_smi_name[] = "kfd_smi_ev"; > + > +static const struct file_operations kfd_smi_ev_fops = { > + .owner = THIS_MODULE, > + .poll = kfd_smi_ev_poll, > + .read = kfd_smi_ev_read, > + .release = kfd_smi_ev_release > +}; > + > +static __poll_t kfd_smi_ev_poll(struct file *filep, > + struct poll_table_struct *wait) > +{ > + struct kfd_dev *dev = filep->private_data; > + struct kfd_smi_events *ev = &dev->smi_events; > + > + __poll_t mask = 0; > + > + poll_wait(filep, &ev->wait_queue, wait); > + mask |= !kfifo_is_empty(&ev->fifo) ? POLLIN | POLLRDNORM : mask; > + > + return mask; > +} > + > +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, > + size_t size, loff_t *offset) > +{ > + int ret, copied = 0; > + struct kfd_dev *dev = filep->private_data; > + > + ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied); > + if (ret || !copied) { > + pr_debug("kfd smi-events: Fail to read fd (%i) (%i)\n", > + ret, copied); > + return ret ? 
ret : -EAGAIN; > + } > + > + return copied; > +} > + > +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) > +{ > + struct kfd_dev *dev = filep->private_data; > + > + kfifo_free(&dev->smi_events.fifo); > + return 0; > +} > + > +void kfd_smi_event_update_vmfault(struct kfd_dev *kdev, uint16_t pasid) > +{ > + struct kfd_smi_vmfault_fifo fifo_out; > + struct amdgpu_device *adev = (struct amdgpu_device *)kdev->kgd; > + struct amdgpu_task_info task_info; > + > + if (!kdev->smi_events.events) > + return; > + > + if (!(kdev->smi_events.events & KFD_SMI_EV_VMFAULT)) > + return; > + > + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); > + amdgpu_vm_get_task_info(adev, pasid, &task_info); > + > + fifo_out.group = 0; > + fifo_out.event = KFD_SMI_EV_VMFAULT; > + fifo_out.gpu_id = kdev->id; > + fifo_out.pid = task_info.pid; > + strcpy(fifo_out.task_name, task_info.task_name); > + kfifo_in(&kdev->smi_events.fifo, &fifo_out, sizeof(fifo_out)); > + wake_up_all(&kdev->smi_events.wait_queue); > +} > + > +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events) > +{ > + mutex_lock(kfd_get_smi_mutex()); > + dev->smi_events.events &= ~events; > + mutex_unlock(kfd_get_smi_mutex()); > +} > + > +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events) > +{ > + int ret; > + > + mutex_lock(kfd_get_smi_mutex()); > + dev->smi_events.events |= events; > + mutex_unlock(kfd_get_smi_mutex()); > + > + /* We use the lower 32 bits for now. Each bit represents one event. If > + * featured events are increased to more than 32, we'll use the upper > + * bits as groups so the total number of events can be up to 32*32. 
> + */ > + dev->smi_events.max_events = 32; > + ret = kfifo_alloc(&dev->smi_events.fifo, dev->smi_events.max_events, > + GFP_KERNEL); > + if (ret) { > + pr_err("fail to allocate kfifo\n"); > + return ret; > + } > + init_waitqueue_head(&dev->smi_events.wait_queue); > + > + return anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, > + (void *)dev, 0); > +} > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > new file mode 100644 > index 0000000..2e320d3 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -0,0 +1,41 @@ > +/* > + * Copyright 2020 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. 
> + */ > + > +#ifndef KFD_SMI_EVENTS_H_INCLUDED > +#define KFD_SMI_EVENTS_H_INCLUDED > + > +int kfd_smi_event_register(struct kfd_dev *dev, uint64_t events); > +void kfd_smi_event_unregister(struct kfd_dev *dev, uint64_t events); > +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); > + > +/* All FIFO must start with "uint32_t group" and "uint32_t event" so the user > + * can read the first 8 bytes to determine the next read length. > + */ > +struct kfd_smi_vmfault_fifo { > + uint32_t group; > + uint32_t event; > + unsigned int gpu_id; > + pid_t pid; > + char task_name[TASK_COMM_LEN]; > +}; > + > +#endif > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index 4f66764..6ce7c69 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -442,6 +442,28 @@ struct kfd_ioctl_import_dmabuf_args { > __u32 dmabuf_fd; /* to KFD */ > }; > > +/* > + * KFD SMI(System Management Interface) events > + */ > +enum kfd_smi_events_op { > + KFD_SMI_EVENTS_REGISTER = 1, > + KFD_SMI_EVENTS_UNREGISTER > +}; > + > +/* Event ID (mask) */ > +#define KFD_SMI_EV_VMFAULT 0x00000001 > + > +struct kfd_ioctl_smi_events_args { > + __u32 op; /* to KFD */ > + /* upper 32 bits: group. 
lower 32 bits: event IDs */ > + __u64 events; /* to KFD */ > + __u32 gpu_id; /* to KFD */ > + pid_t pid; /* to KFD */ > + __u32 data1; /* from KFD */ > + __u32 data2; > + __u32 data3; > +}; > + > /* Register offset inside the remapped mmio page > */ > enum kfd_mmio_remap { > @@ -546,7 +568,10 @@ enum kfd_mmio_remap { > #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ > AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) > > +#define AMDKFD_IOC_SMI_EVENTS \ > + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) > + > #define AMDKFD_COMMAND_START 0x01 > -#define AMDKFD_COMMAND_END 0x1F > +#define AMDKFD_COMMAND_END 0x20 > > #endif > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CAmber.Lin%40amd.com%7C2f64059b66554d40898208d7caa5e374%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637200686149934820&sdata=vBpnnq1xhSxEMjEWffwSLVIcSymSemDQTiEoYXj5lEE%3D&reserved=0