All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
@ 2020-07-22 16:08 Mukul Joshi
  2020-07-23 20:36 ` Felix Kuehling
  2020-07-23 21:27 ` Amber Lin
  0 siblings, 2 replies; 6+ messages in thread
From: Mukul Joshi @ 2020-07-22 16:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Mukul Joshi, Felix.Kuehling

Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
 drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
 include/uapi/linux/kfd_ioctl.h                |  3 +-
 10 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1b865fed74ca..19e4658756d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
 {
 }
+
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+}
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3f2b695cf19e..e8b0258aae24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
 					       struct dma_fence *fence);
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
 
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4bfedaab183f..d5e790f046b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -29,6 +29,7 @@
 #include "cwsr_trap_handler.h"
 #include "kfd_iommu.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
 	WARN_ONCE(count < 0, "Compute profile ref. count error");
 }
 
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+	if (kfd)
+		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 7b348bf9df21..00c90b47155b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -24,6 +24,7 @@
 #include <linux/wait.h>
 #include <linux/anon_inodes.h>
 #include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu.h"
 #include "amdgpu_vm.h"
 #include "kfd_priv.h"
 #include "kfd_smi_events.h"
@@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
 	return 0;
 }
 
+static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
+			      char *event_msg, int len)
+{
+	struct kfd_smi_client *client;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+		if (!(READ_ONCE(client->events) & smi_event))
+			continue;
+		spin_lock(&client->lock);
+		if (kfifo_avail(&client->fifo) >= len) {
+			kfifo_in(&client->fifo, event_msg, len);
+			wake_up_all(&client->wait_queue);
+		} else {
+			pr_debug("smi_event(EventID: %llu): no space left\n",
+					smi_event);
+		}
+		spin_unlock(&client->lock);
+	}
+
+	rcu_read_unlock();
+}
+
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+					     uint32_t throttle_bitmask)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+	/*
+	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
+	 * 			 thermal_interrupt_count(8):
+	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
+	 * 4 byte throttle_bitmask + 1 byte : +
+	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
+	 */
+	char fifo_in[36];
+	int len;
+
+	if (list_empty(&dev->smi_clients))
+		return;
+
+	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
+		       KFD_SMI_EVENT_THERMAL_THROTTLE,
+		       dev->id, throttle_bitmask,
+		       atomic64_read(&adev->smu.throttle_int_counter));
+
+	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
+}
+
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
@@ -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
 	 */
 	char fifo_in[43];
-	struct kfd_smi_client *client;
 	int len;
 
 	if (list_empty(&dev->smi_clients))
@@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
 		task_info.pid, task_info.task_name);
 
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
-		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
-			continue;
-		spin_lock(&client->lock);
-		if (kfifo_avail(&client->fifo) >= len) {
-			kfifo_in(&client->fifo, fifo_in, len);
-			wake_up_all(&client->wait_queue);
-		}
-		else
-			pr_debug("smi_event(vmfault): no space left\n");
-		spin_unlock(&client->lock);
-	}
-
-	rcu_read_unlock();
+	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
 }
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index a9cb218fef96..15537b2cccb5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -25,5 +25,7 @@
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+					     uint32_t throttle_bitmask);
 
 #endif
diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index b197dcaed064..52b52cbb90e2 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
 	mutex_init(&smu->message_lock);
 
 	INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
+	atomic64_set(&smu->throttle_int_counter, 0);
 	smu->watermarks_bitmap = 0;
 	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
 	smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index 9b68760dd35b..eb3a57593f69 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
 
 	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
 			log_buf);
+	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
 }
 
 static const struct pptable_funcs arcturus_ppt_funcs = {
diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index 896b443f1ce8..18ba421c59bd 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -446,6 +446,7 @@ struct smu_context
 	bool dc_controlled_by_gpio;
 
 	struct work_struct throttling_logging_work;
+	atomic64_t throttle_int_counter;
 };
 
 struct i2c_adapter;
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index fd82402065e6..a9453ec01619 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
 				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
 				break;
 			case 0x7:
+				/*
+				 * Increment the throttle interrupt counter
+				 */
+				atomic64_inc(&smu->throttle_int_counter);
+
 				if (!atomic_read(&adev->throttling_logging_enabled))
 					return 0;
 
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f738c3b53f4e..df6c7a43aadc 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
  * KFD SMI(System Management Interface) events
  */
 /* Event type (defined by bitmask) */
-#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
+#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
+#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
 
 struct kfd_ioctl_smi_events_args {
 	__u32 gpuid;	/* to KFD */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
  2020-07-22 16:08 [PATCH v2] drm/amdkfd: Add thermal throttling SMI event Mukul Joshi
@ 2020-07-23 20:36 ` Felix Kuehling
  2020-07-23 21:27 ` Amber Lin
  1 sibling, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2020-07-23 20:36 UTC (permalink / raw)
  To: Mukul Joshi, amd-gfx

Am 2020-07-22 um 12:08 p.m. schrieb Mukul Joshi:
> Add support for reporting thermal throttling events through SMI.
> Also, add a counter to count the number of throttling interrupts
> observed and report the count in the SMI event message.

There is still a problem with the message size calculation. See inline.
With that fixed, the patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


>
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>  drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>  include/uapi/linux/kfd_ioctl.h                |  3 +-
>  10 files changed, 75 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1b865fed74ca..19e4658756d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>  {
>  }
> +
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> +}
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3f2b695cf19e..e8b0258aae24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>  int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>  					       struct dma_fence *fence);
>  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
>  
>  #endif /* AMDGPU_AMDKFD_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4bfedaab183f..d5e790f046b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -29,6 +29,7 @@
>  #include "cwsr_trap_handler.h"
>  #include "kfd_iommu.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>  
>  #define MQD_SIZE_ALIGNED 768
>  
> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>  	WARN_ONCE(count < 0, "Compute profile ref. count error");
>  }
>  
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> +	if (kfd)
> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
> +}
> +
>  #if defined(CONFIG_DEBUG_FS)
>  
>  /* This function will send a package to HIQ to hang the HWS
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7b348bf9df21..00c90b47155b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -24,6 +24,7 @@
>  #include <linux/wait.h>
>  #include <linux/anon_inodes.h>
>  #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu.h"
>  #include "amdgpu_vm.h"
>  #include "kfd_priv.h"
>  #include "kfd_smi_events.h"
> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>  	return 0;
>  }
>  
> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
> +			      char *event_msg, int len)
> +{
> +	struct kfd_smi_client *client;
> +
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> +		if (!(READ_ONCE(client->events) & smi_event))
> +			continue;
> +		spin_lock(&client->lock);
> +		if (kfifo_avail(&client->fifo) >= len) {
> +			kfifo_in(&client->fifo, event_msg, len);
> +			wake_up_all(&client->wait_queue);
> +		} else {
> +			pr_debug("smi_event(EventID: %llu): no space left\n",
> +					smi_event);
> +		}
> +		spin_unlock(&client->lock);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> +	/*
> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
> +	 * 			 thermal_interrupt_count(8):
> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
> +	 * 4 byte throttle_bitmask + 1 byte : +

The throttle_bitmask is declared as uint32_t. That means you need up to
8 hexadecimal digits.


> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36

A 64-bit counter in hexadecimal can be up to 16 digits.

You also need to reserve one more byte for the terminating \0.

Regards,
  Felix


> +	 */
> +	char fifo_in[36];
> +	int len;
> +
> +	if (list_empty(&dev->smi_clients))
> +		return;
> +
> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
> +		       dev->id, throttle_bitmask,
> +		       atomic64_read(&adev->smu.throttle_int_counter));
> +
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
> +}
> +
>  void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> @@ -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>  	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>  	 */
>  	char fifo_in[43];
> -	struct kfd_smi_client *client;
>  	int len;
>  
>  	if (list_empty(&dev->smi_clients))
> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>  	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>  		task_info.pid, task_info.task_name);
>  
> -	rcu_read_lock();
> -
> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> -			continue;
> -		spin_lock(&client->lock);
> -		if (kfifo_avail(&client->fifo) >= len) {
> -			kfifo_in(&client->fifo, fifo_in, len);
> -			wake_up_all(&client->wait_queue);
> -		}
> -		else
> -			pr_debug("smi_event(vmfault): no space left\n");
> -		spin_unlock(&client->lock);
> -	}
> -
> -	rcu_read_unlock();
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>  }
>  
>  int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index a9cb218fef96..15537b2cccb5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -25,5 +25,7 @@
>  
>  int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>  void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask);
>  
>  #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index b197dcaed064..52b52cbb90e2 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>  	mutex_init(&smu->message_lock);
>  
>  	INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
> +	atomic64_set(&smu->throttle_int_counter, 0);
>  	smu->watermarks_bitmap = 0;
>  	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>  	smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 9b68760dd35b..eb3a57593f69 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
>  
>  	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>  			log_buf);
> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>  }
>  
>  static const struct pptable_funcs arcturus_ppt_funcs = {
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 896b443f1ce8..18ba421c59bd 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -446,6 +446,7 @@ struct smu_context
>  	bool dc_controlled_by_gpio;
>  
>  	struct work_struct throttling_logging_work;
> +	atomic64_t throttle_int_counter;
>  };
>  
>  struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index fd82402065e6..a9453ec01619 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>  				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>  				break;
>  			case 0x7:
> +				/*
> +				 * Increment the throttle interrupt counter
> +				 */
> +				atomic64_inc(&smu->throttle_int_counter);
> +
>  				if (!atomic_read(&adev->throttling_logging_enabled))
>  					return 0;
>  
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f738c3b53f4e..df6c7a43aadc 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>   * KFD SMI(System Management Interface) events
>   */
>  /* Event type (defined by bitmask) */
> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>  
>  struct kfd_ioctl_smi_events_args {
>  	__u32 gpuid;	/* to KFD */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
  2020-07-22 16:08 [PATCH v2] drm/amdkfd: Add thermal throttling SMI event Mukul Joshi
  2020-07-23 20:36 ` Felix Kuehling
@ 2020-07-23 21:27 ` Amber Lin
  2020-07-23 21:41   ` Joshi, Mukul
  1 sibling, 1 reply; 6+ messages in thread
From: Amber Lin @ 2020-07-23 21:27 UTC (permalink / raw)
  To: Mukul Joshi, amd-gfx; +Cc: Felix.Kuehling



On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
> Add support for reporting thermal throttling events through SMI.
> Also, add a counter to count the number of throttling interrupts
> observed and report the count in the SMI event message.
>
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>   drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>   drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>   .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>   drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>   include/uapi/linux/kfd_ioctl.h                |  3 +-
>   10 files changed, 75 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1b865fed74ca..19e4658756d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>   {
>   }
> +
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> +}
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3f2b695cf19e..e8b0258aae24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>   					       struct dma_fence *fence);
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
>   
>   #endif /* AMDGPU_AMDKFD_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4bfedaab183f..d5e790f046b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -29,6 +29,7 @@
>   #include "cwsr_trap_handler.h"
>   #include "kfd_iommu.h"
>   #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>   
>   #define MQD_SIZE_ALIGNED 768
>   
> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>   	WARN_ONCE(count < 0, "Compute profile ref. count error");
>   }
>   
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
> +{
> +	if (kfd)
> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   /* This function will send a package to HIQ to hang the HWS
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7b348bf9df21..00c90b47155b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -24,6 +24,7 @@
>   #include <linux/wait.h>
>   #include <linux/anon_inodes.h>
>   #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu.h"
>   #include "amdgpu_vm.h"
>   #include "kfd_priv.h"
>   #include "kfd_smi_events.h"
> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>   	return 0;
>   }
>   
> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
> +			      char *event_msg, int len)
> +{
> +	struct kfd_smi_client *client;
> +
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> +		if (!(READ_ONCE(client->events) & smi_event))
> +			continue;
> +		spin_lock(&client->lock);
> +		if (kfifo_avail(&client->fifo) >= len) {
> +			kfifo_in(&client->fifo, event_msg, len);
> +			wake_up_all(&client->wait_queue);
> +		} else {
> +			pr_debug("smi_event(EventID: %llu): no space left\n",
> +					smi_event);
> +		}
> +		spin_unlock(&client->lock);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> +	/*
> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
gpu_id is not needed. The user calls ioctl with GPU specified and KFD 
returns an anonymous fd. Read from this anon_fd already identify the GPU.
> +	 * 			 thermal_interrupt_count(8):
> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
> +	 * 4 byte throttle_bitmask + 1 byte : +
> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
> +	 */
> +	char fifo_in[36];
> +	int len;
> +
> +	if (list_empty(&dev->smi_clients))
> +		return;
> +
> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
> +		       dev->id, throttle_bitmask,
> +		       atomic64_read(&adev->smu.throttle_int_counter));
> +
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
> +}
> +
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> @@ -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>   	 */
>   	char fifo_in[43];
> -	struct kfd_smi_client *client;
>   	int len;
>   
>   	if (list_empty(&dev->smi_clients))
> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>   		task_info.pid, task_info.task_name);
>   
> -	rcu_read_lock();
> -
> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> -			continue;
> -		spin_lock(&client->lock);
> -		if (kfifo_avail(&client->fifo) >= len) {
> -			kfifo_in(&client->fifo, fifo_in, len);
> -			wake_up_all(&client->wait_queue);
> -		}
> -		else
> -			pr_debug("smi_event(vmfault): no space left\n");
> -		spin_unlock(&client->lock);
> -	}
> -
> -	rcu_read_unlock();
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>   }
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index a9cb218fef96..15537b2cccb5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -25,5 +25,7 @@
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index b197dcaed064..52b52cbb90e2 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>   	mutex_init(&smu->message_lock);
>   
>   	INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
> +	atomic64_set(&smu->throttle_int_counter, 0);
>   	smu->watermarks_bitmap = 0;
>   	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>   	smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 9b68760dd35b..eb3a57593f69 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
>   
>   	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>   			log_buf);
> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>   }
>   
>   static const struct pptable_funcs arcturus_ppt_funcs = {
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 896b443f1ce8..18ba421c59bd 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -446,6 +446,7 @@ struct smu_context
>   	bool dc_controlled_by_gpio;
>   
>   	struct work_struct throttling_logging_work;
> +	atomic64_t throttle_int_counter;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index fd82402065e6..a9453ec01619 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>   				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>   				break;
>   			case 0x7:
> +				/*
> +				 * Increment the throttle interrupt counter
> +				 */
> +				atomic64_inc(&smu->throttle_int_counter);
> +
>   				if (!atomic_read(&adev->throttling_logging_enabled))
>   					return 0;
>   
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index f738c3b53f4e..df6c7a43aadc 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>    * KFD SMI(System Management Interface) events
>    */
>   /* Event type (defined by bitmask) */
> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>   
>   struct kfd_ioctl_smi_events_args {
>   	__u32 gpuid;	/* to KFD */

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
  2020-07-23 21:27 ` Amber Lin
@ 2020-07-23 21:41   ` Joshi, Mukul
  2020-07-23 21:57     ` Amber Lin
  0 siblings, 1 reply; 6+ messages in thread
From: Joshi, Mukul @ 2020-07-23 21:41 UTC (permalink / raw)
  To: Lin, Amber, amd-gfx; +Cc: Kuehling, Felix

[AMD Official Use Only - Internal Distribution Only]



-----Original Message-----
From: Lin, Amber <Amber.Lin@amd.com> 
Sent: Thursday, July 23, 2020 5:27 PM
To: Joshi, Mukul <Mukul.Joshi@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>
Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event



On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
> Add support for reporting thermal throttling events through SMI.
> Also, add a counter to count the number of throttling interrupts 
> observed and report the count in the SMI event message.
>
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>   drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>   drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>   .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>   drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>   include/uapi/linux/kfd_ioctl.h                |  3 +-
>   10 files changed, 75 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 1b865fed74ca..19e4658756d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>   {
>   }
> +
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask) { }
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 3f2b695cf19e..e8b0258aae24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>   					       struct dma_fence *fence);
>   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask);
>   
>   #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4bfedaab183f..d5e790f046b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -29,6 +29,7 @@
>   #include "cwsr_trap_handler.h"
>   #include "kfd_iommu.h"
>   #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>   
>   #define MQD_SIZE_ALIGNED 768
>   
> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>   	WARN_ONCE(count < 0, "Compute profile ref. count error");
>   }
>   
> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
> +throttle_bitmask) {
> +	if (kfd)
> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); }
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   /* This function will send a package to HIQ to hang the HWS diff 
> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 7b348bf9df21..00c90b47155b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -24,6 +24,7 @@
>   #include <linux/wait.h>
>   #include <linux/anon_inodes.h>
>   #include <uapi/linux/kfd_ioctl.h>
> +#include "amdgpu.h"
>   #include "amdgpu_vm.h"
>   #include "kfd_priv.h"
>   #include "kfd_smi_events.h"
> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>   	return 0;
>   }
>   
> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
> +			      char *event_msg, int len)
> +{
> +	struct kfd_smi_client *client;
> +
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> +		if (!(READ_ONCE(client->events) & smi_event))
> +			continue;
> +		spin_lock(&client->lock);
> +		if (kfifo_avail(&client->fifo) >= len) {
> +			kfifo_in(&client->fifo, event_msg, len);
> +			wake_up_all(&client->wait_queue);
> +		} else {
> +			pr_debug("smi_event(EventID: %llu): no space left\n",
> +					smi_event);
> +		}
> +		spin_unlock(&client->lock);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
> +	/*
> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU.

I agree with you. But I would prefer to keep gpu_id in the SMI event message. The benefit is it explicitly identifies the GPU sending the message. This way user-space doesn't need to maintain an internal mapping of anon_fd to gpu_id.

Regards,
Mukul

> +	 * 			 thermal_interrupt_count(8):
> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
> +	 * 4 byte throttle_bitmask + 1 byte : +
> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
> +	 */
> +	char fifo_in[36];
> +	int len;
> +
> +	if (list_empty(&dev->smi_clients))
> +		return;
> +
> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
> +		       dev->id, throttle_bitmask,
> +		       atomic64_read(&adev->smu.throttle_int_counter));
> +
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, 
> +len); }
> +
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@ 
> -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>   	 */
>   	char fifo_in[43];
> -	struct kfd_smi_client *client;
>   	int len;
>   
>   	if (list_empty(&dev->smi_clients))
> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>   		task_info.pid, task_info.task_name);
>   
> -	rcu_read_lock();
> -
> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
> -			continue;
> -		spin_lock(&client->lock);
> -		if (kfifo_avail(&client->fifo) >= len) {
> -			kfifo_in(&client->fifo, fifo_in, len);
> -			wake_up_all(&client->wait_queue);
> -		}
> -		else
> -			pr_debug("smi_event(vmfault): no space left\n");
> -		spin_unlock(&client->lock);
> -	}
> -
> -	rcu_read_unlock();
> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>   }
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index a9cb218fef96..15537b2cccb5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -25,5 +25,7 @@
>   
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>   void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
> pasid);
> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> +					     uint32_t throttle_bitmask);
>   
>   #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index b197dcaed064..52b52cbb90e2 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>   	mutex_init(&smu->message_lock);
>   
>   	INIT_WORK(&smu->throttling_logging_work, 
> smu_throttling_logging_work_fn);
> +	atomic64_set(&smu->throttle_int_counter, 0);
>   	smu->watermarks_bitmap = 0;
>   	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>   	smu->default_power_profile_mode = 
> PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 9b68760dd35b..eb3a57593f69 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -2251,6 +2251,7 @@ static void 
> arcturus_log_thermal_throttling_event(struct smu_context *smu)
>   
>   	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>   			log_buf);
> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>   }
>   
>   static const struct pptable_funcs arcturus_ppt_funcs = { diff --git 
> a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 896b443f1ce8..18ba421c59bd 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -446,6 +446,7 @@ struct smu_context
>   	bool dc_controlled_by_gpio;
>   
>   	struct work_struct throttling_logging_work;
> +	atomic64_t throttle_int_counter;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index fd82402065e6..a9453ec01619 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>   				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>   				break;
>   			case 0x7:
> +				/*
> +				 * Increment the throttle interrupt counter
> +				 */
> +				atomic64_inc(&smu->throttle_int_counter);
> +
>   				if (!atomic_read(&adev->throttling_logging_enabled))
>   					return 0;
>   
> diff --git a/include/uapi/linux/kfd_ioctl.h 
> b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc 
> 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>    * KFD SMI(System Management Interface) events
>    */
>   /* Event type (defined by bitmask) */
> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>   
>   struct kfd_ioctl_smi_events_args {
>   	__u32 gpuid;	/* to KFD */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
  2020-07-23 21:41   ` Joshi, Mukul
@ 2020-07-23 21:57     ` Amber Lin
  2020-07-23 22:05       ` Joshi, Mukul
  0 siblings, 1 reply; 6+ messages in thread
From: Amber Lin @ 2020-07-23 21:57 UTC (permalink / raw)
  To: Joshi, Mukul, amd-gfx; +Cc: Kuehling, Felix



On 2020-07-23 5:41 p.m., Joshi, Mukul wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> -----Original Message-----
> From: Lin, Amber <Amber.Lin@amd.com>
> Sent: Thursday, July 23, 2020 5:27 PM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>
> Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
>
>
>
> On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
>> Add support for reporting thermal throttling events through SMI.
>> Also, add a counter to count the number of throttling interrupts
>> observed and report the count in the SMI event message.
>>
>> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>>    drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>>    drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>>    .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>>    drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>>    include/uapi/linux/kfd_ioctl.h                |  3 +-
>>    10 files changed, 75 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 1b865fed74ca..19e4658756d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>>    {
>>    }
>> +
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) { }
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 3f2b695cf19e..e8b0258aae24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>>    int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>>    					       struct dma_fence *fence);
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask);
>>    
>>    #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 4bfedaab183f..d5e790f046b4 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -29,6 +29,7 @@
>>    #include "cwsr_trap_handler.h"
>>    #include "kfd_iommu.h"
>>    #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>    
>>    #define MQD_SIZE_ALIGNED 768
>>    
>> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>>    	WARN_ONCE(count < 0, "Compute profile ref. count error");
>>    }
>>    
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) {
>> +	if (kfd)
>> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); }
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    /* This function will send a package to HIQ to hang the HWS diff
>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index 7b348bf9df21..00c90b47155b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -24,6 +24,7 @@
>>    #include <linux/wait.h>
>>    #include <linux/anon_inodes.h>
>>    #include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu.h"
>>    #include "amdgpu_vm.h"
>>    #include "kfd_priv.h"
>>    #include "kfd_smi_events.h"
>> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>    	return 0;
>>    }
>>    
>> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
>> +			      char *event_msg, int len)
>> +{
>> +	struct kfd_smi_client *client;
>> +
>> +	rcu_read_lock();
>> +
>> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> +		if (!(READ_ONCE(client->events) & smi_event))
>> +			continue;
>> +		spin_lock(&client->lock);
>> +		if (kfifo_avail(&client->fifo) >= len) {
>> +			kfifo_in(&client->fifo, event_msg, len);
>> +			wake_up_all(&client->wait_queue);
>> +		} else {
>> +			pr_debug("smi_event(EventID: %llu): no space left\n",
>> +					smi_event);
>> +		}
>> +		spin_unlock(&client->lock);
>> +	}
>> +
>> +	rcu_read_unlock();
>> +}
>> +
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask)
>> +{
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>> +	/*
>> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
> gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU.
>
> I agree with you. But I would prefer to keep gpu_id in the SMI event message. The benefit is it explicitly identifies the GPU sending the message. This way user-space doesn't need to maintain an internal mapping of anon_fd to gpu_id.
We want to keep the message as short as possible so the 1K kfifo can 
contain more events in case multiple events rush in and the user doesn't 
consume fast enough. For the same reason vmfault event doesn't log 
gpu_id. On the user side, the user shouldn't need to do the 
anon_fd/gpu_id convert. That information should be already in their 
structure. Instead of reading/parsing the message, the user reads from 
their structure.
>
> Regards,
> Mukul
>
>> +	 * 			 thermal_interrupt_count(8):
>> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
>> +	 * 4 byte throttle_bitmask + 1 byte : +
>> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
>> +	 */
>> +	char fifo_in[36];
>> +	int len;
>> +
>> +	if (list_empty(&dev->smi_clients))
>> +		return;
>> +
>> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
>> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
>> +		       dev->id, throttle_bitmask,
>> +		       atomic64_read(&adev->smu.throttle_int_counter));
>> +
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in,
>> +len); }
>> +
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    {
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@
>> -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>>    	 */
>>    	char fifo_in[43];
>> -	struct kfd_smi_client *client;
>>    	int len;
>>    
>>    	if (list_empty(&dev->smi_clients))
>> @@ -171,22 +220,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>>    		task_info.pid, task_info.task_name);
>>    
>> -	rcu_read_lock();
>> -
>> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
>> -			continue;
>> -		spin_lock(&client->lock);
>> -		if (kfifo_avail(&client->fifo) >= len) {
>> -			kfifo_in(&client->fifo, fifo_in, len);
>> -			wake_up_all(&client->wait_queue);
>> -		}
>> -		else
>> -			pr_debug("smi_event(vmfault): no space left\n");
>> -		spin_unlock(&client->lock);
>> -	}
>> -
>> -	rcu_read_unlock();
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>>    }
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> index a9cb218fef96..15537b2cccb5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -25,5 +25,7 @@
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t
>> pasid);
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask);
>>    
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> index b197dcaed064..52b52cbb90e2 100644
>> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>>    	mutex_init(&smu->message_lock);
>>    
>>    	INIT_WORK(&smu->throttling_logging_work,
>> smu_throttling_logging_work_fn);
>> +	atomic64_set(&smu->throttle_int_counter, 0);
>>    	smu->watermarks_bitmap = 0;
>>    	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>>    	smu->default_power_profile_mode =
>> PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> index 9b68760dd35b..eb3a57593f69 100644
>> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> @@ -2251,6 +2251,7 @@ static void
>> arcturus_log_thermal_throttling_event(struct smu_context *smu)
>>    
>>    	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>>    			log_buf);
>> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>>    }
>>    
>>    static const struct pptable_funcs arcturus_ppt_funcs = { diff --git
>> a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> index 896b443f1ce8..18ba421c59bd 100644
>> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> @@ -446,6 +446,7 @@ struct smu_context
>>    	bool dc_controlled_by_gpio;
>>    
>>    	struct work_struct throttling_logging_work;
>> +	atomic64_t throttle_int_counter;
>>    };
>>    
>>    struct i2c_adapter;
>> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> index fd82402065e6..a9453ec01619 100644
>> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>>    				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>>    				break;
>>    			case 0x7:
>> +				/*
>> +				 * Increment the throttle interrupt counter
>> +				 */
>> +				atomic64_inc(&smu->throttle_int_counter);
>> +
>>    				if (!atomic_read(&adev->throttling_logging_enabled))
>>    					return 0;
>>    
>> diff --git a/include/uapi/linux/kfd_ioctl.h
>> b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc
>> 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>>     * KFD SMI(System Management Interface) events
>>     */
>>    /* Event type (defined by bitmask) */
>> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
>> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>>    
>>    struct kfd_ioctl_smi_events_args {
>>    	__u32 gpuid;	/* to KFD */

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
  2020-07-23 21:57     ` Amber Lin
@ 2020-07-23 22:05       ` Joshi, Mukul
  0 siblings, 0 replies; 6+ messages in thread
From: Joshi, Mukul @ 2020-07-23 22:05 UTC (permalink / raw)
  To: Lin, Amber, amd-gfx; +Cc: Kuehling, Felix

[AMD Official Use Only - Internal Distribution Only]



-----Original Message-----
From: Lin, Amber <Amber.Lin@amd.com> 
Sent: Thursday, July 23, 2020 5:58 PM
To: Joshi, Mukul <Mukul.Joshi@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>
Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event



On 2020-07-23 5:41 p.m., Joshi, Mukul wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> -----Original Message-----
> From: Lin, Amber <Amber.Lin@amd.com>
> Sent: Thursday, July 23, 2020 5:27 PM
> To: Joshi, Mukul <Mukul.Joshi@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling@amd.com>
> Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event
>
>
>
> On 2020-07-22 12:08 p.m., Mukul Joshi wrote:
>> Add support for reporting thermal throttling events through SMI.
>> Also, add a counter to count the number of throttling interrupts 
>> observed and report the count in the SMI event message.
>>
>> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  4 ++
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  7 ++
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++++++++++++++-----
>>    drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
>>    drivers/gpu/drm/amd/powerplay/amdgpu_smu.c    |  1 +
>>    drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
>>    .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h    |  1 +
>>    drivers/gpu/drm/amd/powerplay/smu_v11_0.c     |  5 ++
>>    include/uapi/linux/kfd_ioctl.h                |  3 +-
>>    10 files changed, 75 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 1b865fed74ca..19e4658756d8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
>>    {
>>    }
>> +
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) { }
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 3f2b695cf19e..e8b0258aae24 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
>>    int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>>    					       struct dma_fence *fence);
>>    void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
>> +throttle_bitmask);
>>    
>>    #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git 
>> a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 4bfedaab183f..d5e790f046b4 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -29,6 +29,7 @@
>>    #include "cwsr_trap_handler.h"
>>    #include "kfd_iommu.h"
>>    #include "amdgpu_amdkfd.h"
>> +#include "kfd_smi_events.h"
>>    
>>    #define MQD_SIZE_ALIGNED 768
>>    
>> @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
>>    	WARN_ONCE(count < 0, "Compute profile ref. count error");
>>    }
>>    
>> +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
>> +throttle_bitmask) {
>> +	if (kfd)
>> +		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); }
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    /* This function will send a package to HIQ to hang the HWS diff 
>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index 7b348bf9df21..00c90b47155b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -24,6 +24,7 @@
>>    #include <linux/wait.h>
>>    #include <linux/anon_inodes.h>
>>    #include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu.h"
>>    #include "amdgpu_vm.h"
>>    #include "kfd_priv.h"
>>    #include "kfd_smi_events.h"
>> @@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
>>    	return 0;
>>    }
>>    
>> +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
>> +			      char *event_msg, int len)
>> +{
>> +	struct kfd_smi_client *client;
>> +
>> +	rcu_read_lock();
>> +
>> +	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> +		if (!(READ_ONCE(client->events) & smi_event))
>> +			continue;
>> +		spin_lock(&client->lock);
>> +		if (kfifo_avail(&client->fifo) >= len) {
>> +			kfifo_in(&client->fifo, event_msg, len);
>> +			wake_up_all(&client->wait_queue);
>> +		} else {
>> +			pr_debug("smi_event(EventID: %llu): no space left\n",
>> +					smi_event);
>> +		}
>> +		spin_unlock(&client->lock);
>> +	}
>> +
>> +	rcu_read_unlock();
>> +}
>> +
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask) {
>> +	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
>> +	/*
>> +	 * ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
> gpu_id is not needed. The user calls ioctl with GPU specified and KFD returns an anonymous fd. Read from this anon_fd already identify the GPU.
>
> I agree with you. But I would prefer to keep gpu_id in the SMI event message. The benefit is it explicitly identifies the GPU sending the message. This way user-space doesn't need to maintain an internal mapping of anon_fd to gpu_id.
We want to keep the message as short as possible so the 1K kfifo can contain more events in case multiple events rush in and the user doesn't consume fast enough. For the same reason vmfault event doesn't log gpu_id. On the user side, the user shouldn't need to do the anon_fd/gpu_id convert. That information should be already in their structure. Instead of reading/parsing the message, the user reads from their structure.

MJ> OK that makes sense. I can remove the gpu_id from the message.

Regards,
Mukul

>
> Regards,
> Mukul
>
>> +	 * 			 thermal_interrupt_count(8):
>> +	 * 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
>> +	 * 4 byte throttle_bitmask + 1 byte : +
>> +	 * 8 byte thermal_interupt_counter + 1 byte \n = 36
>> +	 */
>> +	char fifo_in[36];
>> +	int len;
>> +
>> +	if (list_empty(&dev->smi_clients))
>> +		return;
>> +
>> +	len = snprintf(fifo_in, 36, "%x %x:%x:%llx\n",
>> +		       KFD_SMI_EVENT_THERMAL_THROTTLE,
>> +		       dev->id, throttle_bitmask,
>> +		       atomic64_read(&adev->smu.throttle_int_counter));
>> +
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, 
>> +len); }
>> +
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    {
>>    	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@
>> -156,7 +206,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
>>    	 */
>>    	char fifo_in[43];
>> -	struct kfd_smi_client *client;
>>    	int len;
>>    
>>    	if (list_empty(&dev->smi_clients)) @@ -171,22 +220,7 @@ void 
>> kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>>    	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
>>    		task_info.pid, task_info.task_name);
>>    
>> -	rcu_read_lock();
>> -
>> -	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
>> -		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
>> -			continue;
>> -		spin_lock(&client->lock);
>> -		if (kfifo_avail(&client->fifo) >= len) {
>> -			kfifo_in(&client->fifo, fifo_in, len);
>> -			wake_up_all(&client->wait_queue);
>> -		}
>> -		else
>> -			pr_debug("smi_event(vmfault): no space left\n");
>> -		spin_unlock(&client->lock);
>> -	}
>> -
>> -	rcu_read_unlock();
>> +	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
>>    }
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff 
>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> index a9cb218fef96..15537b2cccb5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>> @@ -25,5 +25,7 @@
>>    
>>    int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
>>    void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t 
>> pasid);
>> +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>> +					     uint32_t throttle_bitmask);
>>    
>>    #endif
>> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> index b197dcaed064..52b52cbb90e2 100644
>> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
>> @@ -639,6 +639,7 @@ static int smu_sw_init(void *handle)
>>    	mutex_init(&smu->message_lock);
>>    
>>    	INIT_WORK(&smu->throttling_logging_work,
>> smu_throttling_logging_work_fn);
>> +	atomic64_set(&smu->throttle_int_counter, 0);
>>    	smu->watermarks_bitmap = 0;
>>    	smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>>    	smu->default_power_profile_mode = 
>> PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
>> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> index 9b68760dd35b..eb3a57593f69 100644
>> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
>> @@ -2251,6 +2251,7 @@ static void
>> arcturus_log_thermal_throttling_event(struct smu_context *smu)
>>    
>>    	dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
>>    			log_buf);
>> +	kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
>>    }
>>    
>>    static const struct pptable_funcs arcturus_ppt_funcs = { diff 
>> --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> index 896b443f1ce8..18ba421c59bd 100644
>> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
>> @@ -446,6 +446,7 @@ struct smu_context
>>    	bool dc_controlled_by_gpio;
>>    
>>    	struct work_struct throttling_logging_work;
>> +	atomic64_t throttle_int_counter;
>>    };
>>    
>>    struct i2c_adapter;
>> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> index fd82402065e6..a9453ec01619 100644
>> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
>> @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>>    				smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
>>    				break;
>>    			case 0x7:
>> +				/*
>> +				 * Increment the throttle interrupt counter
>> +				 */
>> +				atomic64_inc(&smu->throttle_int_counter);
>> +
>>    				if (!atomic_read(&adev->throttling_logging_enabled))
>>    					return 0;
>>    
>> diff --git a/include/uapi/linux/kfd_ioctl.h 
>> b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc
>> 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
>>     * KFD SMI(System Management Interface) events
>>     */
>>    /* Event type (defined by bitmask) */
>> -#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
>> +#define KFD_SMI_EVENT_VMFAULT			0x0000000000000001
>> +#define KFD_SMI_EVENT_THERMAL_THROTTLE		0x0000000000000002
>>    
>>    struct kfd_ioctl_smi_events_args {
>>    	__u32 gpuid;	/* to KFD */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-07-23 22:06 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-22 16:08 [PATCH v2] drm/amdkfd: Add thermal throttling SMI event Mukul Joshi
2020-07-23 20:36 ` Felix Kuehling
2020-07-23 21:27 ` Amber Lin
2020-07-23 21:41   ` Joshi, Mukul
2020-07-23 21:57     ` Amber Lin
2020-07-23 22:05       ` Joshi, Mukul

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.