* [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: StDenis, Tom @ 2019-01-07 15:16 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: StDenis, Tom

v2: Move locks around in other functions so that this
function can stand on its own.  Also, hold only the
hive-specific lock for add/remove device instead of the
driver-global lock, so devices cannot be added or removed
in parallel within one hive.

Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..13d8e2ad2f7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * by different nodes. No point also since the one node already executing
 	 * reset will also reset all the other nodes in the hive.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev);
+	hive = amdgpu_get_xgmi_hive(adev, 0);
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
 	    !mutex_trylock(&hive->hive_lock))
 		return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..ebf50809485f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
 	return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
 	int i;
 	struct amdgpu_hive_info *tmp;
 
 	if (!adev->gmc.xgmi.hive_id)
 		return NULL;
+
+	mutex_lock(&xgmi_mutex);
+
 	for (i = 0 ; i < hive_count; ++i) {
 		tmp = &xgmi_hives[i];
-		if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+			if (lock)
+				mutex_lock(&tmp->hive_lock);
+			mutex_unlock(&xgmi_mutex);
 			return tmp;
+		}
 	}
-	if (i >= AMDGPU_MAX_XGMI_HIVE)
+	if (i >= AMDGPU_MAX_XGMI_HIVE) {
+		mutex_unlock(&xgmi_mutex);
 		return NULL;
+	}
 
 	/* initialize new hive if not exist */
 	tmp = &xgmi_hives[hive_count++];
 	tmp->hive_id = adev->gmc.xgmi.hive_id;
 	INIT_LIST_HEAD(&tmp->device_list);
 	mutex_init(&tmp->hive_lock);
+	if (lock)
+		mutex_lock(&tmp->hive_lock);
+
+	mutex_unlock(&xgmi_mutex);
 
 	return tmp;
 }
@@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		return ret;
 	}
 
-	mutex_lock(&xgmi_mutex);
-	hive = amdgpu_get_xgmi_hive(adev);
+	/* find hive and take lock */
+	hive = amdgpu_get_xgmi_hive(adev, 1);
 	if (!hive)
 		goto exit;
 
@@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 			break;
 	}
 
+	mutex_unlock(&hive->hive_lock);
 exit:
-	mutex_unlock(&xgmi_mutex);
 	return ret;
 }
 
@@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return;
 
-	mutex_lock(&xgmi_mutex);
-
-	hive = amdgpu_get_xgmi_hive(adev);
+	hive = amdgpu_get_xgmi_hive(adev, 1);
 	if (!hive)
-		goto exit;
+		return;
 
 	if (!(hive->number_devices--))
 		mutex_destroy(&hive->hive_lock);
-
-exit:
-	mutex_unlock(&xgmi_mutex);
+	else
+		mutex_unlock(&hive->hive_lock);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..8d7984844174 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -32,7 +32,7 @@ struct amdgpu_hive_info {
 	struct mutex hive_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
-- 
2.17.2

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
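
For readers outside the driver, the locking shape the patch adopts (find-or-create
an entry under a global registry mutex, optionally taking the per-entry lock before
the registry mutex is dropped) reduces to a small userspace analogue. A sketch only;
the names and pthread types are illustrative, not the driver's API:

#include <pthread.h>
#include <stdint.h>

#define MAX_HIVE 4

struct hive {
	uint64_t id;
	pthread_mutex_t lock;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct hive hives[MAX_HIVE];
static int hive_count;

/*
 * Find or create a hive by id.  With take_lock != 0 the per-hive lock
 * is acquired before the registry lock is dropped, so the entry cannot
 * be torn down between lookup and acquisition.  The caller must
 * release hive->lock itself.
 */
static struct hive *get_hive(uint64_t id, int take_lock)
{
	struct hive *h = NULL;
	int i;

	pthread_mutex_lock(&registry_lock);
	for (i = 0; i < hive_count; ++i)
		if (hives[i].id == id) {
			h = &hives[i];
			goto out;
		}
	if (hive_count < MAX_HIVE) {
		h = &hives[hive_count++];
		h->id = id;
		pthread_mutex_init(&h->lock, NULL);
	}
out:
	if (h && take_lock)
		pthread_mutex_lock(&h->lock);
	pthread_mutex_unlock(&registry_lock);
	return h;
}

int main(void)
{
	struct hive *h = get_hive(0x1234, 1);	/* returned locked */

	/* ... add or remove a device here ... */
	pthread_mutex_unlock(&h->lock);		/* caller must unlock */
	return 0;
}

Taking the per-hive lock before the registry lock is released is what keeps a
concurrent remove from destroying the mutex between lookup and acquisition.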


* RE: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: Liu, Shaoyun @ 2019-01-07 16:16 UTC
  To: amd-gfx@lists.freedesktop.org; +Cc: StDenis, Tom

I think it's reasonable to use the hive-specific lock for hive-specific functions.
The change is Acked-by: Shaoyun Liu <Shaoyun.liu@amd.com>



* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: Grodzovsky, Andrey @ 2019-01-07 16:33 UTC
  To: Liu, Shaoyun, StDenis, Tom, amd-gfx@lists.freedesktop.org



On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
> I think it's reasonable to use the hive-specific lock for hive-specific functions.
> The change is Acked-by: Shaoyun Liu <Shaoyun.liu@amd.com>
>
> [...]
>
> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	 * by different nodes. No point also since the one node already executing
>   	 * reset will also reset all the other nodes in the hive.
>   	 */
> -	hive = amdgpu_get_xgmi_hive(adev);
> +	hive = amdgpu_get_xgmi_hive(adev, 0);
>   	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>   	    !mutex_trylock(&hive->hive_lock))
>   		return 0;

Let's say I have device 0 in hive A and it just got a GPU reset, and at
the same time device 1 is being added to the same hive through
amdgpu_xgmi_add_device. hive->hive_lock is acquired by this new device
being added, and so the GPU reset for device 0 bails out on
'!mutex_trylock(&hive->hive_lock)' without completing the reset.
Also, in general I feel a bit uncomfortable about the confusing
acquisition scheme in the function and the fact that you take
hive->hive_lock inside amdgpu_get_xgmi_hive but the release is still
outside of the function.

Andrey
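
The interleaving described above can be reproduced in miniature with plain
pthreads. This is an illustrative sketch only (sleep() stands in for the
topology update, usleep() forces the add to win the race); none of it is
driver code:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t hive_lock = PTHREAD_MUTEX_INITIALIZER;

/* Device 1: the add_device path holds hive_lock across the update. */
static void *add_device(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&hive_lock);
	sleep(1);			/* topology update in progress */
	pthread_mutex_unlock(&hive_lock);
	return NULL;
}

/* Device 0: the recover path only trylocks, so it silently bails. */
static void *gpu_recover(void *arg)
{
	(void)arg;
	if (pthread_mutex_trylock(&hive_lock)) {
		printf("reset skipped: hive_lock busy\n");
		return NULL;
	}
	printf("reset ran\n");
	pthread_mutex_unlock(&hive_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, add_device, NULL);
	usleep(100 * 1000);	/* let add_device take the lock first */
	pthread_create(&b, NULL, gpu_recover, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}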



* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: StDenis, Tom @ 2019-01-07 16:36 UTC
  To: Grodzovsky, Andrey, Liu, Shaoyun, amd-gfx@lists.freedesktop.org

On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
> [...]
>
> Let's say I have device 0 in hive A and it just got a GPU reset, and at
> the same time device 1 is being added to the same hive through
> amdgpu_xgmi_add_device. hive->hive_lock is acquired by this new device
> being added, and so the GPU reset for device 0 bails out on
> '!mutex_trylock(&hive->hive_lock)' without completing the reset.
> Also, in general I feel a bit uncomfortable about the confusing
> acquisition scheme in the function and the fact that you take
> hive->hive_lock inside amdgpu_get_xgmi_hive but the release is still
> outside of the function.

Is adding a device while resetting a device even a valid operation
anyway?

I think this means, rather, that the reset logic is broken.  Instead
there should be a per-hive reset lock that is taken and tested.

Tom
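
A sketch of that separation, assuming a hypothetical reset_lock member next
to hive_lock (userspace analogue, not driver code): resets serialize against
each other on reset_lock, while add/remove serializes on hive_lock, so the
two no longer contend:

#include <pthread.h>

struct hive {
	pthread_mutex_t hive_lock;	/* serializes add/remove device */
	pthread_mutex_t reset_lock;	/* serializes hive-wide reset   */
};

/* Returns 1 if this caller performed the reset, 0 if another node
 * in the hive was already doing it (same bail-out shape as before). */
static int gpu_recover(struct hive *h)
{
	if (pthread_mutex_trylock(&h->reset_lock))
		return 0;
	/* ... hive-wide reset ... */
	pthread_mutex_unlock(&h->reset_lock);
	return 1;
}

/* No longer able to fake out a concurrent reset's trylock. */
static void add_device(struct hive *h)
{
	pthread_mutex_lock(&h->hive_lock);
	/* ... update topology ... */
	pthread_mutex_unlock(&h->hive_lock);
}

int main(void)
{
	struct hive h;

	pthread_mutex_init(&h.hive_lock, NULL);
	pthread_mutex_init(&h.reset_lock, NULL);
	add_device(&h);
	return gpu_recover(&h) ? 0 : 1;
}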




* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: Grodzovsky, Andrey @ 2019-01-07 16:51 UTC
  To: StDenis, Tom, Liu, Shaoyun, amd-gfx@lists.freedesktop.org



On 01/07/2019 11:36 AM, StDenis, Tom wrote:
> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>> [...]
>> Let's say I have device 0 in hive A and it just got a GPU reset, and at
>> the same time device 1 is being added to the same hive through
>> amdgpu_xgmi_add_device. hive->hive_lock is acquired by this new device
>> being added, and so the GPU reset for device 0 bails out on
>> '!mutex_trylock(&hive->hive_lock)' without completing the reset.
>> Also, in general I feel a bit uncomfortable about the confusing
>> acquisition scheme in the function and the fact that you take
>> hive->hive_lock inside amdgpu_get_xgmi_hive but the release is still
>> outside of the function.
> Is adding a device while resetting a device even a valid operation
> anyway?

In theory it's valid if you have hot-pluggable devices.
>
> I think this means, rather, that the reset logic is broken.  Instead
> there should be a per-hive reset lock that is taken and tested.
>
> Tom

The hive->hive_lock was added exactly for this purpose and used only for
that purpose. Maybe the naming I gave it wasn't reflective of its
purpose :)

Andrey



* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: StDenis, Tom @ 2019-01-07 16:53 UTC
  To: Grodzovsky, Andrey, Liu, Shaoyun, amd-gfx@lists.freedesktop.org

On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote:
>
> On 01/07/2019 11:36 AM, StDenis, Tom wrote:
>> [...]
>> Is adding a device while resetting a device even a valid operation
>> anyway?
>
> In theory it's valid if you have hot-pluggable devices.
>>
>> I think this means, rather, that the reset logic is broken.  Instead
>> there should be a per-hive reset lock that is taken and tested.
>>
>> Tom
>
> The hive->hive_lock was added exactly for this purpose and used only for
> that purpose. Maybe the naming I gave it wasn't reflective of its
> purpose :)


But the add/remove should use per-hive locks, not the global lock... :-)

(I'm honestly not trying to bikeshed; I just thought the get_hive
function looked wrong :-)).

Tom

* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: Grodzovsky, Andrey @ 2019-01-07 17:00 UTC
  To: StDenis, Tom, Liu, Shaoyun, amd-gfx@lists.freedesktop.org



On 01/07/2019 11:53 AM, StDenis, Tom wrote:
> [...]
>
> But the add/remove should use per-hive locks, not the global lock... :-)
>
> (I'm honestly not trying to bikeshed; I just thought the get_hive
> function looked wrong :-)).
>
> Tom

Totally agree with you. If Shaoyun (who originally added the global
xgmi_mutex) is fine with switching to a per-hive mutex then me too; I just
point out the problem with GPU reset. As you said, we would then need to
rename the existing hive_lock into reset_lock and add another per-hive
lock to do what you propose. Also - is there a way not to take the
hive lock inside amdgpu_get_xgmi_hive but release it outside? AFAIK
it's an opening for problems where people use it but forget to call
release.

Andrey
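
One conventional answer to that worry is a get/put pair, so the release
mirrors the acquire by name and callers are less likely to forget it. A
minimal sketch with hypothetical helper names (userspace analogue again,
not driver code):

#include <pthread.h>

struct hive {
	pthread_mutex_t lock;
};

/* Acquire has a named counterpart below, instead of a bare unlock. */
static struct hive *get_hive_locked(struct hive *h)
{
	pthread_mutex_lock(&h->lock);
	return h;
}

static void put_hive(struct hive *h)
{
	pthread_mutex_unlock(&h->lock);
}

int main(void)
{
	struct hive hv;
	struct hive *h;

	pthread_mutex_init(&hv.lock, NULL);
	h = get_hive_locked(&hv);	/* lock visibly taken here...  */
	/* ... add or remove a device ... */
	put_hive(h);			/* ...and visibly dropped here */
	return 0;
}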


* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: StDenis, Tom @ 2019-01-07 17:03 UTC
  To: Grodzovsky, Andrey, Liu, Shaoyun, amd-gfx@lists.freedesktop.org

On 2019-01-07 12:00 p.m., Grodzovsky, Andrey wrote:
> [...]
> 
> Totally agree with you. If Shaoyun (who originally added the global
> xgmi_mutex) is fine with switching to a per-hive mutex then me too; I just
> point out the problem with GPU reset. As you said, we would then need to
> rename the existing hive_lock into reset_lock and add another per-hive
> lock to do what you propose. Also - is there a way not to take the
> hive lock inside amdgpu_get_xgmi_hive but release it outside? AFAIK
> it's an opening for problems where people use it but forget to call
> release.

I wanted to take the per-hive lock inside get_hive because get_hive also
takes the global lock, so that add/remove couldn't happen in parallel.

For instance, deleting the last node while adding a new node means the
per-hive mutex could be in limbo (because remove will delete the lock).

Adding a per-hive reset lock would fix the remaining issues, no?

Tom
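
The 'mutex in limbo' hazard (the last remove destroying a lock that a
concurrent add is about to take) is what reference counting conventionally
solves; in the kernel that would be a kref. Below is a sketch with a plain
counter guarded by the registry mutex; all names are hypothetical, not
driver code:

#include <pthread.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

struct hive {
	int refcount;		/* guarded by registry_lock */
	pthread_mutex_t lock;
};

/* Lookup paths bump the refcount under the registry lock, so the
 * hive outlives every user that found it there. */
static struct hive *hive_get(struct hive *h)
{
	pthread_mutex_lock(&registry_lock);
	h->refcount++;
	pthread_mutex_unlock(&registry_lock);
	return h;
}

/* Only dropping the last reference tears the per-hive lock down. */
static void hive_put(struct hive *h)
{
	pthread_mutex_lock(&registry_lock);
	if (--h->refcount == 0)
		pthread_mutex_destroy(&h->lock);
	pthread_mutex_unlock(&registry_lock);
}

int main(void)
{
	static struct hive h = { .lock = PTHREAD_MUTEX_INITIALIZER };

	hive_get(&h);	/* e.g. the add_device path    */
	hive_put(&h);	/* e.g. the remove_device path */
	return 0;
}

With a refcount, remove_device would not need the asymmetric 'destroy on
last device, otherwise unlock' branch the patch has at the end of
amdgpu_xgmi_remove_device.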

* Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
From: Grodzovsky, Andrey @ 2019-01-07 17:20 UTC
  To: StDenis, Tom, Liu, Shaoyun, amd-gfx@lists.freedesktop.org



On 01/07/2019 12:03 PM, StDenis, Tom wrote:
> [...]
> I wanted to take the per-hive lock inside get_hive because get_hive also
> takes the global lock, so that add/remove couldn't happen in parallel.
>
> For instance, deleting the last node while adding a new node means the
> per-hive mutex could be in limbo (because remove will delete the lock).
>
> Adding a per-hive reset lock would fix the remaining issues, no?
>
> Tom

Looks like it.

Andrey


