All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: fix a typo
@ 2017-06-22  2:42 Alex Xie
       [not found] ` <1498099356-31332-1-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Alex Xie @ 2017-06-22  2:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Alex Xie

Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 7635f38..94c27fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -494,7 +494,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 				 &e->user_invalidated) && e->user_pages) {
 
 				/* We acquired a page array, but somebody
-				 * invalidated it. Free it an try again
+				 * invalidated it. Free it and try again
 				 */
 				release_pages(e->user_pages,
 					      e->robj->tbo.ttm->num_pages,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 2/3] drm/amdgpu: change a function to static function
       [not found] ` <1498099356-31332-1-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-22  2:42   ` Alex Xie
       [not found]     ` <1498099356-31332-2-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
  2017-06-22  2:42   ` [PATCH 3/3] drm/amdgpu: optimize out a spin lock. Use atomic instead of spin lock Alex Xie
  2017-06-22  7:35   ` [PATCH 1/3] drm/amdgpu: fix a typo Christian König
  2 siblings, 1 reply; 37+ messages in thread
From: Alex Xie @ 2017-06-22  2:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Alex Xie

The function is called only once inside the .c file.

Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h    | 1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6b7d2a1..7caf514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1929,7 +1929,6 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_need_post(struct amdgpu_device *adev);
 void amdgpu_update_display_priority(struct amdgpu_device *adev);
 
-int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data);
 void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes);
 void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
 bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 94c27fc..82131d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -63,7 +63,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
-int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
+static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
 	struct amdgpu_vm *vm = &fpriv->vm;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 3/3] drm/amdgpu: optimize out a spin lock. Use atomic instead of spin lock.
       [not found] ` <1498099356-31332-1-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
  2017-06-22  2:42   ` [PATCH 2/3] drm/amdgpu: change a function to static function Alex Xie
@ 2017-06-22  2:42   ` Alex Xie
  2017-06-22  7:35   ` [PATCH 1/3] drm/amdgpu: fix a typo Christian König
  2 siblings, 0 replies; 37+ messages in thread
From: Alex Xie @ 2017-06-22  2:42 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Alex Xie

Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 110 +++++++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   1 -
 3 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 7caf514..21d318b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1588,9 +1588,8 @@ struct amdgpu_device {
 
 	/* data for buffer migration throttling */
 	struct {
-		spinlock_t		lock;
-		s64			last_update_us;
-		s64			accum_us; /* accumulated microseconds */
+		atomic64_t		last_update_us;
+		atomic64_t		accum_us; /* accumulated microseconds */
 		u32			log2_max_MBps;
 	} mm_stats;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 82131d7..7b6f42e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -225,6 +225,9 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	s64 time_us, increment_us;
 	u64 max_bytes;
 	u64 free_vram, total_vram, used_vram;
+	s64 old_update_us, head_time_us;
+	s64 accum_us;
+	s64 old_accum_us, head_accum_us;
 
 	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 	 * throttling.
@@ -242,47 +245,83 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	used_vram = atomic64_read(&adev->vram_usage);
 	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
-	spin_lock(&adev->mm_stats.lock);
-
 	/* Increase the amount of accumulated us. */
-	time_us = ktime_to_us(ktime_get());
-	increment_us = time_us - adev->mm_stats.last_update_us;
-	adev->mm_stats.last_update_us = time_us;
-	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
-                                      us_upper_bound);
-
-	/* This prevents the short period of low performance when the VRAM
-	 * usage is low and the driver is in debt or doesn't have enough
-	 * accumulated us to fill VRAM quickly.
-	 *
-	 * The situation can occur in these cases:
-	 * - a lot of VRAM is freed by userspace
-	 * - the presence of a big buffer causes a lot of evictions
-	 *   (solution: split buffers into smaller ones)
-	 *
-	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
-	 * accum_us to a positive number.
-	 */
-	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
-		s64 min_us;
-
-		/* Be more aggresive on dGPUs. Try to fill a portion of free
-		 * VRAM now.
-		 */
-		if (!(adev->flags & AMD_IS_APU))
-			min_us = bytes_to_us(adev, free_vram / 4);
+	old_update_us = atomic64_read(&adev->mm_stats.last_update_us);
+	for (;;) {
+		time_us = ktime_to_us(ktime_get());
+		head_time_us = atomic64_cmpxchg(&adev->mm_stats.last_update_us,
+						old_update_us, time_us);
+
+		if (likely(head_time_us == old_update_us))
+			/*
+			 * No other task modified adev->mm_stats.last_update_us.
+			 * Update was successful.
+			 */
+			break;
 		else
-			min_us = 0; /* Reset accum_us on APUs. */
+			/* Another task modified the value after we read it.
+			 * A rare contention happens, let us retry.
+			 * In most case, one retry can do the job.
+			 * See function atomic64_add_unless as a similar idea.
+			 */
+			old_update_us = head_time_us;
+	}
+	increment_us = time_us - old_update_us;
+
+	old_accum_us = atomic64_read(&adev->mm_stats.accum_us);
+
+	for (;;) {
+		accum_us = min(old_accum_us + increment_us, us_upper_bound);
+
+		/* This prevents the short period of low performance when the
+		 * VRAM usage is low and the driver is in debt or doesn't have
+		 * enough accumulated us to fill VRAM quickly.
+		 *
+		 * The situation can occur in these cases:
+		 * - a lot of VRAM is freed by userspace
+		 * - the presence of a big buffer causes a lot of evictions
+		 *   (solution: split buffers into smaller ones)
+		 *
+		 * If 128 MB or 1/8th of VRAM is free, start filling it now by
+		 * setting accum_us to a positive number.
+		 */
+		if (free_vram >= 128 * 1024 * 1024 ||
+			free_vram >= total_vram / 8) {
+			s64 min_us;
+
+			/* Be more aggresive on dGPUs. Try to fill a portion of
+			 * free VRAM now.
+			 */
+			if (!(adev->flags & AMD_IS_APU))
+				min_us = bytes_to_us(adev, free_vram / 4);
+			else
+				min_us = 0; /* Reset accum_us on APUs. */
+
+			accum_us = max(min_us, accum_us);
+		}
+
+		head_accum_us = atomic64_cmpxchg(&adev->mm_stats.accum_us,
+							old_accum_us, accum_us);
 
-		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+		if (likely(head_accum_us == old_accum_us))
+			/*
+			 * No other task modified adev->mm_stats.accum_us.
+			 * Update was successful.
+			 */
+			break;
+		else
+			/* Another task modified the value after we read it.
+			 * A rare contention happens, let us retry.
+			 * In most case, one retry can do the job.
+			 * See function atomic64_add_unless as a similar idea.
+			 */
+			old_accum_us = head_accum_us;
 	}
 
 	/* This returns 0 if the driver is in debt to disallow (optional)
 	 * buffer moves.
 	 */
-	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
-
-	spin_unlock(&adev->mm_stats.lock);
+	max_bytes = us_to_bytes(adev, accum_us);
 	return max_bytes;
 }
 
@@ -292,9 +331,8 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
  */
 void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
 {
-	spin_lock(&adev->mm_stats.lock);
-	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
-	spin_unlock(&adev->mm_stats.lock);
+	s64 i = bytes_to_us(adev, num_bytes);
+	atomic64_sub(i, &adev->mm_stats.accum_us);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ff90f78..9e9d592 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2117,7 +2117,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	spin_lock_init(&adev->didt_idx_lock);
 	spin_lock_init(&adev->gc_cac_idx_lock);
 	spin_lock_init(&adev->audio_endpt_idx_lock);
-	spin_lock_init(&adev->mm_stats.lock);
 
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: change a function to static function
       [not found]     ` <1498099356-31332-2-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-22  2:54       ` Michel Dänzer
  0 siblings, 0 replies; 37+ messages in thread
From: Michel Dänzer @ 2017-06-22  2:54 UTC (permalink / raw)
  To: Alex Xie; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 22/06/17 11:42 AM, Alex Xie wrote:
> The function is called only once inside the .c file.
> 
> Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>

The shortlog should explicitly say "drm/amdgpu: Make
amdgpu_cs_parser_init static". With that, this patch and patch 1 are

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found] ` <1498099356-31332-1-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
  2017-06-22  2:42   ` [PATCH 2/3] drm/amdgpu: change a function to static function Alex Xie
  2017-06-22  2:42   ` [PATCH 3/3] drm/amdgpu: optimize out a spin lock. Use atomic instead of spin lock Alex Xie
@ 2017-06-22  7:35   ` Christian König
       [not found]     ` <5ff65f82-9d15-4606-7e25-e4f75c172aed-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2 siblings, 1 reply; 37+ messages in thread
From: Christian König @ 2017-06-22  7:35 UTC (permalink / raw)
  To: Alex Xie, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 22.06.2017 um 04:42 schrieb Alex Xie:
> Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>

With the commit message fixed as Michel suggested patches #1 and #2 are 
Reviewed-by: Christian König <christian.koenig@amd.com> as well.

On patch #3 Marek needs to take a look, cause I don't know the logic 
behind that.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 7635f38..94c27fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -494,7 +494,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   				 &e->user_invalidated) && e->user_pages) {
>   
>   				/* We acquired a page array, but somebody
> -				 * invalidated it. Free it an try again
> +				 * invalidated it. Free it and try again
>   				 */
>   				release_pages(e->user_pages,
>   					      e->robj->tbo.ttm->num_pages,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]     ` <5ff65f82-9d15-4606-7e25-e4f75c172aed-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-22 15:33       ` Xie, AlexBin
       [not found]         ` <DM5PR12MB1257B1FCC981D29A6D4045CDF2DB0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Xie, AlexBin @ 2017-06-22 15:33 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Olsak, Marek


[-- Attachment #1.1: Type: text/plain, Size: 2016 bytes --]

Hi Christian,


In fact, the change from spinlock to atomic is quite painful. When I started, I thought it was easy but later I found there might be race condition here and there. Now I think the change looks more robust. In kernel source, there are several other drivers used the same trick.


On the other hand, I think the logic itself might be optimized considering the locking. But I had spent quite some effort to maintain original logic.


Thanks,

Alex Bin


From: Christian König <deathsimple-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
Sent: Thursday, June 22, 2017 3:35 AM
To: Xie, AlexBin; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [PATCH 1/3] drm/amdgpu: fix a typo

Am 22.06.2017 um 04:42 schrieb Alex Xie:
> Signed-off-by: Alex Xie <AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>

With the commit message fixed as Michel suggested patches #1 and #2 are
Reviewed-by: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org> as well.

On patch #3 Marek needs to take a look, cause I don't know the logic
behind that.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 7635f38..94c27fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -494,7 +494,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                                 &e->user_invalidated) && e->user_pages) {
>
>                                /* We acquired a page array, but somebody
> -                              * invalidated it. Free it an try again
> +                              * invalidated it. Free it and try again
>                                 */
>                                release_pages(e->user_pages,
>                                              e->robj->tbo.ttm->num_pages,



[-- Attachment #1.2: Type: text/html, Size: 4234 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]         ` <DM5PR12MB1257B1FCC981D29A6D4045CDF2DB0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-06-22 16:24           ` Christian König
  2017-06-22 16:27           ` Marek Olšák
  1 sibling, 0 replies; 37+ messages in thread
From: Christian König @ 2017-06-22 16:24 UTC (permalink / raw)
  To: Xie, AlexBin, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Olsak, Marek


[-- Attachment #1.1: Type: text/plain, Size: 2427 bytes --]

Hi Alex,

yeah, the change looks totally ok to me. The problem is just that I'm 
not familiar with that part of the source.

Marek came up with that, so he should at least take a look and nod.

Regards,
Christian.

Am 22.06.2017 um 17:33 schrieb Xie, AlexBin:
>
> Hi Christian,
>
>
> In fact, the change from spinlock to atomic is quite painful. When I 
> started, I thought it was easy but later I found there might be race 
> condition here and there. Now I think the change looks more robust. In 
> kernel source, there are several other drivers used the same trick.
>
>
> On the other hand, I think the logic itself might be optimized 
> considering the locking. But I had spent quite some effort to maintain 
> original logic.
>
>
> Thanks,
>
> Alex Bin
>
>
> *From:* Christian König <deathsimple-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
> *Sent:* Thursday, June 22, 2017 3:35 AM
> *To:* Xie, AlexBin; amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> *Subject:* Re: [PATCH 1/3] drm/amdgpu: fix a typo
> Am 22.06.2017 um 04:42 schrieb Alex Xie:
> > Signed-off-by: Alex Xie <AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
>
> With the commit message fixed as Michel suggested patches #1 and #2 are
> Reviewed-by: Christian König <christian.koenig-5C7GfCeVMHo@public.gmane.org> as well.
>
> On patch #3 Marek needs to take a look, cause I don't know the logic
> behind that.
>
> Regards,
> Christian.
>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > index 7635f38..94c27fc 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > @@ -494,7 +494,7 @@ static int amdgpu_cs_parser_bos(struct 
> amdgpu_cs_parser *p,
> > &e->user_invalidated) && e->user_pages) {
> >
> >                                /* We acquired a page array, but somebody
> > -                              * invalidated it. Free it an try again
> > +                              * invalidated it. Free it and try again
> >                                 */
> > release_pages(e->user_pages,
> > e->robj->tbo.ttm->num_pages,
>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[-- Attachment #1.2: Type: text/html, Size: 5695 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]         ` <DM5PR12MB1257B1FCC981D29A6D4045CDF2DB0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-06-22 16:24           ` Christian König
@ 2017-06-22 16:27           ` Marek Olšák
       [not found]             ` <CAAxE2A7ic2YLmnkzM0Faa3bO9-GwWRTHRc0EUVej5UU7Yhs4Qg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-22 16:27 UTC (permalink / raw)
  To: Xie, AlexBin
  Cc: Christian König, Olsak, Marek,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com> wrote:
> Hi Christian,
>
>
> In fact, the change from spinlock to atomic is quite painful. When I
> started, I thought it was easy but later I found there might be race
> condition here and there. Now I think the change looks more robust. In
> kernel source, there are several other drivers used the same trick.
>
>
> On the other hand, I think the logic itself might be optimized considering
> the locking. But I had spent quite some effort to maintain original logic.

It seems quite complicated and I don't know if there is any
performance benefit. Spinlocks are nice because they allow preemption.

It would be more interesting to merge the CS and BO_LIST ioctls into one.

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]             ` <CAAxE2A7ic2YLmnkzM0Faa3bO9-GwWRTHRc0EUVej5UU7Yhs4Qg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-22 17:34               ` axie
       [not found]                 ` <efc2289a-382d-8443-8419-da2d0cadfd77-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: axie @ 2017-06-22 17:34 UTC (permalink / raw)
  To: Marek Olšák, Xie, AlexBin
  Cc: Christian König, Olsak, Marek,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Marek,

Spin lock and spin unlock is fast. But it is not so fast compared with 
atomic, which is a single CPU instruction in x86.


1. spinlock does NOT allow preemption at local CPU. Let us have a look 
at how spin lock was implemented.

static inline void __raw_spin_lock(raw_spinlock_t *lock)
{
     preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is 
memory barrier operation too.
     spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
     LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}

2.  A function  __lock_acquire called by spinlock. The function is so 
long that I would not attach all of it here.

There is atomic operation inside and 12 meta data updates and 14 if 
statements and it calls quite some other functions.

Note that it disable IRQ...

static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
               int trylock, int read, int check, int hardirqs_off,
               struct lockdep_map *nest_lock, unsigned long ip,
               int references, int pin_count)
{
     struct task_struct *curr = current;
     struct lock_class *class = NULL;
     struct held_lock *hlock;
     unsigned int depth;
     int chain_head = 0;
     int class_idx;
     u64 chain_key;

     if (unlikely(!debug_locks))
         return 0;

     /*
      * Lockdep should run with IRQs disabled, otherwise we could
      * get an interrupt which would want to take locks, which would
      * end up in lockdep and have you got a head-ache already?
      */
     if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable IRQ
         return 0;

....

3. Another function called by spinlock in a higher level:

void lock_acquire(struct lockdep_map *lock, unsigned int subclass,

               int trylock, int read, int check,
               struct lockdep_map *nest_lock, unsigned long ip)
{
     unsigned long flags;

     if (unlikely(current->lockdep_recursion))
         return;

     raw_local_irq_save(flags);
     check_flags(flags);

     current->lockdep_recursion = 1;
     trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, 
ip);
     __lock_acquire(lock, subclass, trylock, read, check,
                irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
     current->lockdep_recursion = 0;
     raw_local_irq_restore(flags);
}


Thanks,

Alex Bin


On 2017-06-22 12:27 PM, Marek Olšák wrote:
> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com> wrote:
>> Hi Christian,
>>
>>
>> In fact, the change from spinlock to atomic is quite painful. When I
>> started, I thought it was easy but later I found there might be race
>> condition here and there. Now I think the change looks more robust. In
>> kernel source, there are several other drivers used the same trick.
>>
>>
>> On the other hand, I think the logic itself might be optimized considering
>> the locking. But I had spent quite some effort to maintain original logic.
> It seems quite complicated and I don't know if there is any
> performance benefit. Spinlocks are nice because they allow preemption.
>
> It would be more interesting to merge the CS and BO_LIST ioctls into one.
>
> Marek

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                 ` <efc2289a-382d-8443-8419-da2d0cadfd77-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-22 18:19                   ` axie
       [not found]                     ` <e23f64c3-6abf-ff65-2300-5c44d242f4df-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: axie @ 2017-06-22 18:19 UTC (permalink / raw)
  To: Marek Olšák, Xie, AlexBin
  Cc: Christian König, Olsak, Marek,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);

Function __lock_acquire double checks that the local IRQ is really disabled.


On 2017-06-22 01:34 PM, axie wrote:
> Hi Marek,
>
> Spin lock and spin unlock is fast. But it is not so fast compared with 
> atomic, which is a single CPU instruction in x86.
>
>
> 1. spinlock does NOT allow preemption at local CPU. Let us have a look 
> at how spin lock was implemented.
>
> static inline void __raw_spin_lock(raw_spinlock_t *lock)
> {
>     preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is 
> memory barrier operation too.
>     spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>     LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
> }
>
> 2.  A function  __lock_acquire called by spinlock. The function is so 
> long that I would not attach all of it here.
>
> There is atomic operation inside and 12 meta data updates and 14 if 
> statements and it calls quite some other functions.
>
> Note that it disable IRQ...
>
> static int __lock_acquire(struct lockdep_map *lock, unsigned int 
> subclass,
>               int trylock, int read, int check, int hardirqs_off,
>               struct lockdep_map *nest_lock, unsigned long ip,
>               int references, int pin_count)
> {
>     struct task_struct *curr = current;
>     struct lock_class *class = NULL;
>     struct held_lock *hlock;
>     unsigned int depth;
>     int chain_head = 0;
>     int class_idx;
>     u64 chain_key;
>
>     if (unlikely(!debug_locks))
>         return 0;
>
>     /*
>      * Lockdep should run with IRQs disabled, otherwise we could
>      * get an interrupt which would want to take locks, which would
>      * end up in lockdep and have you got a head-ache already?
>      */
>     if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable IRQ
>         return 0;
>
> ....
>
> 3. Another function called by spinlock in a higher level:
>
> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>
>               int trylock, int read, int check,
>               struct lockdep_map *nest_lock, unsigned long ip)
> {
>     unsigned long flags;
>
>     if (unlikely(current->lockdep_recursion))
>         return;
>
>     raw_local_irq_save(flags);
>     check_flags(flags);
>
>     current->lockdep_recursion = 1;
>     trace_lock_acquire(lock, subclass, trylock, read, check, 
> nest_lock, ip);
>     __lock_acquire(lock, subclass, trylock, read, check,
>                irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>     current->lockdep_recursion = 0;
>     raw_local_irq_restore(flags);
> }
>
>
> Thanks,
>
> Alex Bin
>
>
> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com> 
>> wrote:
>>> Hi Christian,
>>>
>>>
>>> In fact, the change from spinlock to atomic is quite painful. When I
>>> started, I thought it was easy but later I found there might be race
>>> condition here and there. Now I think the change looks more robust. In
>>> kernel source, there are several other drivers used the same trick.
>>>
>>>
>>> On the other hand, I think the logic itself might be optimized 
>>> considering
>>> the locking. But I had spent quite some effort to maintain original 
>>> logic.
>> It seems quite complicated and I don't know if there is any
>> performance benefit. Spinlocks are nice because they allow preemption.
>>
>> It would be more interesting to merge the CS and BO_LIST ioctls into 
>> one.
>>
>> Marek
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                     ` <e23f64c3-6abf-ff65-2300-5c44d242f4df-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-22 23:54                       ` Marek Olšák
       [not found]                         ` <CAAxE2A61KqM9gr=Zoo5PHFNb8gWp74RG9KOs=efWzf1dMKBwSA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-22 23:54 UTC (permalink / raw)
  To: axie
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

That's all nice, but does it improve performance? Have you been able
to measure some performance difference with that code? Were you
targeting a specific inefficiency you had seen e.g. with a CPU
profiler?

Marek

On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>
> Function __lock_acquire double checks that the local IRQ is really disabled.
>
>
>
> On 2017-06-22 01:34 PM, axie wrote:
>>
>> Hi Marek,
>>
>> Spin lock and spin unlock is fast. But it is not so fast compared with
>> atomic, which is a single CPU instruction in x86.
>>
>>
>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look at
>> how spin lock was implemented.
>>
>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>> {
>>     preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>> memory barrier operation too.
>>     spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>     LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>> }
>>
>> 2.  A function  __lock_acquire called by spinlock. The function is so long
>> that I would not attach all of it here.
>>
>> There is atomic operation inside and 12 meta data updates and 14 if
>> statements and it calls quite some other functions.
>>
>> Note that it disable IRQ...
>>
>> static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>               int trylock, int read, int check, int hardirqs_off,
>>               struct lockdep_map *nest_lock, unsigned long ip,
>>               int references, int pin_count)
>> {
>>     struct task_struct *curr = current;
>>     struct lock_class *class = NULL;
>>     struct held_lock *hlock;
>>     unsigned int depth;
>>     int chain_head = 0;
>>     int class_idx;
>>     u64 chain_key;
>>
>>     if (unlikely(!debug_locks))
>>         return 0;
>>
>>     /*
>>      * Lockdep should run with IRQs disabled, otherwise we could
>>      * get an interrupt which would want to take locks, which would
>>      * end up in lockdep and have you got a head-ache already?
>>      */
>>     if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable IRQ
>>         return 0;
>>
>> ....
>>
>> 3. Another function called by spinlock in a higher level:
>>
>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>
>>               int trylock, int read, int check,
>>               struct lockdep_map *nest_lock, unsigned long ip)
>> {
>>     unsigned long flags;
>>
>>     if (unlikely(current->lockdep_recursion))
>>         return;
>>
>>     raw_local_irq_save(flags);
>>     check_flags(flags);
>>
>>     current->lockdep_recursion = 1;
>>     trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock,
>> ip);
>>     __lock_acquire(lock, subclass, trylock, read, check,
>>                irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>     current->lockdep_recursion = 0;
>>     raw_local_irq_restore(flags);
>> }
>>
>>
>> Thanks,
>>
>> Alex Bin
>>
>>
>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>
>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>> wrote:
>>>>
>>>> Hi Christian,
>>>>
>>>>
>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>> started, I thought it was easy but later I found there might be race
>>>> condition here and there. Now I think the change looks more robust. In
>>>> kernel source, there are several other drivers used the same trick.
>>>>
>>>>
>>>> On the other hand, I think the logic itself might be optimized
>>>> considering
>>>> the locking. But I had spent quite some effort to maintain original
>>>> logic.
>>>
>>> It seems quite complicated and I don't know if there is any
>>> performance benefit. Spinlocks are nice because they allow preemption.
>>>
>>> It would be more interesting to merge the CS and BO_LIST ioctls into one.
>>>
>>> Marek
>>
>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                         ` <CAAxE2A61KqM9gr=Zoo5PHFNb8gWp74RG9KOs=efWzf1dMKBwSA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-23  2:23                           ` axie
       [not found]                             ` <7134d81f-a60f-7093-d2a3-70edde23cdb2-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: axie @ 2017-06-23  2:23 UTC (permalink / raw)
  To: Marek Olšák
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

Hi Marek,


So do you agree that spinlock disables CPU preemption, contrary to your 
original idea?


If you have new reason that this patch does not improve, please speak out.


Many patches in the GPU driver aim at improving performance and power 
efficiency. Do most patches submitted to AMDGPU require 
benchmarking first?

If all developers are required to always answer your questions during code 
review, I am afraid that most open source community developers cannot 
meet that requirement and will stop working on AMDGPU.


To improve performance, there are many bottlenecks to clear. When the 
last several bottlenecks are clear, the performance will show faster 
more significantly.

My past profiling experience told me that clearing a lock can improve 
performance for some drivers by anywhere from 0.3% to a much bigger 
percentage. It depends on many factors; it even depends on the 
application itself.


This is not the first bottleneck fixed. This is surely not the last one.


Thanks,

Alex Bin


On 2017-06-22 07:54 PM, Marek Olšák wrote:
> That's all nice, but does it improve performance? Have you been able
> to measure some performance difference with that code? Were you
> targeting a specific inefficiency you had seen e.g. with a CPU
> profiler?
>
> Marek
>
> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>
>> Function __lock_acquire double checks that the local IRQ is really disabled.
>>
>>
>>
>> On 2017-06-22 01:34 PM, axie wrote:
>>> Hi Marek,
>>>
>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>> atomic, which is a single CPU instruction in x86.
>>>
>>>
>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look at
>>> how spin lock was implemented.
>>>
>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>> {
>>>      preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>> memory barrier operation too.
>>>      spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>      LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>> }
>>>
>>> 2.  A function  __lock_acquire called by spinlock. The function is so long
>>> that I would not attach all of it here.
>>>
>>> There is atomic operation inside and 12 meta data updates and 14 if
>>> statements and it calls quite some other functions.
>>>
>>> Note that it disable IRQ...
>>>
>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>                int trylock, int read, int check, int hardirqs_off,
>>>                struct lockdep_map *nest_lock, unsigned long ip,
>>>                int references, int pin_count)
>>> {
>>>      struct task_struct *curr = current;
>>>      struct lock_class *class = NULL;
>>>      struct held_lock *hlock;
>>>      unsigned int depth;
>>>      int chain_head = 0;
>>>      int class_idx;
>>>      u64 chain_key;
>>>
>>>      if (unlikely(!debug_locks))
>>>          return 0;
>>>
>>>      /*
>>>       * Lockdep should run with IRQs disabled, otherwise we could
>>>       * get an interrupt which would want to take locks, which would
>>>       * end up in lockdep and have you got a head-ache already?
>>>       */
>>>      if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable IRQ
>>>          return 0;
>>>
>>> ....
>>>
>>> 3. Another function called by spinlock in a higher level:
>>>
>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>
>>>                int trylock, int read, int check,
>>>                struct lockdep_map *nest_lock, unsigned long ip)
>>> {
>>>      unsigned long flags;
>>>
>>>      if (unlikely(current->lockdep_recursion))
>>>          return;
>>>
>>>      raw_local_irq_save(flags);
>>>      check_flags(flags);
>>>
>>>      current->lockdep_recursion = 1;
>>>      trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock,
>>> ip);
>>>      __lock_acquire(lock, subclass, trylock, read, check,
>>>                 irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>      current->lockdep_recursion = 0;
>>>      raw_local_irq_restore(flags);
>>> }
>>>
>>>
>>> Thanks,
>>>
>>> Alex Bin
>>>
>>>
>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>> wrote:
>>>>> Hi Christian,
>>>>>
>>>>>
>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>> started, I thought it was easy but later I found there might be race
>>>>> condition here and there. Now I think the change looks more robust. In
>>>>> kernel source, there are several other drivers used the same trick.
>>>>>
>>>>>
>>>>> On the other hand, I think the logic itself might be optimized
>>>>> considering
>>>>> the locking. But I had spent quite some effort to maintain original
>>>>> logic.
>>>> It seems quite complicated and I don't know if there is any
>>>> performance benefit. Spinlocks are nice because they allow preemption.
>>>>
>>>> It would be more interesting to merge the CS and BO_LIST ioctls into one.
>>>>
>>>> Marek
>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                             ` <7134d81f-a60f-7093-d2a3-70edde23cdb2-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-23  6:57                               ` Christian König
       [not found]                                 ` <3436ae97-39b1-4a3f-bb73-2991adad5715-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-06-23 11:37                               ` Marek Olšák
  1 sibling, 1 reply; 37+ messages in thread
From: Christian König @ 2017-06-23  6:57 UTC (permalink / raw)
  To: axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

Hi Alex,

actually Marek is right, command submission is actually not much of a 
bottleneck to us because it is handled from a separate userspace thread.

So those micro optimizations you do here on CPU cycles are actually 
rather superfluous.

But giving the CS IOCTL an option for directly specifying the BOs 
instead of a BO list like Marek suggested would indeed save us some time 
here.

Regards,
Christian.

Am 23.06.2017 um 04:23 schrieb axie:
> Hi Marek,
>
>
> So do you agree that spinlock disables CPU preemption, contrary to 
> your original idea?
>
>
> If you have new reason that this patch does not improve, please speak 
> out.
>
>
> Many patches in GPU driver aim at improving performance and power 
> efficiency. Does most patches submitted in AMDGPU requires a 
> benchmarking first?
>
> If all developers are required to always answer your questions when 
> code review, I am afraid that most open source community developers 
> cannot meet that requirement and stop working on AMDGPU.
>
>
> To improve performance, there are many bottlenecks to clear. When the 
> last several bottlenecks are clear, the performance will show faster 
> more significantly.
>
> My pass profiling experience told me that clearing a lock can improve 
> performance for some driver like 0.3% to much bigger percentage. It 
> depends on many factors, even depends on the application itself.
>
>
> This is not the first bottleneck fixed. This is surely not the last one.
>
>
> Thanks,
>
> Alex Bin
>
>
> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>> That's all nice, but does it improve performance? Have you been able
>> to measure some performance difference with that code? Were you
>> targeting a specific inefficiency you had seen e.g. with a CPU
>> profiler?
>>
>> Marek
>>
>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>
>>> Function __lock_acquire double checks that the local IRQ is really 
>>> disabled.
>>>
>>>
>>>
>>> On 2017-06-22 01:34 PM, axie wrote:
>>>> Hi Marek,
>>>>
>>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>>> atomic, which is a single CPU instruction in x86.
>>>>
>>>>
>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a 
>>>> look at
>>>> how spin lock was implemented.
>>>>
>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>> {
>>>>      preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>> memory barrier operation too.
>>>>      spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>      LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>> }
>>>>
>>>> 2.  A function  __lock_acquire called by spinlock. The function is 
>>>> so long
>>>> that I would not attach all of it here.
>>>>
>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>> statements and it calls quite some other functions.
>>>>
>>>> Note that it disable IRQ...
>>>>
>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int 
>>>> subclass,
>>>>                int trylock, int read, int check, int hardirqs_off,
>>>>                struct lockdep_map *nest_lock, unsigned long ip,
>>>>                int references, int pin_count)
>>>> {
>>>>      struct task_struct *curr = current;
>>>>      struct lock_class *class = NULL;
>>>>      struct held_lock *hlock;
>>>>      unsigned int depth;
>>>>      int chain_head = 0;
>>>>      int class_idx;
>>>>      u64 chain_key;
>>>>
>>>>      if (unlikely(!debug_locks))
>>>>          return 0;
>>>>
>>>>      /*
>>>>       * Lockdep should run with IRQs disabled, otherwise we could
>>>>       * get an interrupt which would want to take locks, which would
>>>>       * end up in lockdep and have you got a head-ache already?
>>>>       */
>>>>      if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 
>>>> <<<<<<<<<<<<<<<Disable IRQ
>>>>          return 0;
>>>>
>>>> ....
>>>>
>>>> 3. Another function called by spinlock in a higher level:
>>>>
>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>
>>>>                int trylock, int read, int check,
>>>>                struct lockdep_map *nest_lock, unsigned long ip)
>>>> {
>>>>      unsigned long flags;
>>>>
>>>>      if (unlikely(current->lockdep_recursion))
>>>>          return;
>>>>
>>>>      raw_local_irq_save(flags);
>>>>      check_flags(flags);
>>>>
>>>>      current->lockdep_recursion = 1;
>>>>      trace_lock_acquire(lock, subclass, trylock, read, check, 
>>>> nest_lock,
>>>> ip);
>>>>      __lock_acquire(lock, subclass, trylock, read, check,
>>>>                 irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>      current->lockdep_recursion = 0;
>>>>      raw_local_irq_restore(flags);
>>>> }
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex Bin
>>>>
>>>>
>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>> wrote:
>>>>>> Hi Christian,
>>>>>>
>>>>>>
>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>> started, I thought it was easy but later I found there might be race
>>>>>> condition here and there. Now I think the change looks more 
>>>>>> robust. In
>>>>>> kernel source, there are several other drivers used the same trick.
>>>>>>
>>>>>>
>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>> considering
>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>> logic.
>>>>> It seems quite complicated and I don't know if there is any
>>>>> performance benefit. Spinlocks are nice because they allow 
>>>>> preemption.
>>>>>
>>>>> It would be more interesting to merge the CS and BO_LIST ioctls 
>>>>> into one.
>>>>>
>>>>> Marek
>>>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                 ` <3436ae97-39b1-4a3f-bb73-2991adad5715-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-23  7:09                                   ` zhoucm1
       [not found]                                     ` <594CBEBC.5010703-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: zhoucm1 @ 2017-06-23  7:09 UTC (permalink / raw)
  To: Christian König, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin



On 2017年06月23日 14:57, Christian König wrote:
> But giving the CS IOCTL an option for directly specifying the BOs 
> instead of a BO list like Marek suggested would indeed save us some 
> time here. 
Interesting, I always follow how to improve our CS ioctl, since UMD guys 
often complain our command submission is slower than Windows.
Then how to directly specify the BOs instead of a BO list? A BO handle 
array from UMD? Could you guys describe it more clearly? Is it doable?

Regards,
David Zhou
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                     ` <594CBEBC.5010703-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-23  8:25                                       ` Christian König
       [not found]                                         ` <0d847baf-8296-f3ce-7a8d-8823f33e392e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Christian König @ 2017-06-23  8:25 UTC (permalink / raw)
  To: zhoucm1, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

Am 23.06.2017 um 09:09 schrieb zhoucm1:
>
>
> On 2017年06月23日 14:57, Christian König wrote:
>> But giving the CS IOCTL an option for directly specifying the BOs 
>> instead of a BO list like Marek suggested would indeed save us some 
>> time here. 
> interesting, I always follow how to improve our cs ioctl, since UMD 
> guys aften complain our command submission is slower than windows.
> Then how to directly specifying the BOs instead of a BO list? BO 
> handle array from UMD? Could your guys describe more clear? Is it doable?

Making the BO list part of the CS IOCTL wouldn't help at all for the 
closed source UMDs. To be precise, we actually came up with the BO list 
approach because of their requirement.

The biggest bunch of work during CS is reserving all the buffers, 
validating them and checking their VM status. It doesn't matter if the 
BOs come from the BO list or directly in the CS IOCTL.

The key point is that CS overhead is pretty much irrelevant for the open 
source stack, since Mesa does command submission from a separate thread 
anyway.

Regards,
Christian.

>
> Regards,
> David Zhou


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                         ` <0d847baf-8296-f3ce-7a8d-8823f33e392e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-23  9:01                                           ` zhoucm1
  2017-06-23  9:01                                           ` zhoucm1
  1 sibling, 0 replies; 37+ messages in thread
From: zhoucm1 @ 2017-06-23  9:01 UTC (permalink / raw)
  To: Christian König, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin



On 2017年06月23日 16:25, Christian König wrote:
> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>
>>
>> On 2017年06月23日 14:57, Christian König wrote:
>>> But giving the CS IOCTL an option for directly specifying the BOs 
>>> instead of a BO list like Marek suggested would indeed save us some 
>>> time here. 
>> interesting, I always follow how to improve our cs ioctl, since UMD 
>> guys aften complain our command submission is slower than windows.
>> Then how to directly specifying the BOs instead of a BO list? BO 
>> handle array from UMD? Could your guys describe more clear? Is it 
>> doable?
>
> Making the BO list part of the CS IOCTL wouldn't help at all for the 
> close source UMDs. To be precise we actually came up with the BO list 
> approach because of their requirement.
>
> The biggest bunch of work during CS is reserving all the buffers, 
> validating them and checking their VM status. 
Totally agree. Every time when I read code there, I often want to 
optimize them.

> It doesn't matter if the BOs come from the BO list or directly in the 
> CS IOCTL.
>
> The key point is that CS overhead is pretty much irrelevant for the 
> open source stack, since Mesa does command submission from a separate 
> thread anyway.
If irrelevant for the open stack, then how does open source stack handle 
"The biggest bunch of work during CS is reserving all the buffers, 
validating them and checking their VM status."?
If open stack has a better way, I think closed stack can follow it, I 
don't know the history.

Regards,
David Zhou
>
> Regards,
> Christian.
>
>>
>> Regards,
>> David Zhou
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                         ` <0d847baf-8296-f3ce-7a8d-8823f33e392e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-06-23  9:01                                           ` zhoucm1
@ 2017-06-23  9:01                                           ` zhoucm1
       [not found]                                             ` <594CD8E0.3080702-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: zhoucm1 @ 2017-06-23  9:01 UTC (permalink / raw)
  To: Christian König, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin



On 2017年06月23日 16:25, Christian König wrote:
> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>
>>
>> On 2017年06月23日 14:57, Christian König wrote:
>>> But giving the CS IOCTL an option for directly specifying the BOs 
>>> instead of a BO list like Marek suggested would indeed save us some 
>>> time here. 
>> interesting, I always follow how to improve our cs ioctl, since UMD 
>> guys aften complain our command submission is slower than windows.
>> Then how to directly specifying the BOs instead of a BO list? BO 
>> handle array from UMD? Could your guys describe more clear? Is it 
>> doable?
>
> Making the BO list part of the CS IOCTL wouldn't help at all for the 
> close source UMDs. To be precise we actually came up with the BO list 
> approach because of their requirement.
>
> The biggest bunch of work during CS is reserving all the buffers, 
> validating them and checking their VM status. 
Totally agree. Every time when I read code there, I often want to 
optimize them.

> It doesn't matter if the BOs come from the BO list or directly in the 
> CS IOCTL.
>
> The key point is that CS overhead is pretty much irrelevant for the 
> open source stack, since Mesa does command submission from a separate 
> thread anyway.
If irrelevant for the open stack, then how does open source stack handle 
"The biggest bunch of work during CS is reserving all the buffers, 
validating them and checking their VM status."?
If open stack has a better way, I think closed stack can follow it, I 
don't know the history.

Regards,
David Zhou
>
> Regards,
> Christian.
>
>>
>> Regards,
>> David Zhou
>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                             ` <594CD8E0.3080702-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-23  9:08                                               ` zhoucm1
       [not found]                                                 ` <594CDA92.1060809-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: zhoucm1 @ 2017-06-23  9:08 UTC (permalink / raw)
  To: Christian König, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin



On 2017年06月23日 17:01, zhoucm1 wrote:
>
>
> On 2017年06月23日 16:25, Christian König wrote:
>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>
>>>
>>> On 2017年06月23日 14:57, Christian König wrote:
>>>> But giving the CS IOCTL an option for directly specifying the BOs 
>>>> instead of a BO list like Marek suggested would indeed save us some 
>>>> time here. 
>>> interesting, I always follow how to improve our cs ioctl, since UMD 
>>> guys aften complain our command submission is slower than windows.
>>> Then how to directly specifying the BOs instead of a BO list? BO 
>>> handle array from UMD? Could your guys describe more clear? Is it 
>>> doable?
>>
>> Making the BO list part of the CS IOCTL wouldn't help at all for the 
>> close source UMDs. To be precise we actually came up with the BO list 
>> approach because of their requirement.
>>
>> The biggest bunch of work during CS is reserving all the buffers, 
>> validating them and checking their VM status. 
> Totally agree. Every time when I read code there, I often want to 
> optimize them.
>
>> It doesn't matter if the BOs come from the BO list or directly in the 
>> CS IOCTL.
>>
>> The key point is that CS overhead is pretty much irrelevant for the 
>> open source stack, since Mesa does command submission from a separate 
>> thread anyway.
> If irrelevant for the open stack, then how does open source stack 
> handle "The biggest bunch of work during CS is reserving all the 
> buffers, validating them and checking their VM status."?
> If open stack has a better way, I think closed stack can follow it, I 
> don't know the history.
Do you not use bo list at all in mesa? radv as well?

Regards,
David Zhou
>
> Regards,
> David Zhou
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> David Zhou
>>
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                 ` <594CDA92.1060809-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-23  9:27                                                   ` Christian König
       [not found]                                                     ` <807f27d6-7e53-4066-c440-699bf66dd227-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-06-25 19:48                                                   ` Dave Airlie
  1 sibling, 1 reply; 37+ messages in thread
From: Christian König @ 2017-06-23  9:27 UTC (permalink / raw)
  To: zhoucm1, axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

Am 23.06.2017 um 11:08 schrieb zhoucm1:
>
>
> On 2017年06月23日 17:01, zhoucm1 wrote:
>>
>>
>> On 2017年06月23日 16:25, Christian König wrote:
>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>
>>>>
>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>> But giving the CS IOCTL an option for directly specifying the BOs 
>>>>> instead of a BO list like Marek suggested would indeed save us 
>>>>> some time here. 
>>>> interesting, I always follow how to improve our cs ioctl, since UMD 
>>>> guys aften complain our command submission is slower than windows.
>>>> Then how to directly specifying the BOs instead of a BO list? BO 
>>>> handle array from UMD? Could your guys describe more clear? Is it 
>>>> doable?
>>>
>>> Making the BO list part of the CS IOCTL wouldn't help at all for the 
>>> close source UMDs. To be precise we actually came up with the BO 
>>> list approach because of their requirement.
>>>
>>> The biggest bunch of work during CS is reserving all the buffers, 
>>> validating them and checking their VM status. 
>> Totally agree. Every time when I read code there, I often want to 
>> optimize them.
>>
>>> It doesn't matter if the BOs come from the BO list or directly in 
>>> the CS IOCTL.
>>>
>>> The key point is that CS overhead is pretty much irrelevant for the 
>>> open source stack, since Mesa does command submission from a 
>>> separate thread anyway.
>> If irrelevant for the open stack, then how does open source stack 
>> handle "The biggest bunch of work during CS is reserving all the 
>> buffers, validating them and checking their VM status."?

Command submission on the open stack is outsourced to a separate user 
space thread. E.g. when an application triggers a flush the IBs created 
so far are just put on a queue and another thread pushes them down to 
the kernel.

I mean reducing the overhead of the CS IOCTL is always nice, but you 
usually won't see any fps increase as long as not all CPUs are completely 
bound to some tasks.

>> If open stack has a better way, I think closed stack can follow it, I 
>> don't know the history.
> Do you not use bo list at all in mesa? radv as well?

I don't think so. Mesa just wants to send the list of used BOs down to 
the kernel with every IOCTL.

Regards,
Christian.

>
> Regards,
> David Zhou
>>
>> Regards,
>> David Zhou
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>> David Zhou
>>>
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                     ` <807f27d6-7e53-4066-c440-699bf66dd227-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-23 10:49                                                       ` Marek Olšák
       [not found]                                                         ` <CAAxE2A4vdV+QD55RAn+mrD92o39X_vPT6wLKCwkREx2=fpU2Vw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-23 10:49 UTC (permalink / raw)
  To: Christian König
  Cc: zhoucm1, axie, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

On Fri, Jun 23, 2017 at 11:27 AM, Christian König
<deathsimple@vodafone.de> wrote:
> Am 23.06.2017 um 11:08 schrieb zhoucm1:
>>
>>
>>
>> On 2017年06月23日 17:01, zhoucm1 wrote:
>>>
>>>
>>>
>>> On 2017年06月23日 16:25, Christian König wrote:
>>>>
>>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>>
>>>>>
>>>>>
>>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>>>
>>>>>> But giving the CS IOCTL an option for directly specifying the BOs
>>>>>> instead of a BO list like Marek suggested would indeed save us some time
>>>>>> here.
>>>>>
>>>>> interesting, I always follow how to improve our cs ioctl, since UMD
>>>>> guys often complain our command submission is slower than windows.
>>>>> Then how to directly specifying the BOs instead of a BO list? BO handle
>>>>> array from UMD? Could your guys describe more clear? Is it doable?
>>>>
>>>>
>>>> Making the BO list part of the CS IOCTL wouldn't help at all for the
>>>> close source UMDs. To be precise we actually came up with the BO list
>>>> approach because of their requirement.
>>>>
>>>> The biggest bunch of work during CS is reserving all the buffers,
>>>> validating them and checking their VM status.
>>>
>>> Totally agree. Every time when I read code there, I often want to
>>> optimize them.
>>>
>>>> It doesn't matter if the BOs come from the BO list or directly in the CS
>>>> IOCTL.
>>>>
>>>> The key point is that CS overhead is pretty much irrelevant for the open
>>>> source stack, since Mesa does command submission from a separate thread
>>>> anyway.
>>>
>>> If irrelevant for the open stack, then how does open source stack handle
>>> "The biggest bunch of work during CS is reserving all the buffers,
>>> validating them and checking their VM status."?
>
>
> Command submission on the open stack is outsourced to a separate user space
> thread. E.g. when an application triggers a flush the IBs created so far are
> just put on a queue and another thread pushes them down to the kernel.
>
> I mean reducing the overhead of the CS IOCTL is always nice, but you usually
> won't see any fps increase as long as not all CPUs are completely bound to
> some tasks.
>
>>> If open stack has a better way, I think closed stack can follow it, I
>>> don't know the history.
>>
>> Do you not use bo list at all in mesa? radv as well?
>
>
> I don't think so. Mesa just wants to send the list of used BOs down to the
> kernel with every IOCTL.

The CS ioctl actually costs us some performance, but not as much as on
closed source drivers.

MesaGL always executes all CS ioctls in a separate thread (in parallel
with the UMD) except for the last IB that's submitted by SwapBuffers.
SwapBuffers requires that all IBs have been submitted when SwapBuffers
returns. For example, if you have 5 IBs per frame, 4 of them are
executed on the thread and the overhead is hidden. The last one is
executed on the thread too, but this time the Mesa driver has to wait
for it. For things like glxgears with only 1 IB per frame, the thread
doesn't hide anything and Mesa always has to wait for it after
submission, just because of SwapBuffers.

Having 10 or more IBs per frame is great, because 9 are done in
parallel and the last one is synchronous. The final CPU cost is 10x
lower, but it's not zero.

For us, it's certainly useful to optimize the CS ioctl because of apps
that submit only 1 IB per frame where multithreading has no effect or
may even hurt performance.

The most obvious inefficiency is the BO_LIST ioctl that is completely
unnecessary and only slows us down. What we need is exactly what
radeon does.

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                             ` <7134d81f-a60f-7093-d2a3-70edde23cdb2-5C7GfCeVMHo@public.gmane.org>
  2017-06-23  6:57                               ` Christian König
@ 2017-06-23 11:37                               ` Marek Olšák
       [not found]                                 ` <CAAxE2A6Tt1JzLCDr4vM1iKASpWGFncqrXkL75==1Zd5wJq8xTA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-23 11:37 UTC (permalink / raw)
  To: axie
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

I agree with you about the spinlock. You seem to be good at this.

It's always good to do measurements to validate that a code change
improves something, especially when the code size and code complexity
has to be increased. A CPU profiler such as sysprof can show you
improvements on the order of 1/10000th = 0.01% if you record enough
samples. Sometimes you have to un-inline a function to make it visible
there. If you see a function that takes 0.3% of CPU time and you
optimize it down to 0.1% using the profiler as the measurement tool,
you have evidence that the improvement is there and nobody can reject
the idea anymore. It also proves that the code size increase is worth
it. It's always "added code size and loss of simplicity" vs benefit.
It's a transaction. You trade one for the other. You lose something to
get something else. OK, we know the code complexity. Now, what's the
benefit? Can you do some measurements? The accuracy of 1/10000th
should be enough for anybody.

I know the feeling when you spend many days working on something,
adding 100s or 1000s of lines of code, solving many problems to get
there and increasing code complexity significantly, and then you do
the measurement and it doesn't improve anything. I know the feeling
very well. It sucks. The frustration comes from the investment of time
and getting no return on the investment. Many frustrations in life are
like that.

Marek


On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
> Hi Marek,
>
>
> So do you agree that spinlock disables CPU preemption, contrary to your
> original idea?
>
>
> If you have new reason that this patch does not improve, please speak out.
>
>
> Many patches in GPU driver aim at improving performance and power
> efficiency. Does most patches submitted in AMDGPU requires a benchmarking
> first?
>
> If all developers are required to always answer your questions when code
> review, I am afraid that most open source community developers cannot meet
> that requirement and stop working on AMDGPU.
>
>
> To improve performance, there are many bottlenecks to clear. When the last
> several bottlenecks are clear, the performance will show faster more
> significantly.
>
> My past profiling experience told me that clearing a lock can improve
> performance for some driver like 0.3% to much bigger percentage. It depends
> on many factors, even depends on the application itself.
>
>
> This is not the first bottleneck fixed. This is surely not the last one.
>
>
> Thanks,
>
> Alex Bin
>
>
>
> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>
>> That's all nice, but does it improve performance? Have you been able
>> to measure some performance difference with that code? Were you
>> targeting a specific inefficiency you had seen e.g. with a CPU
>> profiler?
>>
>> Marek
>>
>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>
>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>
>>> Function __lock_acquire double checks that the local IRQ is really
>>> disabled.
>>>
>>>
>>>
>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>
>>>> Hi Marek,
>>>>
>>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>>> atomic, which is a single CPU instruction in x86.
>>>>
>>>>
>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look
>>>> at
>>>> how spin lock was implemented.
>>>>
>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>> {
>>>>      preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>> memory barrier operation too.
>>>>      spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>      LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>> }
>>>>
>>>> 2.  A function  __lock_acquire called by spinlock. The function is so
>>>> long
>>>> that I would not attach all of it here.
>>>>
>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>> statements and it calls quite some other functions.
>>>>
>>>> Note that it disables IRQ...
>>>>
>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>> subclass,
>>>>                int trylock, int read, int check, int hardirqs_off,
>>>>                struct lockdep_map *nest_lock, unsigned long ip,
>>>>                int references, int pin_count)
>>>> {
>>>>      struct task_struct *curr = current;
>>>>      struct lock_class *class = NULL;
>>>>      struct held_lock *hlock;
>>>>      unsigned int depth;
>>>>      int chain_head = 0;
>>>>      int class_idx;
>>>>      u64 chain_key;
>>>>
>>>>      if (unlikely(!debug_locks))
>>>>          return 0;
>>>>
>>>>      /*
>>>>       * Lockdep should run with IRQs disabled, otherwise we could
>>>>       * get an interrupt which would want to take locks, which would
>>>>       * end up in lockdep and have you got a head-ache already?
>>>>       */
>>>>      if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable
>>>> IRQ
>>>>          return 0;
>>>>
>>>> ....
>>>>
>>>> 3. Another function called by spinlock in a higher level:
>>>>
>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>
>>>>                int trylock, int read, int check,
>>>>                struct lockdep_map *nest_lock, unsigned long ip)
>>>> {
>>>>      unsigned long flags;
>>>>
>>>>      if (unlikely(current->lockdep_recursion))
>>>>          return;
>>>>
>>>>      raw_local_irq_save(flags);
>>>>      check_flags(flags);
>>>>
>>>>      current->lockdep_recursion = 1;
>>>>      trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock,
>>>> ip);
>>>>      __lock_acquire(lock, subclass, trylock, read, check,
>>>>                 irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>      current->lockdep_recursion = 0;
>>>>      raw_local_irq_restore(flags);
>>>> }
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex Bin
>>>>
>>>>
>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>
>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>> wrote:
>>>>>>
>>>>>> Hi Christian,
>>>>>>
>>>>>>
>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>> started, I thought it was easy but later I found there might be race
>>>>>> condition here and there. Now I think the change looks more robust. In
>>>>>> kernel source, there are several other drivers used the same trick.
>>>>>>
>>>>>>
>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>> considering
>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>> logic.
>>>>>
>>>>> It seems quite complicated and I don't know if there is any
>>>>> performance benefit. Spinlocks are nice because they allow preemption.
>>>>>
>>>>> It would be more interesting to merge the CS and BO_LIST ioctls into
>>>>> one.
>>>>>
>>>>> Marek
>>>>
>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                         ` <CAAxE2A4vdV+QD55RAn+mrD92o39X_vPT6wLKCwkREx2=fpU2Vw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-23 11:55                                                           ` Zhou, David(ChunMing)
       [not found]                                                             ` <MWHPR1201MB02063CAC83DEEAB81EFA9A10B4D80-3iK1xFAIwjrUF/YbdlDdgWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  2017-06-26  9:06                                                           ` Michel Dänzer
  1 sibling, 1 reply; 37+ messages in thread
From: Zhou, David(ChunMing) @ 2017-06-23 11:55 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin


________________________________________
From: Marek Olšák [maraeo@gmail.com]
Sent: Friday, June 23, 2017 6:49 PM
To: Christian König
Cc: Zhou, David(ChunMing); Xie, AlexBin; amd-gfx@lists.freedesktop.org; Xie, AlexBin
Subject: Re: [PATCH 1/3] drm/amdgpu: fix a typo

On Fri, Jun 23, 2017 at 11:27 AM, Christian König
<deathsimple@vodafone.de> wrote:
> Am 23.06.2017 um 11:08 schrieb zhoucm1:
>>
>>
>>
>> On 2017年06月23日 17:01, zhoucm1 wrote:
>>>
>>>
>>>
>>> On 2017年06月23日 16:25, Christian König wrote:
>>>>
>>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>>
>>>>>
>>>>>
>>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>>>
>>>>>> But giving the CS IOCTL an option for directly specifying the BOs
>>>>>> instead of a BO list like Marek suggested would indeed save us some time
>>>>>> here.
>>>>>
>>>>> interesting, I always follow how to improve our cs ioctl, since UMD
>>>>> guys often complain our command submission is slower than windows.
>>>>> Then how to directly specifying the BOs instead of a BO list? BO handle
>>>>> array from UMD? Could your guys describe more clear? Is it doable?
>>>>
>>>>
>>>> Making the BO list part of the CS IOCTL wouldn't help at all for the
>>>> close source UMDs. To be precise we actually came up with the BO list
>>>> approach because of their requirement.
>>>>
>>>> The biggest bunch of work during CS is reserving all the buffers,
>>>> validating them and checking their VM status.
>>>
>>> Totally agree. Every time when I read code there, I often want to
>>> optimize them.
>>>
>>>> It doesn't matter if the BOs come from the BO list or directly in the CS
>>>> IOCTL.
>>>>
>>>> The key point is that CS overhead is pretty much irrelevant for the open
>>>> source stack, since Mesa does command submission from a separate thread
>>>> anyway.
>>>
>>> If irrelevant for the open stack, then how does open source stack handle
>>> "The biggest bunch of work during CS is reserving all the buffers,
>>> validating them and checking their VM status."?
>
>
> Command submission on the open stack is outsourced to a separate user space
> thread. E.g. when an application triggers a flush the IBs created so far are
> just put on a queue and another thread pushes them down to the kernel.
>
> I mean reducing the overhead of the CS IOCTL is always nice, but you usually
> won't see any fps increase as long as not all CPUs are completely bound to
> some tasks.
>
>>> If open stack has a better way, I think closed stack can follow it, I
>>> don't know the history.
>>
>> Do you not use bo list at all in mesa? radv as well?
>
>
> I don't think so. Mesa just wants to send the list of used BOs down to the
> kernel with every IOCTL.

The CS ioctl actually costs us some performance, but not as much as on
closed source drivers.

MesaGL always executes all CS ioctls in a separate thread (in parallel
with the UMD) except for the last IB that's submitted by SwapBuffers.
SwapBuffers requires that all IBs have been submitted when SwapBuffers
returns. For example, if you have 5 IBs per frame, 4 of them are
executed on the thread and the overhead is hidden. The last one is
executed on the thread too, but this time the Mesa driver has to wait
for it. For things like glxgears with only 1 IB per frame, the thread
doesn't hide anything and Mesa always has to wait for it after
submission, just because of SwapBuffers.

Having 10 or more IBs per frame is great, because 9 are done in
parallel and the last one is synchronous. The final CPU cost is 10x
lower, but it's not zero.
[DZ] Thanks Marek, this is very useful and helpful message for me to understand Mesa action of CS, I will talk to closed guys to see if it can be used for them.
Another thing I also want to confirm with you, do you know if radv is using this cs way?

For us, it's certainly useful to optimize the CS ioctl because of apps
that submit only 1 IB per frame where multithreading has no effect or
may even hurt performance.

The most obvious inefficiency is the BO_LIST ioctl that is completely
unnecessary and only slows us down. What we need is exactly what
radeon does.

[DZ] I don't know how radeon handle bo list, could you describe it as well?

Thanks,
David Zhou

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                 ` <CAAxE2A6Tt1JzLCDr4vM1iKASpWGFncqrXkL75==1Zd5wJq8xTA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-23 13:01                                   ` Christian König
       [not found]                                     ` <963f4d3d-dc46-2279-509f-b475c5ec94ee-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-06-23 13:45                                   ` axie
  1 sibling, 1 reply; 37+ messages in thread
From: Christian König @ 2017-06-23 13:01 UTC (permalink / raw)
  To: Marek Olšák, axie
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

The key point here is while optimizing this is nice the much bigger pile 
is the locking done for each BO.

In other words even when we optimize all the other locks involved into 
atomics or RCU, the BO reservation lock will still dominate everything.

One possible solution to this would be per process resources like I 
suggested multiple times now.

Christian.

Am 23.06.2017 um 13:37 schrieb Marek Olšák:
> I agree with you about the spinlock. You seem to be good at this.
>
> It's always good to do measurements to validate that a code change
> improves something, especially when the code size and code complexity
> has to be increased. A CPU profiler such as sysprof can show you
> improvements on the order of 1/10000th = 0.01% if you record enough
> samples. Sometimes you have to un-inline a function to make it visible
> there. If you see a function that takes 0.3% of CPU time and you
> optimize it down to 0.1% using the profiler as the measurement tool,
> you have evidence that the improvement is there and nobody can reject
> the idea anymore. It also proves that the code size increase is worth
> it. It's always "added code size and loss of simplicity" vs benefit.
> It's a transaction. You trade one for the other. You lose something to
> get something else. OK, we know the code complexity. Now, what's the
> benefit? Can you do some measurements? The accuracy of 1/10000th
> should be enough for anybody.
>
> I know the feeling when you spend many days working on something,
> adding 100s or 1000s of lines of code, solving many problems to get
> there and increasing code complexity significantly, and then you do
> the measurement and it doesn't improve anything. I know the feeling
> very well. It sucks. The frustration comes from the investment of time
> and getting no return on the investment. Many frustrations in life are
> like that.
>
> Marek
>
>
> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>> Hi Marek,
>>
>>
>> So do you agree that spinlock disables CPU preemption, contrary to your
>> original idea?
>>
>>
>> If you have new reason that this patch does not improve, please speak out.
>>
>>
>> Many patches in GPU driver aim at improving performance and power
>> efficiency. Does most patches submitted in AMDGPU requires a benchmarking
>> first?
>>
>> If all developers are required to always answer your questions when code
>> review, I am afraid that most open source community developers cannot meet
>> that requirement and stop working on AMDGPU.
>>
>>
>> To improve performance, there are many bottlenecks to clear. When the last
>> several bottlenecks are clear, the performance will show faster more
>> significantly.
>>
>> My past profiling experience told me that clearing a lock can improve
>> performance for some driver like 0.3% to much bigger percentage. It depends
>> on many factors, even depends on the application itself.
>>
>>
>> This is not the first bottleneck fixed. This is surely not the last one.
>>
>>
>> Thanks,
>>
>> Alex Bin
>>
>>
>>
>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>> That's all nice, but does it improve performance? Have you been able
>>> to measure some performance difference with that code? Were you
>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>> profiler?
>>>
>>> Marek
>>>
>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>>
>>>> Function __lock_acquire double checks that the local IRQ is really
>>>> disabled.
>>>>
>>>>
>>>>
>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>> Hi Marek,
>>>>>
>>>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>>>> atomic, which is a single CPU instruction in x86.
>>>>>
>>>>>
>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look
>>>>> at
>>>>> how spin lock was implemented.
>>>>>
>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>> {
>>>>>       preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>> memory barrier operation too.
>>>>>       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>       LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>> }
>>>>>
>>>>> 2.  A function  __lock_acquire called by spinlock. The function is so
>>>>> long
>>>>> that I would not attach all of it here.
>>>>>
>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>> statements and it calls quite some other functions.
>>>>>
>>>>> Note that it disables IRQ...
>>>>>
>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>> subclass,
>>>>>                 int trylock, int read, int check, int hardirqs_off,
>>>>>                 struct lockdep_map *nest_lock, unsigned long ip,
>>>>>                 int references, int pin_count)
>>>>> {
>>>>>       struct task_struct *curr = current;
>>>>>       struct lock_class *class = NULL;
>>>>>       struct held_lock *hlock;
>>>>>       unsigned int depth;
>>>>>       int chain_head = 0;
>>>>>       int class_idx;
>>>>>       u64 chain_key;
>>>>>
>>>>>       if (unlikely(!debug_locks))
>>>>>           return 0;
>>>>>
>>>>>       /*
>>>>>        * Lockdep should run with IRQs disabled, otherwise we could
>>>>>        * get an interrupt which would want to take locks, which would
>>>>>        * end up in lockdep and have you got a head-ache already?
>>>>>        */
>>>>>       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable
>>>>> IRQ
>>>>>           return 0;
>>>>>
>>>>> ....
>>>>>
>>>>> 3. Another function called by spinlock in a higher level:
>>>>>
>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>
>>>>>                 int trylock, int read, int check,
>>>>>                 struct lockdep_map *nest_lock, unsigned long ip)
>>>>> {
>>>>>       unsigned long flags;
>>>>>
>>>>>       if (unlikely(current->lockdep_recursion))
>>>>>           return;
>>>>>
>>>>>       raw_local_irq_save(flags);
>>>>>       check_flags(flags);
>>>>>
>>>>>       current->lockdep_recursion = 1;
>>>>>       trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock,
>>>>> ip);
>>>>>       __lock_acquire(lock, subclass, trylock, read, check,
>>>>>                  irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>       current->lockdep_recursion = 0;
>>>>>       raw_local_irq_restore(flags);
>>>>> }
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Alex Bin
>>>>>
>>>>>
>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>>> wrote:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>>
>>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>>> started, I thought it was easy but later I found there might be race
>>>>>>> condition here and there. Now I think the change looks more robust. In
>>>>>>> kernel source, there are several other drivers used the same trick.
>>>>>>>
>>>>>>>
>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>> considering
>>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>>> logic.
>>>>>> It seems quite complicated and I don't know if there is any
>>>>>> performance benefit. Spinlocks are nice because they allow preemption.
>>>>>>
>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls into
>>>>>> one.
>>>>>>
>>>>>> Marek
>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                 ` <CAAxE2A6Tt1JzLCDr4vM1iKASpWGFncqrXkL75==1Zd5wJq8xTA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-06-23 13:01                                   ` Christian König
@ 2017-06-23 13:45                                   ` axie
       [not found]                                     ` <94b20fb6-3f81-da3c-eb44-7d2e49ff5c8b-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: axie @ 2017-06-23 13:45 UTC (permalink / raw)
  To: Marek Olšák
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

Hi Marek,

I understand you spent time on your original logic too. I really don't 
understand why you talked about pain if somebody can improve it.

To reduce the pain, now I am seriously considering dropping this patch. 
But please read on before you conclude. Let us treat open source 
software development a fun.

Same trick like this patch could be found in open source Intel GPU 
driver and xfs.

Talking about code size. You will be surprised if you really calculate it.

For function amdgpu_cs_get_threshold_for_moves:
New code:  It is 2 more loops and 2 more ifs.
Old code:  2 spinlock inline function. spin_lock can expand to 5 
function calls and one if. One function call contains 7 parameters.
spin_unlock can expand to 4 function calls.

By the way, you can config Linux kernel to disable some spinlock macro 
expansion. But I don't think people really do that.

In function amdgpu_cs_report_moved_bytes
New code:  zero
Old code:  2 spinlock inline function.

In Total:

New code:  It is 2 more loops and 2 more ifs. Maybe there are one or two 
other tiny things.
Old code:  4 spinlock inline function. They are expanded to 18 function 
calls. Among them, two function calls each contain 7 parameters.

Please think about it. Are you still sure that the new code make code 
size bigger? Now what is the next problem of the new change?

I agree that it is more difficult to understand the new code. But if you 
get used to it. It is not so difficult indeed. Just one loop to retry. 
Human are much smarter than this logic.
Compared with how hardware engineers went to extreme to optimize logics 
and design with optimization from day 1, my tiny new logic is really 
nothing.

I said that removing a lock can improve 0.3% or even bigger for some 
driver. I did not say it was AMDGPU. My tiny improvement may not be so 
obvious in this big driver for the time being.

I will give you the privilege to make a final decision, for example, you 
can even delay it for future if you don't want to make a decision now. 
Please be happy.

Thanks,
Alex Bin Xie



On 2017-06-23 07:37 AM, Marek Olšák wrote:
> I agree with you about the spinlock. You seem to be good at this.
>
> It's always good to do measurements to validate that a code change
> improves something, especially when the code size and code complexity
> has to be increased. A CPU profiler such as sysprof can show you
> improvements on the order of 1/10000th = 0.01% if you record enough
> samples. Sometimes you have to un-inline a function to make it visible
> there. If you see a function that takes 0.3% of CPU time and you
> optimize it down to 0.1% using the profiler as the measurement tool,
> you have evidence that the improvement is there and nobody can reject
> the idea anymore. It also proves that the code size increase is worth
> it. It's always "added code size and loss of simplicity" vs benefit.
> It's a transaction. You trade one for the other. You lose something to
> get something else. OK, we know the code complexity. Now, what's the
> benefit? Can you do some measurements? The accuracy of 1/10000th
> should be enough for anybody.
>
> I know the feeling when you spend many days working on something,
> adding 100s or 1000s of lines of code, solving many problems to get
> there and increasing code complexity significantly, and then you do
> the measurement and it doesn't improve anything. I know the feeling
> very well. It sucks. The frustration comes from the investment of time
> and getting no return on the investment. Many frustrations in life are
> like that.
>
> Marek
>
>
> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>> Hi Marek,
>>
>>
>> So do you agree that spinlock disables CPU preemption, contrary to your
>> original idea?
>>
>>
>> If you have new reason that this patch does not improve, please speak out.
>>
>>
>> Many patches in GPU driver aim at improving performance and power
>> efficiency. Does most patches submitted in AMDGPU requires a benchmarking
>> first?
>>
>> If all developers are required to always answer your questions when code
>> review, I am afraid that most open source community developers cannot meet
>> that requirement and stop working on AMDGPU.
>>
>>
>> To improve performance, there are many bottlenecks to clear. When the last
>> several bottlenecks are clear, the performance will show faster more
>> significantly.
>>
>> My past profiling experience told me that clearing a lock can improve
>> performance for some driver like 0.3% to much bigger percentage. It depends
>> on many factors, even depends on the application itself.
>>
>>
>> This is not the first bottleneck fixed. This is surely not the last one.
>>
>>
>> Thanks,
>>
>> Alex Bin
>>
>>
>>
>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>> That's all nice, but does it improve performance? Have you been able
>>> to measure some performance difference with that code? Were you
>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>> profiler?
>>>
>>> Marek
>>>
>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>>
>>>> Function __lock_acquire double checks that the local IRQ is really
>>>> disabled.
>>>>
>>>>
>>>>
>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>> Hi Marek,
>>>>>
>>>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>>>> atomic, which is a single CPU instruction in x86.
>>>>>
>>>>>
>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look
>>>>> at
>>>>> how spin lock was implemented.
>>>>>
>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>> {
>>>>>       preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>> memory barrier operation too.
>>>>>       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>       LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>> }
>>>>>
>>>>> 2.  A function  __lock_acquire called by spinlock. The function is so
>>>>> long
>>>>> that I would not attach all of it here.
>>>>>
>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>> statements and it calls quite some other functions.
>>>>>
>>>>> Note that it disables IRQ...
>>>>>
>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>> subclass,
>>>>>                 int trylock, int read, int check, int hardirqs_off,
>>>>>                 struct lockdep_map *nest_lock, unsigned long ip,
>>>>>                 int references, int pin_count)
>>>>> {
>>>>>       struct task_struct *curr = current;
>>>>>       struct lock_class *class = NULL;
>>>>>       struct held_lock *hlock;
>>>>>       unsigned int depth;
>>>>>       int chain_head = 0;
>>>>>       int class_idx;
>>>>>       u64 chain_key;
>>>>>
>>>>>       if (unlikely(!debug_locks))
>>>>>           return 0;
>>>>>
>>>>>       /*
>>>>>        * Lockdep should run with IRQs disabled, otherwise we could
>>>>>        * get an interrupt which would want to take locks, which would
>>>>>        * end up in lockdep and have you got a head-ache already?
>>>>>        */
>>>>>       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) <<<<<<<<<<<<<<<Disable
>>>>> IRQ
>>>>>           return 0;
>>>>>
>>>>> ....
>>>>>
>>>>> 3. Another function called by spinlock in a higher level:
>>>>>
>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>
>>>>>                 int trylock, int read, int check,
>>>>>                 struct lockdep_map *nest_lock, unsigned long ip)
>>>>> {
>>>>>       unsigned long flags;
>>>>>
>>>>>       if (unlikely(current->lockdep_recursion))
>>>>>           return;
>>>>>
>>>>>       raw_local_irq_save(flags);
>>>>>       check_flags(flags);
>>>>>
>>>>>       current->lockdep_recursion = 1;
>>>>>       trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock,
>>>>> ip);
>>>>>       __lock_acquire(lock, subclass, trylock, read, check,
>>>>>                  irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>       current->lockdep_recursion = 0;
>>>>>       raw_local_irq_restore(flags);
>>>>> }
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Alex Bin
>>>>>
>>>>>
>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>>> wrote:
>>>>>>> Hi Christian,
>>>>>>>
>>>>>>>
>>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>>> started, I thought it was easy but later I found there might be race
>>>>>>> condition here and there. Now I think the change looks more robust. In
>>>>>>> kernel source, there are several other drivers used the same trick.
>>>>>>>
>>>>>>>
>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>> considering
>>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>>> logic.
>>>>>> It seems quite complicated and I don't know if there is any
>>>>>> performance benefit. Spinlocks are nice because they allow preemption.
>>>>>>
>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls into
>>>>>> one.
>>>>>>
>>>>>> Marek
>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                     ` <963f4d3d-dc46-2279-509f-b475c5ec94ee-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-06-23 13:55                                       ` axie
       [not found]                                         ` <3d4f3204-6893-3d68-23ea-b309ace33740-5C7GfCeVMHo@public.gmane.org>
  2017-06-24  0:27                                       ` Marek Olšák
  1 sibling, 1 reply; 37+ messages in thread
From: axie @ 2017-06-23 13:55 UTC (permalink / raw)
  To: Christian König, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

Hi Christian,


I agree with you. On the other hand, after you optimize the BO 
reservation lock, other locks still need optimization, right?


1. Locking itself is not cheap.

2. Waiting in lock is even more expensive.


Thanks,

Alex Bin Xie


On 2017-06-23 09:01 AM, Christian König wrote:
> The key point here is while optimizing this is nice the much bigger 
> pile is the locking done for each BO.
>
> In other words even when we optimize all the other locks involved into 
> atomics or RCU, the BO reservation lock will still dominate everything.
>
> One possible solution to this would be per process resources like I 
> suggested multiple times now.
>
> Christian.
>
> Am 23.06.2017 um 13:37 schrieb Marek Olšák:
>> I agree with you about the spinlock. You seem to be good at this.
>>
>> It's always good to do measurements to validate that a code change
>> improves something, especially when the code size and code complexity
>> has to be increased. A CPU profiler such as sysprof can show you
>> improvements on the order of 1/10000th = 0.01% if you record enough
>> samples. Sometimes you have to un-inline a function to make it visible
>> there. If you see a function that takes 0.3% of CPU time and you
>> optimize it down to 0.1% using the profiler as the measurement tool,
>> you have evidence that the improvement is there and nobody can reject
>> the idea anymore. It also proves that the code size increase is worth
>> it. It's always "added code size and loss of simplicity" vs benefit.
>> It's a transaction. You trade one for the other. You lose something to
>> get something else. OK, we know the code complexity. Now, what's the
>> benefit? Can you do some measurements? The accuracy of 1/10000th
>> should be enough for anybody.
>>
>> I know the feeling when you spend many days working on something,
>> adding 100s or 1000s of lines of code, solving many problems to get
>> there and increasing code complexity significantly, and then you do
>> the measurement and it doesn't improve anything. I know the feeling
>> very well. It sucks. The frustration comes from the investment of time
>> and getting no return on the investment. Many frustrations in life are
>> like that.
>>
>> Marek
>>
>>
>> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>>> Hi Marek,
>>>
>>>
>>> So do you agree that spinlock disables CPU preemption, contrary to your
>>> original idea?
>>>
>>>
>>> If you have a new reason why this patch does not improve things, please 
>>> speak out.
>>>
>>>
>>> Many patches in GPU driver aim at improving performance and power
>>> efficiency. Does most patches submitted in AMDGPU requires a 
>>> benchmarking
>>> first?
>>>
>>> If all developers are required to always answer your questions when 
>>> code
>>> review, I am afraid that most open source community developers 
>>> cannot meet
>>> that requirement and stop working on AMDGPU.
>>>
>>>
>>> To improve performance, there are many bottlenecks to clear. When 
>>> the last
>>> several bottlenecks are cleared, the performance will improve more
>>> significantly.
>>>
>>> My past profiling experience told me that clearing a lock can improve
>>> performance for some drivers by 0.3% up to a much bigger percentage. It 
>>> depends
>>> on many factors, even depends on the application itself.
>>>
>>>
>>> This is not the first bottleneck fixed. This is surely not the last 
>>> one.
>>>
>>>
>>> Thanks,
>>>
>>> Alex Bin
>>>
>>>
>>>
>>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>>> That's all nice, but does it improve performance? Have you been able
>>>> to measure some performance difference with that code? Were you
>>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>>> profiler?
>>>>
>>>> Marek
>>>>
>>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>>> To clarify, local IRQ is disabled by calling 
>>>>> raw_local_irq_save(flags);
>>>>>
>>>>> Function __lock_acquire double checks that the local IRQ is really
>>>>> disabled.
>>>>>
>>>>>
>>>>>
>>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>>> Hi Marek,
>>>>>>
>>>>>> Spin lock and spin unlock are fast. But they are not so fast compared 
>>>>>> with
>>>>>> atomic, which is a single CPU instruction in x86.
>>>>>>
>>>>>>
>>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a 
>>>>>> look
>>>>>> at
>>>>>> how spin lock was implemented.
>>>>>>
>>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>>> {
>>>>>>       preempt_disable(); 
>>>>>> <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>>> memory barrier operation too.
>>>>>>       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>>       LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>>> }
>>>>>>
>>>>>> 2.  A function  __lock_acquire called by spinlock. The function 
>>>>>> is so
>>>>>> long
>>>>>> that I would not attach all of it here.
>>>>>>
>>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>>> statements and it calls quite some other functions.
>>>>>>
>>>>>> Note that it disables IRQs...
>>>>>>
>>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>>> subclass,
>>>>>>                 int trylock, int read, int check, int hardirqs_off,
>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip,
>>>>>>                 int references, int pin_count)
>>>>>> {
>>>>>>       struct task_struct *curr = current;
>>>>>>       struct lock_class *class = NULL;
>>>>>>       struct held_lock *hlock;
>>>>>>       unsigned int depth;
>>>>>>       int chain_head = 0;
>>>>>>       int class_idx;
>>>>>>       u64 chain_key;
>>>>>>
>>>>>>       if (unlikely(!debug_locks))
>>>>>>           return 0;
>>>>>>
>>>>>>       /*
>>>>>>        * Lockdep should run with IRQs disabled, otherwise we could
>>>>>>        * get an interrupt which would want to take locks, which 
>>>>>> would
>>>>>>        * end up in lockdep and have you got a head-ache already?
>>>>>>        */
>>>>>>       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 
>>>>>> <<<<<<<<<<<<<<<Disable
>>>>>> IRQ
>>>>>>           return 0;
>>>>>>
>>>>>> ....
>>>>>>
>>>>>> 3. Another function called by spinlock in a higher level:
>>>>>>
>>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>>
>>>>>>                 int trylock, int read, int check,
>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip)
>>>>>> {
>>>>>>       unsigned long flags;
>>>>>>
>>>>>>       if (unlikely(current->lockdep_recursion))
>>>>>>           return;
>>>>>>
>>>>>>       raw_local_irq_save(flags);
>>>>>>       check_flags(flags);
>>>>>>
>>>>>>       current->lockdep_recursion = 1;
>>>>>>       trace_lock_acquire(lock, subclass, trylock, read, check, 
>>>>>> nest_lock,
>>>>>> ip);
>>>>>>       __lock_acquire(lock, subclass, trylock, read, check,
>>>>>>                  irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>>       current->lockdep_recursion = 0;
>>>>>>       raw_local_irq_restore(flags);
>>>>>> }
>>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Alex Bin
>>>>>>
>>>>>>
>>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>>>> wrote:
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>>
>>>>>>>> In fact, the change from spinlock to atomic is quite painful. 
>>>>>>>> When I
>>>>>>>> started, I thought it was easy but later I found there might be 
>>>>>>>> race
>>>>>>>> condition here and there. Now I think the change looks more 
>>>>>>>> robust. In
>>>>>>>> kernel source, there are several other drivers that used the same 
>>>>>>>> trick.
>>>>>>>>
>>>>>>>>
>>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>>> considering
>>>>>>>> the locking. But I had spent quite some effort to maintain 
>>>>>>>> original
>>>>>>>> logic.
>>>>>>> It seems quite complicated and I don't know if there is any
>>>>>>> performance benefit. Spinlocks are nice because they allow 
>>>>>>> preemption.
>>>>>>>
>>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls 
>>>>>>> into
>>>>>>> one.
>>>>>>>
>>>>>>> Marek
>>>>>>
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                         ` <3d4f3204-6893-3d68-23ea-b309ace33740-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-23 14:30                                           ` Christian König
  0 siblings, 0 replies; 37+ messages in thread
From: Christian König @ 2017-06-23 14:30 UTC (permalink / raw)
  To: axie, Marek Olšák
  Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

> On the other hand, after you optimize the BO reservation lock, other 
> locks still need optimization, right? 
In theory yes, in practice no.

There are just so many other things we should tackle before taking care of 
removing any locks that we will probably never get to that point even with 
more manpower.

Christian.

Am 23.06.2017 um 15:55 schrieb axie:
> Hi Christian,
>
>
> I agree with you. On the other hand, after you optimize the BO 
> reservation lock, other locks still need optimization, right?
>
>
> 1. Locking itself is not cheap.
>
> 2. Waiting in lock is even more expensive.
>
>
> Thanks,
>
> Alex Bin Xie
>
>
> On 2017-06-23 09:01 AM, Christian König wrote:
>> The key point here is while optimizing this is nice the much bigger 
>> pile is the locking done for each BO.
>>
>> In other words even when we optimize all the other locks involved 
>> into atomics or RCU, the BO reservation lock will still dominate 
>> everything.
>>
>> One possible solution to this would be per process resources like I 
>> suggested multiple times now.
>>
>> Christian.
>>
>> Am 23.06.2017 um 13:37 schrieb Marek Olšák:
>>> I agree with you about the spinlock. You seem to be good at this.
>>>
>>> It's always good to do measurements to validate that a code change
>>> improves something, especially when the code size and code complexity
>>> has to be increased. A CPU profiler such as sysprof can show you
>>> improvements on the order of 1/10000th = 0.01% if you record enough
>>> samples. Sometimes you have to un-inline a function to make it visible
>>> there. If you see a function that takes 0.3% of CPU time and you
>>> optimize it down to 0.1% using the profiler as the measurement tool,
>>> you have evidence that the improvement is there and nobody can reject
>>> the idea anymore. It also proves that the code size increase is worth
>>> it. It's always "added code size and loss of simplicity" vs benefit.
>>> It's a transaction. You trade one for the other. You lose something to
>>> get something else. OK, we know the code complexity. Now, what's the
>>> benefit? Can you do some measurements? The accuracy of 1/10000th
>>> should be enough for anybody.
>>>
>>> I know the feeling when you spend many days working on something,
>>> adding 100s or 1000s of lines of code, solving many problems to get
>>> there and increasing code complexity significantly, and then you do
>>> the measurement and it doesn't improve anything. I know the feeling
>>> very well. It sucks. The frustration comes from the investment of time
>>> and getting no return on the investment. Many frustrations in life are
>>> like that.
>>>
>>> Marek
>>>
>>>
>>> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>>>> Hi Marek,
>>>>
>>>>
>>>> So do you agree that spinlock disables CPU preemption, contrary to 
>>>> your
>>>> original idea?
>>>>
>>>>
>>>> If you have a new reason why this patch does not improve things, please 
>>>> speak out.
>>>>
>>>>
>>>> Many patches in GPU driver aim at improving performance and power
>>>> efficiency. Do most patches submitted to AMDGPU require 
>>>> benchmarking
>>>> first?
>>>>
>>>> If all developers are required to always answer your questions when 
>>>> code
>>>> review, I am afraid that most open source community developers 
>>>> cannot meet
>>>> that requirement and stop working on AMDGPU.
>>>>
>>>>
>>>> To improve performance, there are many bottlenecks to clear. When 
>>>> the last
>>>> several bottlenecks are cleared, the performance will improve more
>>>> significantly.
>>>>
>>>> My past profiling experience told me that clearing a lock can improve
>>>> performance for some drivers by 0.3% up to a much bigger percentage. It 
>>>> depends
>>>> on many factors, even depends on the application itself.
>>>>
>>>>
>>>> This is not the first bottleneck fixed. This is surely not the last 
>>>> one.
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex Bin
>>>>
>>>>
>>>>
>>>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>>>> That's all nice, but does it improve performance? Have you been able
>>>>> to measure some performance difference with that code? Were you
>>>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>>>> profiler?
>>>>>
>>>>> Marek
>>>>>
>>>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>>>> To clarify, local IRQ is disabled by calling 
>>>>>> raw_local_irq_save(flags);
>>>>>>
>>>>>> Function __lock_acquire double checks that the local IRQ is really
>>>>>> disabled.
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>>>> Hi Marek,
>>>>>>>
>>>>>>> Spin lock and spin unlock are fast. But they are not so fast 
>>>>>>> compared with
>>>>>>> atomic, which is a single CPU instruction in x86.
>>>>>>>
>>>>>>>
>>>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have 
>>>>>>> a look
>>>>>>> at
>>>>>>> how spin lock was implemented.
>>>>>>>
>>>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>>>> {
>>>>>>>       preempt_disable(); 
>>>>>>> <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>>>> memory barrier operation too.
>>>>>>>       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>>>       LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>>>> }
>>>>>>>
>>>>>>> 2.  A function  __lock_acquire called by spinlock. The function 
>>>>>>> is so
>>>>>>> long
>>>>>>> that I would not attach all of it here.
>>>>>>>
>>>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>>>> statements and it calls quite some other functions.
>>>>>>>
>>>>>>> Note that it disables IRQs...
>>>>>>>
>>>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>>>> subclass,
>>>>>>>                 int trylock, int read, int check, int hardirqs_off,
>>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip,
>>>>>>>                 int references, int pin_count)
>>>>>>> {
>>>>>>>       struct task_struct *curr = current;
>>>>>>>       struct lock_class *class = NULL;
>>>>>>>       struct held_lock *hlock;
>>>>>>>       unsigned int depth;
>>>>>>>       int chain_head = 0;
>>>>>>>       int class_idx;
>>>>>>>       u64 chain_key;
>>>>>>>
>>>>>>>       if (unlikely(!debug_locks))
>>>>>>>           return 0;
>>>>>>>
>>>>>>>       /*
>>>>>>>        * Lockdep should run with IRQs disabled, otherwise we could
>>>>>>>        * get an interrupt which would want to take locks, which 
>>>>>>> would
>>>>>>>        * end up in lockdep and have you got a head-ache already?
>>>>>>>        */
>>>>>>>       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 
>>>>>>> <<<<<<<<<<<<<<<Disable
>>>>>>> IRQ
>>>>>>>           return 0;
>>>>>>>
>>>>>>> ....
>>>>>>>
>>>>>>> 3. Another function called by spinlock in a higher level:
>>>>>>>
>>>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>>>
>>>>>>>                 int trylock, int read, int check,
>>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip)
>>>>>>> {
>>>>>>>       unsigned long flags;
>>>>>>>
>>>>>>>       if (unlikely(current->lockdep_recursion))
>>>>>>>           return;
>>>>>>>
>>>>>>>       raw_local_irq_save(flags);
>>>>>>>       check_flags(flags);
>>>>>>>
>>>>>>>       current->lockdep_recursion = 1;
>>>>>>>       trace_lock_acquire(lock, subclass, trylock, read, check, 
>>>>>>> nest_lock,
>>>>>>> ip);
>>>>>>>       __lock_acquire(lock, subclass, trylock, read, check,
>>>>>>>                  irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>>>       current->lockdep_recursion = 0;
>>>>>>>       raw_local_irq_restore(flags);
>>>>>>> }
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Alex Bin
>>>>>>>
>>>>>>>
>>>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin 
>>>>>>>> <AlexBin.Xie@amd.com>
>>>>>>>> wrote:
>>>>>>>>> Hi Christian,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> In fact, the change from spinlock to atomic is quite painful. 
>>>>>>>>> When I
>>>>>>>>> started, I thought it was easy but later I found there might 
>>>>>>>>> be race
>>>>>>>>> condition here and there. Now I think the change looks more 
>>>>>>>>> robust. In
>>>>>>>>> kernel source, there are several other drivers that used the same 
>>>>>>>>> trick.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>>>> considering
>>>>>>>>> the locking. But I had spent quite some effort to maintain 
>>>>>>>>> original
>>>>>>>>> logic.
>>>>>>>> It seems quite complicated and I don't know if there is any
>>>>>>>> performance benefit. Spinlocks are nice because they allow 
>>>>>>>> preemption.
>>>>>>>>
>>>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls 
>>>>>>>> into
>>>>>>>> one.
>>>>>>>>
>>>>>>>> Marek
>>>>>>>
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                     ` <963f4d3d-dc46-2279-509f-b475c5ec94ee-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-06-23 13:55                                       ` axie
@ 2017-06-24  0:27                                       ` Marek Olšák
       [not found]                                         ` <CAAxE2A7kFusP3=-FcVWdSnGRvaFwL8kZ17BH4uKJ3z044S9XFQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-24  0:27 UTC (permalink / raw)
  To: Christian König
  Cc: axie, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

On Fri, Jun 23, 2017 at 3:01 PM, Christian König
<deathsimple@vodafone.de> wrote:
> The key point here is while optimizing this is nice the much bigger pile is
> the locking done for each BO.
>
> In other words even when we optimize all the other locks involved into
> atomics or RCU, the BO reservation lock will still dominate everything.
>
> One possible solution to this would be per process resources like I
> suggested multiple times now.

Mesa can set a per-process resource flag on all resources except
displayable ones. The question is, would it help if an IB contained
1000 per-process resources and 1-2 inter-process sharable?

Marek

>
> Christian.
>
>
> Am 23.06.2017 um 13:37 schrieb Marek Olšák:
>>
>> I agree with you about the spinlock. You seem to be good at this.
>>
>> It's always good to do measurements to validate that a code change
>> improves something, especially when the code size and code complexity
>> has to be increased. A CPU profiler such as sysprof can show you
>> improvements on the order of 1/10000th = 0.01% if you record enough
>> samples. Sometimes you have to un-inline a function to make it visible
>> there. If you see a function that takes 0.3% of CPU time and you
>> optimize it down to 0.1% using the profiler as the measurement tool,
>> you have evidence that the improvement is there and nobody can reject
>> the idea anymore. It also proves that the code size increase is worth
>> it. It's always "added code size and loss of simplicity" vs benefit.
>> It's a transaction. You trade one for the other. You lose something to
>> get something else. OK, we know the code complexity. Now, what's the
>> benefit? Can you do some measurements? The accuracy of 1/10000th
>> should be enough for anybody.
>>
>> I know the feeling when you spend many days working on something,
>> adding 100s or 1000s of lines of code, solving many problems to get
>> there and increasing code complexity significantly, and then you do
>> the measurement and it doesn't improve anything. I know the feeling
>> very well. It sucks. The frustration comes from the investment of time
>> and getting no return on the investment. Many frustrations in life are
>> like that.
>>
>> Marek
>>
>>
>> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>>>
>>> Hi Marek,
>>>
>>>
>>> So do you agree that spinlock disables CPU preemption, contrary to your
>>> original idea?
>>>
>>>
>>> If you have a new reason why this patch does not improve things, please
>>> speak out.
>>>
>>>
>>> Many patches in GPU driver aim at improving performance and power
>>> efficiency. Do most patches submitted to AMDGPU require benchmarking
>>> first?
>>>
>>> If all developers are required to always answer your questions when code
>>> review, I am afraid that most open source community developers cannot
>>> meet
>>> that requirement and stop working on AMDGPU.
>>>
>>>
>>> To improve performance, there are many bottlenecks to clear. When the
>>> last
>>> several bottlenecks are cleared, the performance will improve more
>>> significantly.
>>>
>>> My past profiling experience told me that clearing a lock can improve
>>> performance for some drivers by 0.3% up to a much bigger percentage. It
>>> depends
>>> on many factors, even depends on the application itself.
>>>
>>>
>>> This is not the first bottleneck fixed. This is surely not the last one.
>>>
>>>
>>> Thanks,
>>>
>>> Alex Bin
>>>
>>>
>>>
>>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>>>
>>>> That's all nice, but does it improve performance? Have you been able
>>>> to measure some performance difference with that code? Were you
>>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>>> profiler?
>>>>
>>>> Marek
>>>>
>>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>>>
>>>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>>>
>>>>> Function __lock_acquire double checks that the local IRQ is really
>>>>> disabled.
>>>>>
>>>>>
>>>>>
>>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>>>
>>>>>> Hi Marek,
>>>>>>
>>>>>> Spin lock and spin unlock are fast. But they are not so fast compared with
>>>>>> atomic, which is a single CPU instruction in x86.
>>>>>>
>>>>>>
>>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look
>>>>>> at
>>>>>> how spin lock was implemented.
>>>>>>
>>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>>> {
>>>>>>       preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>>> memory barrier operation too.
>>>>>>       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>>       LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>>> }
>>>>>>
>>>>>> 2.  A function  __lock_acquire called by spinlock. The function is so
>>>>>> long
>>>>>> that I would not attach all of it here.
>>>>>>
>>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>>> statements and it calls quite some other functions.
>>>>>>
>>>>>> Note that it disables IRQs...
>>>>>>
>>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>>> subclass,
>>>>>>                 int trylock, int read, int check, int hardirqs_off,
>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip,
>>>>>>                 int references, int pin_count)
>>>>>> {
>>>>>>       struct task_struct *curr = current;
>>>>>>       struct lock_class *class = NULL;
>>>>>>       struct held_lock *hlock;
>>>>>>       unsigned int depth;
>>>>>>       int chain_head = 0;
>>>>>>       int class_idx;
>>>>>>       u64 chain_key;
>>>>>>
>>>>>>       if (unlikely(!debug_locks))
>>>>>>           return 0;
>>>>>>
>>>>>>       /*
>>>>>>        * Lockdep should run with IRQs disabled, otherwise we could
>>>>>>        * get an interrupt which would want to take locks, which would
>>>>>>        * end up in lockdep and have you got a head-ache already?
>>>>>>        */
>>>>>>       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
>>>>>> <<<<<<<<<<<<<<<Disable
>>>>>> IRQ
>>>>>>           return 0;
>>>>>>
>>>>>> ....
>>>>>>
>>>>>> 3. Another function called by spinlock in a higher level:
>>>>>>
>>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>>
>>>>>>                 int trylock, int read, int check,
>>>>>>                 struct lockdep_map *nest_lock, unsigned long ip)
>>>>>> {
>>>>>>       unsigned long flags;
>>>>>>
>>>>>>       if (unlikely(current->lockdep_recursion))
>>>>>>           return;
>>>>>>
>>>>>>       raw_local_irq_save(flags);
>>>>>>       check_flags(flags);
>>>>>>
>>>>>>       current->lockdep_recursion = 1;
>>>>>>       trace_lock_acquire(lock, subclass, trylock, read, check,
>>>>>> nest_lock,
>>>>>> ip);
>>>>>>       __lock_acquire(lock, subclass, trylock, read, check,
>>>>>>                  irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>>       current->lockdep_recursion = 0;
>>>>>>       raw_local_irq_restore(flags);
>>>>>> }
>>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Alex Bin
>>>>>>
>>>>>>
>>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>>>
>>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> Hi Christian,
>>>>>>>>
>>>>>>>>
>>>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>>>> started, I thought it was easy but later I found there might be race
>>>>>>>> condition here and there. Now I think the change looks more robust.
>>>>>>>> In
>>>>>>>> kernel source, there are several other drivers that used the same trick.
>>>>>>>>
>>>>>>>>
>>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>>> considering
>>>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>>>> logic.
>>>>>>>
>>>>>>> It seems quite complicated and I don't know if there is any
>>>>>>> performance benefit. Spinlocks are nice because they allow
>>>>>>> preemption.
>>>>>>>
>>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls into
>>>>>>> one.
>>>>>>>
>>>>>>> Marek
>>>>>>
>>>>>>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                             ` <MWHPR1201MB02063CAC83DEEAB81EFA9A10B4D80-3iK1xFAIwjrUF/YbdlDdgWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-06-24  0:29                                                               ` Marek Olšák
       [not found]                                                                 ` <CAAxE2A7jmhxC8SctSSxXhc7XRwSgzmPOOF5HmmtAPL7aYKE+Jg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Marek Olšák @ 2017-06-24  0:29 UTC (permalink / raw)
  To: Zhou, David(ChunMing)
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

On Fri, Jun 23, 2017 at 1:55 PM, Zhou, David(ChunMing)
<David1.Zhou@amd.com> wrote:
>
> ________________________________________
> From: Marek Olšák [maraeo@gmail.com]
> Sent: Friday, June 23, 2017 6:49 PM
> To: Christian König
> Cc: Zhou, David(ChunMing); Xie, AlexBin; amd-gfx@lists.freedesktop.org; Xie, AlexBin
> Subject: Re: [PATCH 1/3] drm/amdgpu: fix a typo
>
> On Fri, Jun 23, 2017 at 11:27 AM, Christian König
> <deathsimple@vodafone.de> wrote:
>> Am 23.06.2017 um 11:08 schrieb zhoucm1:
>>>
>>>
>>>
>>> On 2017年06月23日 17:01, zhoucm1 wrote:
>>>>
>>>>
>>>>
>>>> On 2017年06月23日 16:25, Christian König wrote:
>>>>>
>>>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>>>>
>>>>>>> But giving the CS IOCTL an option for directly specifying the BOs
>>>>>>> instead of a BO list like Marek suggested would indeed save us some time
>>>>>>> here.
>>>>>>
>>>>>> interesting, I always follow how to improve our cs ioctl, since UMD
>>>>>> guys aften complain our command submission is slower than windows.
>>>>>> Then how to directly specifying the BOs instead of a BO list? BO handle
>>>>>> array from UMD? Could your guys describe more clear? Is it doable?
>>>>>
>>>>>
>>>>> Making the BO list part of the CS IOCTL wouldn't help at all for the
>>>>> close source UMDs. To be precise we actually came up with the BO list
>>>>> approach because of their requirement.
>>>>>
>>>>> The biggest bunch of work during CS is reserving all the buffers,
>>>>> validating them and checking their VM status.
>>>>
>>>> Totally agree. Every time when I read code there, I often want to
>>>> optimize them.
>>>>
>>>>> It doesn't matter if the BOs come from the BO list or directly in the CS
>>>>> IOCTL.
>>>>>
>>>>> The key point is that CS overhead is pretty much irrelevant for the open
>>>>> source stack, since Mesa does command submission from a separate thread
>>>>> anyway.
>>>>
>>>> If irrelevant for the open stack, then how does open source stack handle
>>>> "The biggest bunch of work during CS is reserving all the buffers,
>>>> validating them and checking their VM status."?
>>
>>
>> Command submission on the open stack is outsourced to a separate user space
>> thread. E.g. when an application triggers a flush the IBs created so far are
>> just put on a queue and another thread pushes them down to the kernel.
>>
>> I mean reducing the overhead of the CS IOCTL is always nice, but you usual
>> won't see any fps increase as long as not all CPUs are completely bound to
>> some tasks.
>>
>>>> If open stack has a better way, I think closed stack can follow it, I
>>>> don't know the history.
>>>
>>> Do you not use bo list at all in mesa? radv as well?
>>
>>
>> I don't think so. Mesa just wants to send the list of used BOs down to the
>> kernel with every IOCTL.
>
> The CS ioctl actually costs us some performance, but not as much as on
> closed source drivers.
>
> MesaGL always executes all CS ioctls in a separate thread (in parallel
> with the UMD) except for the last IB that's submitted by SwapBuffers.
> SwapBuffers requires that all IBs have been submitted when SwapBuffers
> returns. For example, if you have 5 IBs per frame, 4 of them are
> executed on the thread and the overhead is hidden. The last one is
> executed on the thread too, but this time the Mesa driver has to wait
> for it. For things like glxgears with only 1 IB per frame, the thread
> doesn't hide anything and Mesa always has to wait for it after
> submission, just because of SwapBuffers.
>
> Having 10 or more IBs per frame is great, because 9 are done in
> parallel and the last one is synchronous. The final CPU cost is 10x
> lower, but it's not zero.
> [DZ] Thanks Marek, this is very useful and helpful message for me to understand Mesa action of CS, I will talk to closed guys to see if it can be used for them.
> Another thing I also want to confirm with you: do you know if radv is using this CS way?
>
> For us, it's certainly useful to optimize the CS ioctl because of apps
> that submit only 1 IB per frame where multithreading has no effect or
> may even hurt performance.
>
> The most obvious inefficiency is the BO_LIST ioctl that is completely
> unnecessary and only slows us down. What we need is exactly what
> radeon does.
>
> [DZ] I don't know how radeon handle bo list, could you describe it as well?

Inputs for the following ioctls are:

AMDGPU: BO_LIST:
- list of BOs

AMDGPU: CS
- list of IBs
- BO list handle

RADEON: CS
- one IB
- list of BOs

Ideal solution for a new amdgpu CS ioctl:
- list of IBs
- list of BOs

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                     ` <94b20fb6-3f81-da3c-eb44-7d2e49ff5c8b-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-24  0:46                                       ` Marek Olšák
  0 siblings, 0 replies; 37+ messages in thread
From: Marek Olšák @ 2017-06-24  0:46 UTC (permalink / raw)
  To: axie
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

On Fri, Jun 23, 2017 at 3:45 PM, axie <axie@amd.com> wrote:
> Hi Marek,
>
> I understand you spent time on your original logic too. I really don't
> understand why you talked about pain if somebody can improve it.
>
> To reduce the pain, now I am seriously considering dropping this patch. But
> please read on before you conclude. Let us treat open source software
> development as fun.
>
> Same trick like this patch could be found in open source Intel GPU driver
> and xfs.
>
> Talking about code size. You will be surprised if you really calculate it.
>
> For function amdgpu_cs_get_threshold_for_moves:
> New code:  It is 2 more loops and 2 more ifs.
> Old code:  2 spinlock inline function. spin_lock can expand to 5 function
> calls and one if. One function call contains 7 parameters.
> spin_unlock can expand to 4 function calls.
>
> By the way, you can config Linux kernel to disable some spinlock macro
> expansion. But I don't think people really do that.
>
> In function amdgpu_cs_report_moved_bytes
> New code:  zero
> Old code:  2 spinlock inline function.
>
> In Total:
>
> New code:  It is 2 more loops and 2 more ifs. Maybe there are one or two
> other tiny things.
> Old code:  4 spinlock inline function. They are expanded to 18 function
> calls. Among them, two function calls each contain 7 parameters.
>
> Please think about it. Are you still sure that the new code make code size
> bigger? Now what is the next problem of the new change?

The code size means the size of source code, not binary. The fewer
lines of amdgpu code that we need to get the job done, the better.

The next issue is the risk of breaking this already hard-to-test code.

Your logic would be OK if there was a measurable benefit even with the
silliest microbenchmark you can find (and even if it were as low as
0.2% improvement). Without that, I can't accept it. Sorry.

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                         ` <CAAxE2A7kFusP3=-FcVWdSnGRvaFwL8kZ17BH4uKJ3z044S9XFQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-24 18:27                                           ` Christian König
  0 siblings, 0 replies; 37+ messages in thread
From: Christian König @ 2017-06-24 18:27 UTC (permalink / raw)
  To: Marek Olšák
  Cc: axie, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

Am 24.06.2017 um 02:27 schrieb Marek Olšák:
> On Fri, Jun 23, 2017 at 3:01 PM, Christian König
> <deathsimple@vodafone.de> wrote:
>> The key point here is while optimizing this is nice the much bigger pile is
>> the locking done for each BO.
>>
>> In other words even when we optimize all the other locks involved into
>> atomics or RCU, the BO reservation lock will still dominate everything.
>>
>> One possible solution to this would be per process resources like I
>> suggested multiple times now.
> Mesa can set a per-process resource flag on all resources except
> displayable ones. The question is, would it help if an IB contained
> 1000 per-process resources and 1-2 inter-process sharable?

Yeah, absolutely.

See the 1000 per-process shareable resources only need to be revalidated 
when one of them was evicted. This would easily reduce the per BO CS 
overhead to 1/1000 of what it is now.

The crux is that you can't share those resources with other process and 
since you don't specify them on a per CS basis swapping things in and 
out on per CS basis like we do for all the Unigine benchmarks won't work 
any more.

Everything would be statically allocated, that might not be ideal either.

Regards,
Christian.

>
> Marek
>
>> Christian.
>>
>>
>> Am 23.06.2017 um 13:37 schrieb Marek Olšák:
>>> I agree with you about the spinlock. You seem to be good at this.
>>>
>>> It's always good to do measurements to validate that a code change
>>> improves something, especially when the code size and code complexity
>>> has to be increased. A CPU profiler such as sysprof can show you
>>> improvements on the order of 1/10000th = 0.01% if you record enough
>>> samples. Sometimes you have to un-inline a function to make it visible
>>> there. If you see a function that takes 0.3% of CPU time and you
>>> optimize it down to 0.1% using the profiler as the measurement tool,
>>> you have evidence that the improvement is there and nobody can reject
>>> the idea anymore. It also proves that the code size increase is worth
>>> it. It's always "added code size and loss of simplicity" vs benefit.
>>> It's a transaction. You trade one for the other. You lose something to
>>> get something else. OK, we know the code complexity. Now, what's the
>>> benefit? Can you do some measurements? The accuracy of 1/10000th
>>> should be enough for anybody.
>>>
>>> I know the feeling when you spend many days working on something,
>>> adding 100s or 1000s of lines of code, solving many problems to get
>>> there and increasing code complexity significantly, and then you do
>>> the measurement and it doesn't improve anything. I know the feeling
>>> very well. It sucks. The frustration comes from the investment of time
>>> and getting no return on the investment. Many frustrations in life are
>>> like that.
>>>
>>> Marek
>>>
>>>
>>> On Fri, Jun 23, 2017 at 4:23 AM, axie <axie@amd.com> wrote:
>>>> Hi Marek,
>>>>
>>>>
>>>> So do you agree that spinlock disables CPU preemption, contrary to your
>>>> original idea?
>>>>
>>>>
>>>> If you have new reason that this patch does not improve, please speak
>>>> out.
>>>>
>>>>
>>>> Many patches in GPU driver aim at improving performance and power
>>>> efficiency. Does most patches submitted in AMDGPU requires a benchmarking
>>>> first?
>>>>
>>>> If all developers are required to always answer your questions when code
>>>> review, I am afraid that most open source community developers cannot
>>>> meet
>>>> that requirement and stop working on AMDGPU.
>>>>
>>>>
>>>> To improve performance, there are many bottlenecks to clear. When the
>>>> last
>>>> several bottlenecks are clear, the performance will show faster more
>>>> significantly.
>>>>
>>>> My past profiling experience told me that clearing a lock can improve
>>>> performance for some driver like 0.3% to much bigger percentage. It
>>>> depends
>>>> on many factors, even depends on the application itself.
>>>>
>>>>
>>>> This is not the first bottleneck fixed. This is surely not the last one.
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Alex Bin
>>>>
>>>>
>>>>
>>>> On 2017-06-22 07:54 PM, Marek Olšák wrote:
>>>>> That's all nice, but does it improve performance? Have you been able
>>>>> to measure some performance difference with that code? Were you
>>>>> targeting a specific inefficiency you had seen e.g. with a CPU
>>>>> profiler?
>>>>>
>>>>> Marek
>>>>>
>>>>> On Thu, Jun 22, 2017 at 8:19 PM, axie <axie@amd.com> wrote:
>>>>>> To clarify, local IRQ is disabled by calling raw_local_irq_save(flags);
>>>>>>
>>>>>> Function __lock_acquire double checks that the local IRQ is really
>>>>>> disabled.
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2017-06-22 01:34 PM, axie wrote:
>>>>>>> Hi Marek,
>>>>>>>
>>>>>>> Spin lock and spin unlock is fast. But it is not so fast compared with
>>>>>>> atomic, which is a single CPU instruction in x86.
>>>>>>>
>>>>>>>
>>>>>>> 1. spinlock does NOT allow preemption at local CPU. Let us have a look
>>>>>>> at
>>>>>>> how spin lock was implemented.
>>>>>>>
>>>>>>> static inline void __raw_spin_lock(raw_spinlock_t *lock)
>>>>>>> {
>>>>>>>        preempt_disable(); <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<--This is
>>>>>>> memory barrier operation too.
>>>>>>>        spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
>>>>>>>        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
>>>>>>> }
>>>>>>>
>>>>>>> 2.  A function  __lock_acquire called by spinlock. The function is so
>>>>>>> long
>>>>>>> that I would not attach all of it here.
>>>>>>>
>>>>>>> There is atomic operation inside and 12 meta data updates and 14 if
>>>>>>> statements and it calls quite some other functions.
>>>>>>>
>>>>>>> Note that it disable IRQ...
>>>>>>>
>>>>>>> static int __lock_acquire(struct lockdep_map *lock, unsigned int
>>>>>>> subclass,
>>>>>>>                  int trylock, int read, int check, int hardirqs_off,
>>>>>>>                  struct lockdep_map *nest_lock, unsigned long ip,
>>>>>>>                  int references, int pin_count)
>>>>>>> {
>>>>>>>        struct task_struct *curr = current;
>>>>>>>        struct lock_class *class = NULL;
>>>>>>>        struct held_lock *hlock;
>>>>>>>        unsigned int depth;
>>>>>>>        int chain_head = 0;
>>>>>>>        int class_idx;
>>>>>>>        u64 chain_key;
>>>>>>>
>>>>>>>        if (unlikely(!debug_locks))
>>>>>>>            return 0;
>>>>>>>
>>>>>>>        /*
>>>>>>>         * Lockdep should run with IRQs disabled, otherwise we could
>>>>>>>         * get an interrupt which would want to take locks, which would
>>>>>>>         * end up in lockdep and have you got a head-ache already?
>>>>>>>         */
>>>>>>>        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
>>>>>>> <<<<<<<<<<<<<<<Disable
>>>>>>> IRQ
>>>>>>>            return 0;
>>>>>>>
>>>>>>> ....
>>>>>>>
>>>>>>> 3. Another function called by spinlock in a higher level:
>>>>>>>
>>>>>>> void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>>>>>>>
>>>>>>>                  int trylock, int read, int check,
>>>>>>>                  struct lockdep_map *nest_lock, unsigned long ip)
>>>>>>> {
>>>>>>>        unsigned long flags;
>>>>>>>
>>>>>>>        if (unlikely(current->lockdep_recursion))
>>>>>>>            return;
>>>>>>>
>>>>>>>        raw_local_irq_save(flags);
>>>>>>>        check_flags(flags);
>>>>>>>
>>>>>>>        current->lockdep_recursion = 1;
>>>>>>>        trace_lock_acquire(lock, subclass, trylock, read, check,
>>>>>>> nest_lock,
>>>>>>> ip);
>>>>>>>        __lock_acquire(lock, subclass, trylock, read, check,
>>>>>>>                   irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
>>>>>>>        current->lockdep_recursion = 0;
>>>>>>>        raw_local_irq_restore(flags);
>>>>>>> }
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Alex Bin
>>>>>>>
>>>>>>>
>>>>>>> On 2017-06-22 12:27 PM, Marek Olšák wrote:
>>>>>>>> On Thu, Jun 22, 2017 at 5:33 PM, Xie, AlexBin <AlexBin.Xie@amd.com>
>>>>>>>> wrote:
>>>>>>>>> Hi Christian,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> In fact, the change from spinlock to atomic is quite painful. When I
>>>>>>>>> started, I thought it was easy but later I found there might be race
>>>>>>>>> condition here and there. Now I think the change looks more robust.
>>>>>>>>> In
>>>>>>>>> kernel source, there are several other drivers used the same trick.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On the other hand, I think the logic itself might be optimized
>>>>>>>>> considering
>>>>>>>>> the locking. But I had spent quite some effort to maintain original
>>>>>>>>> logic.
>>>>>>>> It seems quite complicated and I don't know if there is any
>>>>>>>> performance benefit. Spinlocks are nice because they allow
>>>>>>>> preemption.
>>>>>>>>
>>>>>>>> It would be more interesting to merge the CS and BO_LIST ioctls into
>>>>>>>> one.
>>>>>>>>
>>>>>>>> Marek
>>>>>>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                                 ` <CAAxE2A7jmhxC8SctSSxXhc7XRwSgzmPOOF5HmmtAPL7aYKE+Jg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-24 20:49                                                                   ` Marek Olšák
  0 siblings, 0 replies; 37+ messages in thread
From: Marek Olšák @ 2017-06-24 20:49 UTC (permalink / raw)
  To: Zhou, David(ChunMing)
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Xie, AlexBin

On Sat, Jun 24, 2017 at 2:29 AM, Marek Olšák <maraeo@gmail.com> wrote:
> On Fri, Jun 23, 2017 at 1:55 PM, Zhou, David(ChunMing)
> <David1.Zhou@amd.com> wrote:
>>
>> ________________________________________
>> From: Marek Olšák [maraeo@gmail.com]
>> Sent: Friday, June 23, 2017 6:49 PM
>> To: Christian König
>> Cc: Zhou, David(ChunMing); Xie, AlexBin; amd-gfx@lists.freedesktop.org; Xie, AlexBin
>> Subject: Re: [PATCH 1/3] drm/amdgpu: fix a typo
>>
>> On Fri, Jun 23, 2017 at 11:27 AM, Christian König
>> <deathsimple@vodafone.de> wrote:
>>> Am 23.06.2017 um 11:08 schrieb zhoucm1:
>>>>
>>>>
>>>>
>>>> On 2017年06月23日 17:01, zhoucm1 wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 2017年06月23日 16:25, Christian König wrote:
>>>>>>
>>>>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>>>>>
>>>>>>>> But giving the CS IOCTL an option for directly specifying the BOs
>>>>>>>> instead of a BO list like Marek suggested would indeed save us some time
>>>>>>>> here.
>>>>>>>
>>>>>>> interesting, I always follow how to improve our cs ioctl, since UMD
>>>>>>> guys often complain our command submission is slower than windows.
>>>>>>> Then how to directly specifying the BOs instead of a BO list? BO handle
>>>>>>> array from UMD? Could your guys describe more clear? Is it doable?
>>>>>>
>>>>>>
>>>>>> Making the BO list part of the CS IOCTL wouldn't help at all for the
>>>>>> close source UMDs. To be precise we actually came up with the BO list
>>>>>> approach because of their requirement.
>>>>>>
>>>>>> The biggest bunch of work during CS is reserving all the buffers,
>>>>>> validating them and checking their VM status.
>>>>>
>>>>> Totally agree. Every time when I read code there, I often want to
>>>>> optimize them.
>>>>>
>>>>>> It doesn't matter if the BOs come from the BO list or directly in the CS
>>>>>> IOCTL.
>>>>>>
>>>>>> The key point is that CS overhead is pretty much irrelevant for the open
>>>>>> source stack, since Mesa does command submission from a separate thread
>>>>>> anyway.
>>>>>
>>>>> If irrelevant for the open stack, then how does open source stack handle
>>>>> "The biggest bunch of work during CS is reserving all the buffers,
>>>>> validating them and checking their VM status."?
>>>
>>>
>>> Command submission on the open stack is outsourced to a separate user space
>>> thread. E.g. when an application triggers a flush the IBs created so far are
>>> just put on a queue and another thread pushes them down to the kernel.
>>>
>>> I mean reducing the overhead of the CS IOCTL is always nice, but you usually
>>> won't see any fps increase as long as not all CPUs are completely bound to
>>> some tasks.
>>>
>>>>> If open stack has a better way, I think closed stack can follow it, I
>>>>> don't know the history.
>>>>
>>>> Do you not use bo list at all in mesa? radv as well?
>>>
>>>
>>> I don't think so. Mesa just wants to send the list of used BOs down to the
>>> kernel with every IOCTL.
>>
>> The CS ioctl actually costs us some performance, but not as much as on
>> closed source drivers.
>>
>> MesaGL always executes all CS ioctls in a separate thread (in parallel
>> with the UMD) except for the last IB that's submitted by SwapBuffers.
>> SwapBuffers requires that all IBs have been submitted when SwapBuffers
>> returns. For example, if you have 5 IBs per frame, 4 of them are
>> executed on the thread and the overhead is hidden. The last one is
>> executed on the thread too, but this time the Mesa driver has to wait
>> for it. For things like glxgears with only 1 IB per frame, the thread
>> doesn't hide anything and Mesa always has to wait for it after
>> submission, just because of SwapBuffers.
>>
>> Having 10 or more IBs per frame is great, because 9 are done in
>> parallel and the last one is synchronous. The final CPU cost is 10x
>> lower, but it's not zero.
>> [DZ] Thanks Marek, this is very useful and helpful message for me to understand Mesa action of CS, I will talk to closed guys to see if it can be used for them.
>> Another thing I also want to confirm with you: do you know if radv is using this cs way?
>>
>> For us, it's certainly useful to optimize the CS ioctl because of apps
>> that submit only 1 IB per frame where multithreading has no effect or
>> may even hurt performance.
>>
>> The most obvious inefficiency is the BO_LIST ioctl that is completely
>> unnecessary and only slows us down. What we need is exactly what
>> radeon does.
>>
>> [DZ] I don't know how radeon handle bo list, could you describe it as well?
>
> Inputs for the following ioctls are:
>
> AMDGPU: BO_LIST:
> - list of BOs
>
> AMDGPU: CS
> - list of IBs
> - BO list handle
>
> RADEON: CS
> - one IB
> - list of BOs
>
> Ideal solution for a new amdgpu CS ioctl:
> - list of IBs
> - list of BOs

I'd like to say that the current CS ioctl design is only a half of the
problem with slow command submission. The second half is the libdrm
overhead itself. Having wrapper functions around ioctls that have to
unwrap input objects into alloca'd memory just for the ioctl to be
called is simply wasted CPU time. There are cases where libdrm is
useful. Command submission is not one of them. Any driver developer or
vendor putting CS wrappers into libdrm is putting himself into a
losing position. It's a perpetuated community myth that libdrm should
have wrappers for everything.

Our winsys in Mesa is designed such that it can call the CS ioctl
right when it's requested. The whole CS ioctl input structure is
always ready at any point in time, because it's updated incrementally
while draw calls are made. The radeon winsys works that way. The
amdgpu winsys works that way too, but there is another translation of
inputs in libdrm that defeats it. Intel also publicly admitted that
putting CS wrappers into libdrm was stupid.

The solution for the best performance is to call the CS ioctl directly
from UMDs, i.e. Mesa and Vulkan should call drmCommandWriteRead(fd,
DRM_AMDGPU_CS2, ...) directly.

Until that's done, the command submission for the radeon kernel driver
will remain faster than amdgpu.

Marek
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                 ` <594CDA92.1060809-5C7GfCeVMHo@public.gmane.org>
  2017-06-23  9:27                                                   ` Christian König
@ 2017-06-25 19:48                                                   ` Dave Airlie
       [not found]                                                     ` <CAPM=9tzXrih9qpkOY+OGC1TF8nD0ZmwNnxXiRbMtcO7tR1-m1Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 37+ messages in thread
From: Dave Airlie @ 2017-06-25 19:48 UTC (permalink / raw)
  To: zhoucm1, Bas Nieuwenhuizen
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	axie, Marek Olšák, Xie, AlexBin

> Do you not use bo list at all in mesa? radv as well?

Currently radv is creating a bo list per command submission. radv does
not use an offload thread to do command submission, as it seems pretty
un-vulkan to use a thread for the queue submission thread the game
uses.

I have considered investigating this, but with vulkan it's probably
optimising for the single threaded case which isn't where apps should
really be.

At the moment we regenerate the bo list on every CS ioctl, we probably
can do a lot better, I know Bas has looked into this area a bit more
than I.

Dave.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                         ` <CAAxE2A4vdV+QD55RAn+mrD92o39X_vPT6wLKCwkREx2=fpU2Vw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-06-23 11:55                                                           ` Zhou, David(ChunMing)
@ 2017-06-26  9:06                                                           ` Michel Dänzer
  1 sibling, 0 replies; 37+ messages in thread
From: Michel Dänzer @ 2017-06-26  9:06 UTC (permalink / raw)
  To: Marek Olšák, Christian König
  Cc: zhoucm1, axie, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Xie, AlexBin

On 23/06/17 07:49 PM, Marek Olšák wrote:
> On Fri, Jun 23, 2017 at 11:27 AM, Christian König
> <deathsimple@vodafone.de> wrote:
>> Am 23.06.2017 um 11:08 schrieb zhoucm1:
>>> On 2017年06月23日 17:01, zhoucm1 wrote:
>>>> On 2017年06月23日 16:25, Christian König wrote:
>>>>> Am 23.06.2017 um 09:09 schrieb zhoucm1:
>>>>>> On 2017年06月23日 14:57, Christian König wrote:
>>>>>>>
>>>>>>> But giving the CS IOCTL an option for directly specifying the BOs
>>>>>>> instead of a BO list like Marek suggested would indeed save us some time
>>>>>>> here.
>>>>>>
>>>>>> interesting, I always follow how to improve our cs ioctl, since UMD
>>>>>> guys often complain our command submission is slower than windows.
>>>>>> Then how to directly specifying the BOs instead of a BO list? BO handle
>>>>>> array from UMD? Could your guys describe more clear? Is it doable?
>>>>>
>>>>>
>>>>> Making the BO list part of the CS IOCTL wouldn't help at all for the
>>>>> close source UMDs. To be precise we actually came up with the BO list
>>>>> approach because of their requirement.
>>>>>
>>>>> The biggest bunch of work during CS is reserving all the buffers,
>>>>> validating them and checking their VM status.
>>>>
>>>> Totally agree. Every time when I read code there, I often want to
>>>> optimize them.
>>>>
>>>>> It doesn't matter if the BOs come from the BO list or directly in the CS
>>>>> IOCTL.
>>>>>
>>>>> The key point is that CS overhead is pretty much irrelevant for the open
>>>>> source stack, since Mesa does command submission from a separate thread
>>>>> anyway.
>>>>
>>>> If irrelevant for the open stack, then how does open source stack handle
>>>> "The biggest bunch of work during CS is reserving all the buffers,
>>>> validating them and checking their VM status."?
>>
>>
>> Command submission on the open stack is outsourced to a separate user space
>> thread. E.g. when an application triggers a flush the IBs created so far are
>> just put on a queue and another thread pushes them down to the kernel.
>>
>> I mean reducing the overhead of the CS IOCTL is always nice, but you usually
>> won't see any fps increase as long as not all CPUs are completely bound to
>> some tasks.
>>
>>>> If open stack has a better way, I think closed stack can follow it, I
>>>> don't know the history.
>>>
>>> Do you not use bo list at all in mesa? radv as well?
>>
>>
>> I don't think so. Mesa just wants to send the list of used BOs down to the
>> kernel with every IOCTL.
> 
> The CS ioctl actually costs us some performance, but not as much as on
> closed source drivers.
> 
> MesaGL always executes all CS ioctls in a separate thread (in parallel
> with the UMD) except for the last IB that's submitted by SwapBuffers.

... or by an explicit glFinish or glFlush (at least when the current
draw buffer isn't a back buffer) call, right?


> For us, it's certainly useful to optimize the CS ioctl because of apps
> that submit only 1 IB per frame where multithreading has no effect or
> may even hurt performance.

Another possibility might be flushing earlier, e.g. when the GPU and/or
CS submission thread are idle. But optimizing the CS ioctl would still
help in that case.

Finding good heuristics which allow better utilization of the GPU / CS
submission thread and doesn't hurt performance in any scenario might be
tricky though.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                     ` <CAPM=9tzXrih9qpkOY+OGC1TF8nD0ZmwNnxXiRbMtcO7tR1-m1Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-26  9:10                                                       ` zhoucm1
       [not found]                                                         ` <5950CFA0.90906-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: zhoucm1 @ 2017-06-26  9:10 UTC (permalink / raw)
  To: Dave Airlie, Bas Nieuwenhuizen
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	axie, Marek Olšák, Xie, AlexBin



On 2017年06月26日 03:48, Dave Airlie wrote:
>> Do you not use bo list at all in mesa? radv as well?
> Currently radv is creating a bo list per command submission. radv does
> not use an offload thread to do command submission, as it seems pretty
> un-vulkan to use a thread for the queue submission thread the game
> uses.
>
> I have considered investigating this, but with vulkan it's probably
> optimising for the single threaded case which isn't where apps should
> really be.
>
> At the moment we regenerate the bo list on every CS ioctl, we probably
> can do a lot better, I know Bas has looked into this area a bit more
> than I.
Thanks Dave for inputting.

Could I ask more about radv? How does radv make bo list for every cs 
ioctl? Adding filter in every operation, any related bo will be add to 
bo list during make command submission?

Thanks,
David Zhou
>
> Dave.

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                         ` <5950CFA0.90906-5C7GfCeVMHo@public.gmane.org>
@ 2017-06-27 20:40                                                           ` Dave Airlie
       [not found]                                                             ` <CAPM=9tw-E39oQ1gRPJs3oZBniALzgoa53ErF-trgxiN8Z9xyww-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Dave Airlie @ 2017-06-27 20:40 UTC (permalink / raw)
  To: zhoucm1
  Cc: Marek Olšák, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Christian König, Bas Nieuwenhuizen, axie, Xie, AlexBin

On 26 June 2017 at 19:10, zhoucm1 <david1.zhou@amd.com> wrote:
>
>
> On 2017年06月26日 03:48, Dave Airlie wrote:
>>>
>>> Do you not use bo list at all in mesa? radv as well?
>>
>> Currently radv is creating a bo list per command submission. radv does
>> not use an offload thread to do command submission, as it seems pretty
>> un-vulkan to use a thread for the queue submission thread the game
>> uses.
>>
>> I have considered investigating this, but with vulkan it's probably
>> optimising for the single threaded case which isn't where apps should
>> really be.
>>
>> At the moment we regenerate the bo list on every CS ioctl, we probably
>> can do a lot better, I know Bas has looked into this area a bit more
>> than I.
>
> Thanks Dave for inputting.
>
> Could I ask more about radv? How does radv make bo list for every cs ioctl?
> Adding filter in every operation, any related bo will be add to bo list
> during make command submission?

When we create command buffers we add each bo to a list for it, but we
don't call the kernel ioctl.

When we get a QueueSubmit, we usually submit two IBs to the kernel,
one with some cache flushes and one with the main IB in it, however as
we have to give the kernel one bo list we have to combine the two
lists (even if the cache flush list only has one ib in it, itself), so
we can't call create bo list early, we have to do it just before
command submission. However I don't think we've had any problems with
this being our slow path yet, but this may change in the future.

Dave.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                             ` <CAPM=9tw-E39oQ1gRPJs3oZBniALzgoa53ErF-trgxiN8Z9xyww-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-27 20:50                                                               ` Bas Nieuwenhuizen
       [not found]                                                                 ` <CAP+8YyGouX0AeBxj+UNmb1Q7iwAzt0Z_hx__hf1Bb=YqC55J_A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 37+ messages in thread
From: Bas Nieuwenhuizen @ 2017-06-27 20:50 UTC (permalink / raw)
  To: Dave Airlie
  Cc: zhoucm1, Marek Olšák,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, Christian König,
	axie, Xie, AlexBin

On Tue, Jun 27, 2017 at 10:40 PM, Dave Airlie <airlied@gmail.com> wrote:
> On 26 June 2017 at 19:10, zhoucm1 <david1.zhou@amd.com> wrote:
>>
>>
>> On 2017年06月26日 03:48, Dave Airlie wrote:
>>>>
>>>> Do you not use bo list at all in mesa? radv as well?
>>>
>>> Currently radv is creating a bo list per command submission. radv does
>>> not use an offload thread to do command submission, as it seems pretty
>>> un-vulkan to use a thread for the queue submission thread the game
>>> uses.
>>>
>>> I have considered investigating this, but with vulkan it's probably
>>> optimising for the single threaded case which isn't where apps should
>>> really be.
>>>
>>> At the moment we regenerate the bo list on every CS ioctl, we probably
>>> can do a lot better, I know Bas has looked into this area a bit more
>>> than I.
>>
>> Thanks Dave for inputting.
>>
>> Could I ask more about radv? How does radv make bo list for every cs ioctl?
>> Adding filter in every operation, any related bo will be add to bo list
>> during make command submission?
>
> When we create command buffers we add each bo to a list for it, but we
> don't call the kernel ioctl.
>
> When we get a QueueSubmit, we usually submit two IBs to the kernel,
> one with some cache flushes and one with the main IB in it, however as
> we have to give the kernel one bo list we have to combine the two
> lists (even if the cache flush list only has one ib in it, itself), so
> we can't call create bo list early, we have to do it just before
> command submission. However I don't think we've had any problems with
> this being our slow path yet, but this may change in the future.

Yeah. IIRC the bo list create is typically < 10% of our kernel time for radv.
>
> Dave.
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: fix a typo
       [not found]                                                                 ` <CAP+8YyGouX0AeBxj+UNmb1Q7iwAzt0Z_hx__hf1Bb=YqC55J_A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-28  2:00                                                                   ` zhoucm1
  0 siblings, 0 replies; 37+ messages in thread
From: zhoucm1 @ 2017-06-28  2:00 UTC (permalink / raw)
  To: Bas Nieuwenhuizen, Dave Airlie
  Cc: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	axie, Marek Olšák, Xie, AlexBin



On 2017年06月28日 04:50, Bas Nieuwenhuizen wrote:
> On Tue, Jun 27, 2017 at 10:40 PM, Dave Airlie <airlied@gmail.com> wrote:
>> On 26 June 2017 at 19:10, zhoucm1 <david1.zhou@amd.com> wrote:
>>>
>>> On 2017年06月26日 03:48, Dave Airlie wrote:
>>>>> Do you not use bo list at all in mesa? radv as well?
>>>> Currently radv is creating a bo list per command submission. radv does
>>>> not use an offload thread to do command submission, as it seems pretty
>>>> un-vulkan to use a thread for the queue submission thread the game
>>>> uses.
>>>>
>>>> I have considered investigating this, but with vulkan it's probably
>>>> optimising for the single threaded case which isn't where apps should
>>>> really be.
>>>>
>>>> At the moment we regenerate the bo list on every CS ioctl, we probably
>>>> can do a lot better, I know Bas has looked into this area a bit more
>>>> than I.
>>> Thanks, Dave, for the input.
>>>
>>> Could I ask more about radv? How does radv build the bo list for every cs ioctl?
>>> Is a filter added in every operation, so that any related bo will be added to the
>>> bo list during command submission?
>> When we create command buffers we add each bo to a list for it, but we
>> don't call the kernel ioctl.
>>
>> When we get a QueueSubmit, we usually submit two IBs to the kernel,
>> one with some cache flushes and one with the main IB in it, however as
>> we have to give the kernel one bo list we have to combine the two
>> lists (even if the cache flush list only has one ib in it, itself), so
>> we can't call create bo list early, we have to do it just before
>> command submission. However I don't think we've had any problems with
>> this being our slow path yet, but this may change in the future.
> Yeah. IIRC the bo list create is typically < 10% of our kernel time for radv.

Thanks a lot for all your input.

Regards,
David Zhou
>> Dave.

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2017-06-28  2:00 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-22  2:42 [PATCH 1/3] drm/amdgpu: fix a typo Alex Xie
     [not found] ` <1498099356-31332-1-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
2017-06-22  2:42   ` [PATCH 2/3] drm/amdgpu: change a function to static function Alex Xie
     [not found]     ` <1498099356-31332-2-git-send-email-AlexBin.Xie-5C7GfCeVMHo@public.gmane.org>
2017-06-22  2:54       ` Michel Dänzer
2017-06-22  2:42   ` [PATCH 3/3] drm/amdgpu: optimize out a spin lock Use atomic instead of spin lock Alex Xie
2017-06-22  7:35   ` [PATCH 1/3] drm/amdgpu: fix a typo Christian König
     [not found]     ` <5ff65f82-9d15-4606-7e25-e4f75c172aed-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-22 15:33       ` Xie, AlexBin
     [not found]         ` <DM5PR12MB1257B1FCC981D29A6D4045CDF2DB0-2J9CzHegvk/NHlLGalgXawdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-06-22 16:24           ` Christian König
2017-06-22 16:27           ` Marek Olšák
     [not found]             ` <CAAxE2A7ic2YLmnkzM0Faa3bO9-GwWRTHRc0EUVej5UU7Yhs4Qg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-22 17:34               ` axie
     [not found]                 ` <efc2289a-382d-8443-8419-da2d0cadfd77-5C7GfCeVMHo@public.gmane.org>
2017-06-22 18:19                   ` axie
     [not found]                     ` <e23f64c3-6abf-ff65-2300-5c44d242f4df-5C7GfCeVMHo@public.gmane.org>
2017-06-22 23:54                       ` Marek Olšák
     [not found]                         ` <CAAxE2A61KqM9gr=Zoo5PHFNb8gWp74RG9KOs=efWzf1dMKBwSA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-23  2:23                           ` axie
     [not found]                             ` <7134d81f-a60f-7093-d2a3-70edde23cdb2-5C7GfCeVMHo@public.gmane.org>
2017-06-23  6:57                               ` Christian König
     [not found]                                 ` <3436ae97-39b1-4a3f-bb73-2991adad5715-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-23  7:09                                   ` zhoucm1
     [not found]                                     ` <594CBEBC.5010703-5C7GfCeVMHo@public.gmane.org>
2017-06-23  8:25                                       ` Christian König
     [not found]                                         ` <0d847baf-8296-f3ce-7a8d-8823f33e392e-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-23  9:01                                           ` zhoucm1
2017-06-23  9:01                                           ` zhoucm1
     [not found]                                             ` <594CD8E0.3080702-5C7GfCeVMHo@public.gmane.org>
2017-06-23  9:08                                               ` zhoucm1
     [not found]                                                 ` <594CDA92.1060809-5C7GfCeVMHo@public.gmane.org>
2017-06-23  9:27                                                   ` Christian König
     [not found]                                                     ` <807f27d6-7e53-4066-c440-699bf66dd227-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-23 10:49                                                       ` Marek Olšák
     [not found]                                                         ` <CAAxE2A4vdV+QD55RAn+mrD92o39X_vPT6wLKCwkREx2=fpU2Vw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-23 11:55                                                           ` Zhou, David(ChunMing)
     [not found]                                                             ` <MWHPR1201MB02063CAC83DEEAB81EFA9A10B4D80-3iK1xFAIwjrUF/YbdlDdgWrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-06-24  0:29                                                               ` Marek Olšák
     [not found]                                                                 ` <CAAxE2A7jmhxC8SctSSxXhc7XRwSgzmPOOF5HmmtAPL7aYKE+Jg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-24 20:49                                                                   ` Marek Olšák
2017-06-26  9:06                                                           ` Michel Dänzer
2017-06-25 19:48                                                   ` Dave Airlie
     [not found]                                                     ` <CAPM=9tzXrih9qpkOY+OGC1TF8nD0ZmwNnxXiRbMtcO7tR1-m1Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-26  9:10                                                       ` zhoucm1
     [not found]                                                         ` <5950CFA0.90906-5C7GfCeVMHo@public.gmane.org>
2017-06-27 20:40                                                           ` Dave Airlie
     [not found]                                                             ` <CAPM=9tw-E39oQ1gRPJs3oZBniALzgoa53ErF-trgxiN8Z9xyww-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-27 20:50                                                               ` Bas Nieuwenhuizen
     [not found]                                                                 ` <CAP+8YyGouX0AeBxj+UNmb1Q7iwAzt0Z_hx__hf1Bb=YqC55J_A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-28  2:00                                                                   ` zhoucm1
2017-06-23 11:37                               ` Marek Olšák
     [not found]                                 ` <CAAxE2A6Tt1JzLCDr4vM1iKASpWGFncqrXkL75==1Zd5wJq8xTA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-23 13:01                                   ` Christian König
     [not found]                                     ` <963f4d3d-dc46-2279-509f-b475c5ec94ee-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-06-23 13:55                                       ` axie
     [not found]                                         ` <3d4f3204-6893-3d68-23ea-b309ace33740-5C7GfCeVMHo@public.gmane.org>
2017-06-23 14:30                                           ` Christian König
2017-06-24  0:27                                       ` Marek Olšák
     [not found]                                         ` <CAAxE2A7kFusP3=-FcVWdSnGRvaFwL8kZ17BH4uKJ3z044S9XFQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-24 18:27                                           ` Christian König
2017-06-23 13:45                                   ` axie
     [not found]                                     ` <94b20fb6-3f81-da3c-eb44-7d2e49ff5c8b-5C7GfCeVMHo@public.gmane.org>
2017-06-24  0:46                                       ` Marek Olšák

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.