dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] iosys-map: Add per-word read
@ 2022-06-10 23:21 Lucas De Marchi
  2022-06-10 23:21 ` [PATCH 2/3] iosys-map: Add per-word write Lucas De Marchi
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Lucas De Marchi @ 2022-06-10 23:21 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: daniel.vetter, Lucas De Marchi, christian.koenig, tzimmermann, chris

Instead of always falling back to memcpy_fromio() for any size, prefer
using read{b,w,l}(). When reading struct members it's common to read
individual integer variables individually. Going through memcpy_fromio()
for each of them poses a high penalty.

Employ a similar trick as __seqprop() by using _Generic() to generate
only the specific call based on a type-compatible variable.

For a pariticular i915 workload producing GPU context switches,
__get_engine_usage_record() is particularly hot since the engine usage
is read from device local memory with dgfx, possibly multiple times
since it's racy. Test execution time for this test shows a ~12.5%
improvement with DG2:

Before:
	nrepeats = 1000; min = 7.63243e+06; max = 1.01817e+07;
	median = 9.52548e+06; var = 526149;
After:
	nrepeats = 1000; min = 7.03402e+06; max = 8.8832e+06;
	median = 8.33955e+06; var = 333113;

Other things attempted that didn't prove very useful:
1) Change the _Generic() on x86 to just dereference the memory address
2) Change __get_engine_usage_record() to do just 1 read per loop,
   comparing with the previous value read
3) Change __get_engine_usage_record() to access the fields directly as it
   was before the conversion to iosys-map

(3) did gave a small improvement (~3%), but doesn't seem to scale well
to other similar cases in the driver.

Additional test by Chris Wilson using gem_create from igt with some
changes to track object creation time. This happens to accidentally
stress this code path:

	Pre iosys_map conversion of engine busyness:
	lmem0: Creating    262144 4KiB objects took 59274.2ms

	Unpatched:
	lmem0: Creating    262144 4KiB objects took 108830.2ms

	With readl (this patch):
	lmem0: Creating    262144 4KiB objects took 61348.6ms

	s/readl/READ_ONCE/
	lmem0: Creating    262144 4KiB objects took 61333.2ms

So we do take a little bit more time than before the conversion, but
that is due to other factors: bringing the READ_ONCE back would be as
good as just doing this conversion.

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 include/linux/iosys-map.h | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index e69a002d5aa4..cd28c7a1b79c 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -333,6 +333,20 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
 		memset(dst->vaddr + offset, value, len);
 }
 
+#ifdef CONFIG_64BIT
+#define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)			\
+	u64: val_ = readq(vaddr_iomem_),
+#else
+#define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)
+#endif
+
+#define __iosys_map_rd_io(val__, vaddr_iomem__, type__) _Generic(val__,	\
+	u8: val__ = readb(vaddr_iomem__),				\
+	u16: val__ = readw(vaddr_iomem__),				\
+	u32: val__ = readl(vaddr_iomem__),				\
+	__iosys_map_rd_io_u64_case(val__, vaddr_iomem__)		\
+	default: memcpy_fromio(&(val__), vaddr_iomem__, sizeof(val__)))
+
 /**
  * iosys_map_rd - Read a C-type value from the iosys_map
  *
@@ -346,10 +360,14 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * Returns:
  * The value read from the mapping.
  */
-#define iosys_map_rd(map__, offset__, type__) ({			\
-	type__ val;							\
-	iosys_map_memcpy_from(&val, map__, offset__, sizeof(val));	\
-	val;								\
+#define iosys_map_rd(map__, offset__, type__) ({				\
+	type__ val;								\
+	if ((map__)->is_iomem) {						\
+		__iosys_map_rd_io(val, (map__)->vaddr_iomem + offset__, type__);\
+	} else {								\
+		memcpy(&val, (map__)->vaddr + offset__, sizeof(val));		\
+	}									\
+	val;									\
 })
 
 /**
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] iosys-map: Add per-word write
  2022-06-10 23:21 [PATCH 1/3] iosys-map: Add per-word read Lucas De Marchi
@ 2022-06-10 23:21 ` Lucas De Marchi
  2022-06-10 23:21 ` [PATCH 3/3] iosys-map: Fix typo in documentation Lucas De Marchi
  2022-06-11  8:16 ` [PATCH 1/3] iosys-map: Add per-word read Christian König
  2 siblings, 0 replies; 4+ messages in thread
From: Lucas De Marchi @ 2022-06-10 23:21 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: daniel.vetter, Lucas De Marchi, christian.koenig, tzimmermann, chris

Like was done for read, provide the equivalent for write. Even if
current users are not in the hot path, this should future-proof it.

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 include/linux/iosys-map.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index cd28c7a1b79c..793e5cd50dbf 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -336,8 +336,11 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
 #ifdef CONFIG_64BIT
 #define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)			\
 	u64: val_ = readq(vaddr_iomem_),
+#define __iosys_map_wr_io_u64_case(val_, vaddr_iomem_)			\
+	u64: writeq(val_, vaddr_iomem_),
 #else
 #define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)
+#define __iosys_map_wr_io_u64_case(val_, vaddr_iomem_)
 #endif
 
 #define __iosys_map_rd_io(val__, vaddr_iomem__, type__) _Generic(val__,	\
@@ -347,6 +350,13 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
 	__iosys_map_rd_io_u64_case(val__, vaddr_iomem__)		\
 	default: memcpy_fromio(&(val__), vaddr_iomem__, sizeof(val__)))
 
+#define __iosys_map_wr_io(val__, vaddr_iomem__, type__) _Generic(val__,	\
+	u8: writeb(val__, vaddr_iomem__),				\
+	u16: writew(val__, vaddr_iomem__),				\
+	u32: writel(val__, vaddr_iomem__),				\
+	__iosys_map_wr_io_u64_case(val__, vaddr_iomem__)		\
+	default: memcpy_toio(vaddr_iomem__, &val, sizeof(val)))
+
 /**
  * iosys_map_rd - Read a C-type value from the iosys_map
  *
@@ -381,9 +391,13 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * Write a C-type value to the iosys_map, handling possible un-aligned accesses
  * to the mapping.
  */
-#define iosys_map_wr(map__, offset__, type__, val__) ({			\
-	type__ val = (val__);						\
-	iosys_map_memcpy_to(map__, offset__, &val, sizeof(val));	\
+#define iosys_map_wr(map__, offset__, type__, val__) ({				\
+	type__ val = (val__);							\
+	if ((map__)->is_iomem) {						\
+		__iosys_map_wr_io(val, (map__)->vaddr_iomem + offset__, type__);\
+	} else {								\
+		memcpy((map__)->vaddr + offset__, &val, sizeof(val));		\
+	}									\
 })
 
 /**
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] iosys-map: Fix typo in documentation
  2022-06-10 23:21 [PATCH 1/3] iosys-map: Add per-word read Lucas De Marchi
  2022-06-10 23:21 ` [PATCH 2/3] iosys-map: Add per-word write Lucas De Marchi
@ 2022-06-10 23:21 ` Lucas De Marchi
  2022-06-11  8:16 ` [PATCH 1/3] iosys-map: Add per-word read Christian König
  2 siblings, 0 replies; 4+ messages in thread
From: Lucas De Marchi @ 2022-06-10 23:21 UTC (permalink / raw)
  To: intel-gfx, dri-devel
  Cc: daniel.vetter, Lucas De Marchi, christian.koenig, tzimmermann, chris

It's one argument, vaddr_iomem, not 2 (vaddr and _iomem).

Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 include/linux/iosys-map.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index 793e5cd50dbf..d092d30f5812 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -23,7 +23,7 @@
  *	memcpy(vaddr, src, len);
  *
  *	void *vaddr_iomem = ...; // pointer to I/O memory
- *	memcpy_toio(vaddr, _iomem, src, len);
+ *	memcpy_toio(vaddr_iomem, src, len);
  *
  * The user of such pointer may not have information about the mapping of that
  * region or may want to have a single code path to handle operations on that
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/3] iosys-map: Add per-word read
  2022-06-10 23:21 [PATCH 1/3] iosys-map: Add per-word read Lucas De Marchi
  2022-06-10 23:21 ` [PATCH 2/3] iosys-map: Add per-word write Lucas De Marchi
  2022-06-10 23:21 ` [PATCH 3/3] iosys-map: Fix typo in documentation Lucas De Marchi
@ 2022-06-11  8:16 ` Christian König
  2 siblings, 0 replies; 4+ messages in thread
From: Christian König @ 2022-06-11  8:16 UTC (permalink / raw)
  To: Lucas De Marchi, intel-gfx, dri-devel; +Cc: daniel.vetter, tzimmermann, chris

Am 11.06.22 um 01:21 schrieb Lucas De Marchi:
> Instead of always falling back to memcpy_fromio() for any size, prefer
> using read{b,w,l}(). When reading struct members it's common to read
> individual integer variables individually. Going through memcpy_fromio()
> for each of them poses a high penalty.
>
> Employ a similar trick as __seqprop() by using _Generic() to generate
> only the specific call based on a type-compatible variable.
>
> For a pariticular i915 workload producing GPU context switches,
> __get_engine_usage_record() is particularly hot since the engine usage
> is read from device local memory with dgfx, possibly multiple times
> since it's racy. Test execution time for this test shows a ~12.5%
> improvement with DG2:
>
> Before:
> 	nrepeats = 1000; min = 7.63243e+06; max = 1.01817e+07;
> 	median = 9.52548e+06; var = 526149;
> After:
> 	nrepeats = 1000; min = 7.03402e+06; max = 8.8832e+06;
> 	median = 8.33955e+06; var = 333113;
>
> Other things attempted that didn't prove very useful:
> 1) Change the _Generic() on x86 to just dereference the memory address
> 2) Change __get_engine_usage_record() to do just 1 read per loop,
>     comparing with the previous value read
> 3) Change __get_engine_usage_record() to access the fields directly as it
>     was before the conversion to iosys-map
>
> (3) did gave a small improvement (~3%), but doesn't seem to scale well
> to other similar cases in the driver.
>
> Additional test by Chris Wilson using gem_create from igt with some
> changes to track object creation time. This happens to accidentally
> stress this code path:
>
> 	Pre iosys_map conversion of engine busyness:
> 	lmem0: Creating    262144 4KiB objects took 59274.2ms
>
> 	Unpatched:
> 	lmem0: Creating    262144 4KiB objects took 108830.2ms
>
> 	With readl (this patch):
> 	lmem0: Creating    262144 4KiB objects took 61348.6ms
>
> 	s/readl/READ_ONCE/
> 	lmem0: Creating    262144 4KiB objects took 61333.2ms
>
> So we do take a little bit more time than before the conversion, but
> that is due to other factors: bringing the READ_ONCE back would be as
> good as just doing this conversion.
>
> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>

Reviewed-by: Christian König <christian.koenig@amd.com> for the entire 
series.

> ---
>   include/linux/iosys-map.h | 26 ++++++++++++++++++++++----
>   1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
> index e69a002d5aa4..cd28c7a1b79c 100644
> --- a/include/linux/iosys-map.h
> +++ b/include/linux/iosys-map.h
> @@ -333,6 +333,20 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
>   		memset(dst->vaddr + offset, value, len);
>   }
>   
> +#ifdef CONFIG_64BIT
> +#define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)			\
> +	u64: val_ = readq(vaddr_iomem_),
> +#else
> +#define __iosys_map_rd_io_u64_case(val_, vaddr_iomem_)
> +#endif
> +
> +#define __iosys_map_rd_io(val__, vaddr_iomem__, type__) _Generic(val__,	\
> +	u8: val__ = readb(vaddr_iomem__),				\
> +	u16: val__ = readw(vaddr_iomem__),				\
> +	u32: val__ = readl(vaddr_iomem__),				\
> +	__iosys_map_rd_io_u64_case(val__, vaddr_iomem__)		\
> +	default: memcpy_fromio(&(val__), vaddr_iomem__, sizeof(val__)))
> +
>   /**
>    * iosys_map_rd - Read a C-type value from the iosys_map
>    *
> @@ -346,10 +360,14 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
>    * Returns:
>    * The value read from the mapping.
>    */
> -#define iosys_map_rd(map__, offset__, type__) ({			\
> -	type__ val;							\
> -	iosys_map_memcpy_from(&val, map__, offset__, sizeof(val));	\
> -	val;								\
> +#define iosys_map_rd(map__, offset__, type__) ({				\
> +	type__ val;								\
> +	if ((map__)->is_iomem) {						\
> +		__iosys_map_rd_io(val, (map__)->vaddr_iomem + offset__, type__);\
> +	} else {								\
> +		memcpy(&val, (map__)->vaddr + offset__, sizeof(val));		\
> +	}									\
> +	val;									\
>   })
>   
>   /**


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-06-11  8:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-10 23:21 [PATCH 1/3] iosys-map: Add per-word read Lucas De Marchi
2022-06-10 23:21 ` [PATCH 2/3] iosys-map: Add per-word write Lucas De Marchi
2022-06-10 23:21 ` [PATCH 3/3] iosys-map: Fix typo in documentation Lucas De Marchi
2022-06-11  8:16 ` [PATCH 1/3] iosys-map: Add per-word read Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).