linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] radeon: Deinline indirect register accessor functions
@ 2015-05-18 19:02 Denys Vlasenko
  2015-05-18 19:09 ` Christian König
  0 siblings, 1 reply; 5+ messages in thread
From: Denys Vlasenko @ 2015-05-18 19:02 UTC (permalink / raw)
  To: Christian König; +Cc: Denys Vlasenko, Alex Deucher, linux-kernel

This patch deinlines indirect register accessor functions.

These functions perform two mmio accesses, framed by spin lock/unlock.
Spin lock/unlock by itself takes more than 50 cycles in ideal case
(if lock is exclusively cached on current CPU).

With this .config: http://busybox.net/~vda/kernel_config,
after uninlining these functions have sizes and callsite counts
as follows:

r600_uvd_ctx_rreg: 111 bytes, 4 callsites
r600_uvd_ctx_wreg: 113 bytes, 5 callsites
eg_pif_phy0_rreg: 106 bytes, 13 callsites
eg_pif_phy0_wreg: 108 bytes, 13 callsites
eg_pif_phy1_rreg: 107 bytes, 13 callsites
eg_pif_phy1_wreg: 108 bytes, 13 callsites
rv370_pcie_rreg: 111 bytes, 21 callsites
rv370_pcie_wreg: 113 bytes, 24 callsites
r600_rcu_rreg: 111 bytes, 16 callsites
r600_rcu_wreg: 113 bytes, 25 callsites
cik_didt_rreg: 106 bytes, 10 callsites
cik_didt_wreg: 107 bytes, 10 callsites
tn_smc_rreg: 106 bytes, 126 callsites
tn_smc_wreg: 107 bytes, 116 callsites
eg_cg_rreg: 107 bytes, 20 callsites
eg_cg_wreg: 108 bytes, 52 callsites

Functions r100_mm_rreg() and r100_mm_wreg() have a fast path and
locked (slow) path.
This patch deinlines only slow path.

r100_mm_rreg_slow: 78 bytes, 2083 callsites
r100_mm_wreg_slow: 81 bytes, 3570 callsites

Reduction in code size is more than 65,000 bytes:

    text     data      bss       dec     hex filename
85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
85674192 22294776 20627456 128598664 7aa4288 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: linux-kernel@vger.kernel.org
---
Changes in v2: only partially deinline r100_mm_r/wreg

 drivers/gpu/drm/radeon/r100.c          |  22 ++++
 drivers/gpu/drm/radeon/radeon.h        | 218 ++++-----------------------------
 drivers/gpu/drm/radeon/radeon_device.c | 179 +++++++++++++++++++++++++++
 3 files changed, 223 insertions(+), 196 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 04f2514..238b13f 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -4090,6 +4090,28 @@ int r100_init(struct radeon_device *rdev)
 	return 0;
 }
 
+uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg)
+{
+	unsigned long flags;
+	uint32_t ret;
+
+	spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+	writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+	ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+	spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+	return ret;
+}
+
+void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+	writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+	writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+	spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+}
+
 u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
 {
 	if (reg < rdev->rio_mem_size)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5587603..d9a7c55 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2465,38 +2465,24 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
 
 #define RADEON_MIN_MMIO_SIZE 0x10000
 
+uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg);
+void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v);
 static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
 				    bool always_indirect)
 {
 	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
 	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
 		return readl(((void __iomem *)rdev->rmmio) + reg);
-	else {
-		unsigned long flags;
-		uint32_t ret;
-
-		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
-		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
-		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
-		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-
-		return ret;
-	}
+	else
+		return r100_mm_rreg_slow(rdev, reg);
 }
-
 static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
 				bool always_indirect)
 {
 	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
 		writel(v, ((void __iomem *)rdev->rmmio) + reg);
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
-		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
-		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
-		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-	}
+	else
+		r100_mm_wreg_slow(rdev, reg, v);
 }
 
 u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
@@ -2582,182 +2568,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
 /*
  * Indirect registers accessor
  */
-static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
-{
-	unsigned long flags;
-	uint32_t r;
-
-	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
-	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
-	r = RREG32(RADEON_PCIE_DATA);
-	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-	return r;
-}
-
-static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
-	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
-	WREG32(RADEON_PCIE_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-}
-
-static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
-	WREG32(TN_SMC_IND_INDEX_0, (reg));
-	r = RREG32(TN_SMC_IND_DATA_0);
-	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-	return r;
-}
-
-static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
-	WREG32(TN_SMC_IND_INDEX_0, (reg));
-	WREG32(TN_SMC_IND_DATA_0, (v));
-	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-}
-
-static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
-	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
-	r = RREG32(R600_RCU_DATA);
-	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-	return r;
-}
-
-static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
-	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
-	WREG32(R600_RCU_DATA, (v));
-	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-}
-
-static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
-	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_CG_IND_DATA);
-	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
-	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
-	WREG32(EVERGREEN_CG_IND_DATA, (v));
-	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
-	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
-	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
-	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
-	r = RREG32(R600_UVD_CTX_DATA);
-	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-	return r;
-}
-
-static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
-	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
-	WREG32(R600_UVD_CTX_DATA, (v));
-	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-}
-
-
-static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
-	WREG32(CIK_DIDT_IND_INDEX, (reg));
-	r = RREG32(CIK_DIDT_IND_DATA);
-	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-	return r;
-}
-
-static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
-	WREG32(CIK_DIDT_IND_INDEX, (reg));
-	WREG32(CIK_DIDT_IND_DATA, (v));
-	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-}
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
 
 void r100_pll_errata_after_index(struct radeon_device *rdev);
 
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index bd7519f..6712505 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
 		rdev->flags &= ~RADEON_IS_PX;
 }
 
+/*
+ * Indirect registers accessor
+ */
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
+{
+	unsigned long flags;
+	uint32_t r;
+
+	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+	r = RREG32(RADEON_PCIE_DATA);
+	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+	return r;
+}
+
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+	WREG32(RADEON_PCIE_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+}
+
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+	WREG32(TN_SMC_IND_INDEX_0, (reg));
+	r = RREG32(TN_SMC_IND_DATA_0);
+	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+	return r;
+}
+
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+	WREG32(TN_SMC_IND_INDEX_0, (reg));
+	WREG32(TN_SMC_IND_DATA_0, (v));
+	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+}
+
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+	r = RREG32(R600_RCU_DATA);
+	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+	return r;
+}
+
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+	WREG32(R600_RCU_DATA, (v));
+	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+}
+
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_CG_IND_DATA);
+	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+	return r;
+}
+
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+	WREG32(EVERGREEN_CG_IND_DATA, (v));
+	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+}
+
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+	return r;
+}
+
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+	return r;
+}
+
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+	r = RREG32(R600_UVD_CTX_DATA);
+	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+	return r;
+}
+
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+	WREG32(R600_UVD_CTX_DATA, (v));
+	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+}
+
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+	WREG32(CIK_DIDT_IND_INDEX, (reg));
+	r = RREG32(CIK_DIDT_IND_DATA);
+	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+	return r;
+}
+
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+	WREG32(CIK_DIDT_IND_INDEX, (reg));
+	WREG32(CIK_DIDT_IND_DATA, (v));
+	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+}
+
 /**
  * radeon_program_register_sequence - program an array of registers.
  *
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] radeon: Deinline indirect register accessor functions
  2015-05-18 19:02 [PATCH v2] radeon: Deinline indirect register accessor functions Denys Vlasenko
@ 2015-05-18 19:09 ` Christian König
  2015-05-18 22:50   ` Denys Vlasenko
  0 siblings, 1 reply; 5+ messages in thread
From: Christian König @ 2015-05-18 19:09 UTC (permalink / raw)
  To: Denys Vlasenko; +Cc: Alex Deucher, linux-kernel

On 18.05.2015 21:02, Denys Vlasenko wrote:
> This patch deinlines indirect register accessor functions.
>
> These functions perform two mmio accesses, framed by spin lock/unlock.
> Spin lock/unlock by itself takes more than 50 cycles in ideal case
> (if lock is exclusively cached on current CPU).
>
> With this .config: http://busybox.net/~vda/kernel_config,
> after uninlining these functions have sizes and callsite counts
> as follows:
>
> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
> eg_pif_phy0_rreg: 106 bytes, 13 callsites
> eg_pif_phy0_wreg: 108 bytes, 13 callsites
> eg_pif_phy1_rreg: 107 bytes, 13 callsites
> eg_pif_phy1_wreg: 108 bytes, 13 callsites
> rv370_pcie_rreg: 111 bytes, 21 callsites
> rv370_pcie_wreg: 113 bytes, 24 callsites
> r600_rcu_rreg: 111 bytes, 16 callsites
> r600_rcu_wreg: 113 bytes, 25 callsites
> cik_didt_rreg: 106 bytes, 10 callsites
> cik_didt_wreg: 107 bytes, 10 callsites
> tn_smc_rreg: 106 bytes, 126 callsites
> tn_smc_wreg: 107 bytes, 116 callsites
> eg_cg_rreg: 107 bytes, 20 callsites
> eg_cg_wreg: 108 bytes, 52 callsites
>
> Functions r100_mm_rreg() and r100_mm_wreg() have a fast path and
> locked (slow) path.
> This patch deinlines only slow path.
>
> r100_mm_rreg_slow: 78 bytes, 2083 callsites
> r100_mm_wreg_slow: 81 bytes, 3570 callsites
>
> Reduction in code size is more than 65,000 bytes:
>
>      text     data      bss       dec     hex filename
> 85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
> 85674192 22294776 20627456 128598664 7aa4288 vmlinux
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: linux-kernel@vger.kernel.org
> ---
> Changes in v2: only partially deinline r100_mm_r/wreg
>
>   drivers/gpu/drm/radeon/r100.c          |  22 ++++
>   drivers/gpu/drm/radeon/radeon.h        | 218 ++++-----------------------------
>   drivers/gpu/drm/radeon/radeon_device.c | 179 +++++++++++++++++++++++++++

Sorry haven't noticed that before:

radeon_device.c is most likely not the right place for the non-inlined 
functions. Please move them into to the appropriate files for each 
generation.

As noted on the other mail as well please also CC dri-devel.

Regards,
Christian.

>   3 files changed, 223 insertions(+), 196 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 04f2514..238b13f 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -4090,6 +4090,28 @@ int r100_init(struct radeon_device *rdev)
>   	return 0;
>   }
>   
> +uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg)
> +{
> +	unsigned long flags;
> +	uint32_t ret;
> +
> +	spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> +	writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> +	ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> +	spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> +	return ret;
> +}
> +
> +void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> +	writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> +	writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> +	spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> +}
> +
>   u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
>   {
>   	if (reg < rdev->rio_mem_size)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5587603..d9a7c55 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2465,38 +2465,24 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
>   
>   #define RADEON_MIN_MMIO_SIZE 0x10000
>   
> +uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg);
> +void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v);
>   static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
>   				    bool always_indirect)
>   {
>   	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
>   	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
>   		return readl(((void __iomem *)rdev->rmmio) + reg);
> -	else {
> -		unsigned long flags;
> -		uint32_t ret;
> -
> -		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> -		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> -		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> -		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> -
> -		return ret;
> -	}
> +	else
> +		return r100_mm_rreg_slow(rdev, reg);
>   }
> -
>   static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
>   				bool always_indirect)
>   {
>   	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
>   		writel(v, ((void __iomem *)rdev->rmmio) + reg);
> -	else {
> -		unsigned long flags;
> -
> -		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> -		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> -		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> -		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> -	}
> +	else
> +		r100_mm_wreg_slow(rdev, reg, v);
>   }
>   
>   u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
> @@ -2582,182 +2568,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
>   /*
>    * Indirect registers accessor
>    */
> -static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> -{
> -	unsigned long flags;
> -	uint32_t r;
> -
> -	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> -	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> -	r = RREG32(RADEON_PCIE_DATA);
> -	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> -	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> -	WREG32(RADEON_PCIE_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> -}
> -
> -static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> -	WREG32(TN_SMC_IND_INDEX_0, (reg));
> -	r = RREG32(TN_SMC_IND_DATA_0);
> -	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> -	WREG32(TN_SMC_IND_INDEX_0, (reg));
> -	WREG32(TN_SMC_IND_DATA_0, (v));
> -	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> -}
> -
> -static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> -	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> -	r = RREG32(R600_RCU_DATA);
> -	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> -	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> -	WREG32(R600_RCU_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> -}
> -
> -static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> -	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_CG_IND_DATA);
> -	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> -	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_CG_IND_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> -	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> -	r = RREG32(R600_UVD_CTX_DATA);
> -	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> -	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> -	WREG32(R600_UVD_CTX_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> -}
> -
> -
> -static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> -	WREG32(CIK_DIDT_IND_INDEX, (reg));
> -	r = RREG32(CIK_DIDT_IND_DATA);
> -	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> -	WREG32(CIK_DIDT_IND_INDEX, (reg));
> -	WREG32(CIK_DIDT_IND_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> -}
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
>   
>   void r100_pll_errata_after_index(struct radeon_device *rdev);
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index bd7519f..6712505 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
>   		rdev->flags &= ~RADEON_IS_PX;
>   }
>   
> +/*
> + * Indirect registers accessor
> + */
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> +{
> +	unsigned long flags;
> +	uint32_t r;
> +
> +	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> +	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> +	r = RREG32(RADEON_PCIE_DATA);
> +	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> +	return r;
> +}
> +
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> +	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> +	WREG32(RADEON_PCIE_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> +}
> +
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> +	WREG32(TN_SMC_IND_INDEX_0, (reg));
> +	r = RREG32(TN_SMC_IND_DATA_0);
> +	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> +	return r;
> +}
> +
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> +	WREG32(TN_SMC_IND_INDEX_0, (reg));
> +	WREG32(TN_SMC_IND_DATA_0, (v));
> +	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> +}
> +
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> +	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> +	r = RREG32(R600_RCU_DATA);
> +	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> +	return r;
> +}
> +
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> +	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> +	WREG32(R600_RCU_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> +}
> +
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> +	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_CG_IND_DATA);
> +	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> +	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_CG_IND_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> +	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> +	r = RREG32(R600_UVD_CTX_DATA);
> +	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> +	return r;
> +}
> +
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> +	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> +	WREG32(R600_UVD_CTX_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> +}
> +
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> +	WREG32(CIK_DIDT_IND_INDEX, (reg));
> +	r = RREG32(CIK_DIDT_IND_DATA);
> +	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> +	return r;
> +}
> +
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> +	WREG32(CIK_DIDT_IND_INDEX, (reg));
> +	WREG32(CIK_DIDT_IND_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> +}
> +
>   /**
>    * radeon_program_register_sequence - program an array of registers.
>    *


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] radeon: Deinline indirect register accessor functions
  2015-05-18 19:09 ` Christian König
@ 2015-05-18 22:50   ` Denys Vlasenko
  2015-05-18 23:06     ` Deucher, Alexander
  0 siblings, 1 reply; 5+ messages in thread
From: Denys Vlasenko @ 2015-05-18 22:50 UTC (permalink / raw)
  To: Christian König
  Cc: Denys Vlasenko, Alex Deucher, Linux Kernel Mailing List

On Mon, May 18, 2015 at 9:09 PM, Christian König
<christian.koenig@amd.com> wrote:
>> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
>> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
>> eg_pif_phy0_rreg: 106 bytes, 13 callsites
>> eg_pif_phy0_wreg: 108 bytes, 13 callsites
>> eg_pif_phy1_rreg: 107 bytes, 13 callsites
>> eg_pif_phy1_wreg: 108 bytes, 13 callsites
>> rv370_pcie_rreg: 111 bytes, 21 callsites
>> rv370_pcie_wreg: 113 bytes, 24 callsites
>> r600_rcu_rreg: 111 bytes, 16 callsites
>> r600_rcu_wreg: 113 bytes, 25 callsites
>> cik_didt_rreg: 106 bytes, 10 callsites
>> cik_didt_wreg: 107 bytes, 10 callsites
>> tn_smc_rreg: 106 bytes, 126 callsites
>> tn_smc_wreg: 107 bytes, 116 callsites
>> eg_cg_rreg: 107 bytes, 20 callsites
>> eg_cg_wreg: 108 bytes, 52 callsites

> Sorry haven't noticed that before:
>
> radeon_device.c is most likely not the right place for the non-inlined
> functions. Please move them into to the appropriate files for each
> generation.

Will do (probably tomorrow, not today).

Can you help me here a bit?
There are LOTS of  *.c files in drm/radeon/.
I guess r600_ functions should go into r600.c,
rv370_ to rv730_dpm.c (right?),
but some of the function names are less clear (to me).

Where would you like eg_pif_phyN_r/wreg() go? evergreen.c?
Should eg_cg_r/wreg() also go to this file?

cik_didt_r/wreg() - to cik.c?

tn_smc_r/wreg()? Is tn = trinity? so, trinity_smc.c?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v2] radeon: Deinline indirect register accessor functions
  2015-05-18 22:50   ` Denys Vlasenko
@ 2015-05-18 23:06     ` Deucher, Alexander
  2015-05-20 10:47       ` Denys Vlasenko
  0 siblings, 1 reply; 5+ messages in thread
From: Deucher, Alexander @ 2015-05-18 23:06 UTC (permalink / raw)
  To: Denys Vlasenko, Koenig, Christian
  Cc: Denys Vlasenko, Linux Kernel Mailing List

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 2128 bytes --]

> -----Original Message-----
> From: Denys Vlasenko [mailto:vda.linux@googlemail.com]
> Sent: Monday, May 18, 2015 6:50 PM
> To: Koenig, Christian
> Cc: Denys Vlasenko; Deucher, Alexander; Linux Kernel Mailing List
> Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions
> 
> On Mon, May 18, 2015 at 9:09 PM, Christian König
> <christian.koenig@amd.com> wrote:
> >> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
> >> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
> >> eg_pif_phy0_rreg: 106 bytes, 13 callsites
> >> eg_pif_phy0_wreg: 108 bytes, 13 callsites
> >> eg_pif_phy1_rreg: 107 bytes, 13 callsites
> >> eg_pif_phy1_wreg: 108 bytes, 13 callsites
> >> rv370_pcie_rreg: 111 bytes, 21 callsites
> >> rv370_pcie_wreg: 113 bytes, 24 callsites
> >> r600_rcu_rreg: 111 bytes, 16 callsites
> >> r600_rcu_wreg: 113 bytes, 25 callsites
> >> cik_didt_rreg: 106 bytes, 10 callsites
> >> cik_didt_wreg: 107 bytes, 10 callsites
> >> tn_smc_rreg: 106 bytes, 126 callsites
> >> tn_smc_wreg: 107 bytes, 116 callsites
> >> eg_cg_rreg: 107 bytes, 20 callsites
> >> eg_cg_wreg: 108 bytes, 52 callsites
> 
> > Sorry haven't noticed that before:
> >
> > radeon_device.c is most likely not the right place for the non-inlined
> > functions. Please move them into to the appropriate files for each
> > generation.
> 
> Will do (probably tomorrow, not today).

Is this whole exercise really worthwhile?  This will be the 3rd or 4th time these have been inlined/uninlined.

> 
> Can you help me here a bit?
> There are LOTS of  *.c files in drm/radeon/.
> I guess r600_ functions should go into r600.c,

Yes.

> rv370_ to rv730_dpm.c (right?),

No.  rv370_ should go in r300.c

> but some of the function names are less clear (to me).
> 
> Where would you like eg_pif_phyN_r/wreg() go? evergreen.c?

Yes.

> Should eg_cg_r/wreg() also go to this file?

Yes.

> 
> cik_didt_r/wreg() - to cik.c?

Yes.

> 
> tn_smc_r/wreg()? Is tn = trinity? so, trinity_smc.c?

ni.c

Alex

ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] radeon: Deinline indirect register accessor functions
  2015-05-18 23:06     ` Deucher, Alexander
@ 2015-05-20 10:47       ` Denys Vlasenko
  0 siblings, 0 replies; 5+ messages in thread
From: Denys Vlasenko @ 2015-05-20 10:47 UTC (permalink / raw)
  To: Deucher, Alexander, Denys Vlasenko, Koenig, Christian
  Cc: Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 5919 bytes --]

On 05/19/2015 01:06 AM, Deucher, Alexander wrote:
>> -----Original Message-----
>> From: Denys Vlasenko [mailto:vda.linux@googlemail.com]
>> Sent: Monday, May 18, 2015 6:50 PM
>> To: Koenig, Christian
>> Cc: Denys Vlasenko; Deucher, Alexander; Linux Kernel Mailing List
>> Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions
>>
>> On Mon, May 18, 2015 at 9:09 PM, Christian König
>> <christian.koenig@amd.com> wrote:
>>>> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
>>>> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
>>>> eg_pif_phy0_rreg: 106 bytes, 13 callsites
>>>> eg_pif_phy0_wreg: 108 bytes, 13 callsites
>>>> eg_pif_phy1_rreg: 107 bytes, 13 callsites
>>>> eg_pif_phy1_wreg: 108 bytes, 13 callsites
>>>> rv370_pcie_rreg: 111 bytes, 21 callsites
>>>> rv370_pcie_wreg: 113 bytes, 24 callsites
>>>> r600_rcu_rreg: 111 bytes, 16 callsites
>>>> r600_rcu_wreg: 113 bytes, 25 callsites
>>>> cik_didt_rreg: 106 bytes, 10 callsites
>>>> cik_didt_wreg: 107 bytes, 10 callsites
>>>> tn_smc_rreg: 106 bytes, 126 callsites
>>>> tn_smc_wreg: 107 bytes, 116 callsites
>>>> eg_cg_rreg: 107 bytes, 20 callsites
>>>> eg_cg_wreg: 108 bytes, 52 callsites
>>
>>> Sorry haven't noticed that before:
>>>
>>> radeon_device.c is most likely not the right place for the non-inlined
>>> functions. Please move them into to the appropriate files for each
>>> generation.
>>
>> Will do (probably tomorrow, not today).
> 
> Is this whole exercise really worthwhile?
> This will be the 3rd or 4th time these have been inlined/uninlined.

When code grows by 65000 bytes, there ought to be a good reason to inline.
I don't see it.

Let's take a look at what these functions actually do. cik_didt_wreg() is:

       spin_lock_irqsave(&rdev->didt_idx_lock, flags);
       WREG32(CIK_DIDT_IND_INDEX, (reg));
       WREG32(CIK_DIDT_IND_DATA, (v));
       spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);

this compiles to (on defconfig + radeon enabled):

       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       48 83 ec 20             sub    $0x20,%rsp
       4c 89 65 e8             mov    %r12,-0x18(%rbp)
       4c 8d a7 cc 01 00 00    lea    0x1cc(%rdi),%r12
       48 89 5d e0             mov    %rbx,-0x20(%rbp)
       48 89 fb                mov    %rdi,%rbx
       4c 89 6d f0             mov    %r13,-0x10(%rbp)
       4c 89 75 f8             mov    %r14,-0x8(%rbp)
       4c 89 e7                mov    %r12,%rdi
       41 89 d6                mov    %edx,%r14d
       41 89 f5                mov    %esi,%r13d
       e8 20 6b 4d 00          callq  <_raw_spin_lock_irqsave> //spin_lock_irqsave
       48 8b 93 d0 01 00 00    mov    0x1d0(%rbx),%rdx
       44 89 aa 00 ca 00 00    mov    %r13d,0xca00(%rdx)       //WREG32
       48 8b 93 d0 01 00 00    mov    0x1d0(%rbx),%rdx
       44 89 b2 04 ca 00 00    mov    %r14d,0xca04(%rdx)       //WREG32
       4c 89 e7                mov    %r12,%rdi
       48 89 c6                mov    %rax,%rsi
       e8 b9 69 4d 00          callq  <_raw_spin_unlock_irqrestore> //spin_unlock_irqrestore
       48 8b 5d e0             mov    -0x20(%rbp),%rbx
       4c 8b 65 e8             mov    -0x18(%rbp),%r12
       4c 8b 6d f0             mov    -0x10(%rbp),%r13
       4c 8b 75 f8             mov    -0x8(%rbp),%r14
       c9                      leaveq
       c3                      retq

<_raw_spin_lock_irqsave>:
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       9c                      pushfq
       58                      pop    %rax
       fa                      cli
       ba 00 01 00 00          mov    $0x100,%edx
       f0 66 0f c1 17          lock xadd %dx,(%rdi)  // expensive
       0f b6 ce                movzbl %dh,%ecx
       38 d1                   cmp    %dl,%cl
       75 04                   jne    <_raw_spin_lock_irqsave+0x1c>
       5d                      pop    %rbp
       c3                      retq
       f3 90                   pause
       0f b6 17                movzbl (%rdi),%edx
       38 ca                   cmp    %cl,%dl
       75 f7                   jne    <_raw_spin_lock_irqsave+0x1a>
       5d                      pop    %rbp
       c3                      retq

<_raw_spin_unlock_irqrestore>:
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       80 07 01                addb   $0x1,(%rdi)
       56                      push   %rsi
       9d                      popfq                  //expensive
       5d                      pop    %rbp
       c3                      retq

Now, using attached test program, I measure how long
call+ret pair takes:

# ./timing_test64 callret
400000000 loops in 0.71467s = 1.79 nsec/loop for callret

Unlocked read-modify-write memory operation:

# ./timing_test64 or
400000000 loops in 0.86119s = 2.15 nsec/loop for or

Locked read-modify-write memory operations:

# ./timing_test64 lock_or
100000000 loops in 0.68902s = 6.89 nsec/loop for lock_or
# ./timing_test64 lock_xadd
100000000 loops in 0.68582s = 6.86 nsec/loop for lock_xadd

And POPF:

# ./timing_test64 popf
100000000 loops in 0.68861s = 6.89 nsec/loop for popf

This is on Sandy Bridge CPU with cycle time of about 0.30 ns:

# ./timing_test64 nothing
2000000000 loops in 0.59716s = 0.30 nsec/loop for nothing


So, what do we see?

call+ret takes 5 cycles. This is cheaper than one unlocked
RMW memory operation, which is 7 cycles.

Locked RMW is 21 cycles in the ideal case (this is what
spin_lock_irqsave does). POPF is also 21 cycles
(spin_unlock_irqrestore does this). Add to this two mmio
accesses (easily 50s of cycles) and all other necessary operations
visible in the assembly code - 5 memory stores,
7 memory loads, and two call+ret pairs.

I expect overhead of call+ret added by deinlining to be in 1-4%,
if you run a microbenchmark which does nothing but one of these ops.
-- 
vda

[-- Attachment #2: timing_test.c --]
[-- Type: text/x-csrc, Size: 8267 bytes --]

// To be unaffected by random cacheline placement, use generous "align":
//
// i686-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static
// x86_64-gcc -O2 -Wall -falign-loops=32 -falign-jumps=32 -falign-labels=32 -static

#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <stdio.h>

#if !defined(__i386__)
#define get_sysenter_addr() 0
#else
#include <elf.h>
long sysenter_addr;
/*
 * Locate the kernel's sysenter/vsyscall entry point (AT_SYSINFO) in the
 * ELF auxiliary vector, which sits in memory directly after the NULL
 * that terminates the environment block.  Caches the address in the
 * sysenter_addr global (used by sysenter_getpid's asm) and returns it.
 * If the kernel did not supply AT_SYSINFO, the test is skipped (exit 0).
 */
long get_sysenter_addr(char **envp)
{
	Elf32_auxv_t *av;

	/* Step past the environment strings to its NULL terminator... */
	while (*envp != NULL)
		envp++;
	/* ...the auxv entries begin right after it. */
	for (av = (Elf32_auxv_t *)(envp + 1); av->a_type != AT_NULL; av++) {
		if (av->a_type == AT_SYSINFO) {
			sysenter_addr = av->a_un.a_val;
			return sysenter_addr;
		}
	}
	fprintf(stderr, "AT_SYSINFO not supplied, can't test\n");
	exit(0); /* this is not a failure */
}

/*
 * Issue getpid (syscall nr 20 on i386) through the kernel's
 * sysenter/vsyscall entry point whose address get_sysenter_addr()
 * stored in the sysenter_addr global.  Bypasses the libc wrapper so
 * the loop in main() times only the kernel entry/exit path.
 * NOTE(review): i386-only; must not be called before get_sysenter_addr()
 * has run, or the indirect call goes through a zero pointer.
 */
void sysenter_getpid(void)
{
	asm volatile(
	"\n"   "	mov	$20,%eax" // GETPID
	"\n"   "	call	*sysenter_addr"
	);
}
#endif

/* Width-dependent assembler spellings used inside asm strings below:
 * operand-size suffix ("l"/"q") and register prefix ("e"/"r"),
 * so the same source assembles on both i386 and x86_64. */
#if defined(__i386__)
#define L_or_Q "l"
#define E_or_R "e"
#else
#define L_or_Q "q"
#define E_or_R "r"
#endif

/* Memory operand for the read-modify-write tests ("or", "lock_or", "lock_xadd"). */
static int memvar;

/* Target of the "callret" test: a bare RET, so call+ret is all that is timed. */
asm (
"\n"   "	.text"
"\n"   "ret__:	ret"
);

/*
 * Time one low-level x86 operation selected by MODE.
 *
 * Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE
 *
 * Each mode runs a tight loop around a single operation (usually inline
 * assembly), bracketed by CLOCK_MONOTONIC timestamps, and reports the
 * average cost in nanoseconds per loop iteration.
 *
 * When the iteration count is omitted, the count starts at 10,000 and
 * the measurement is re-run with progressively larger counts (see the
 * scaling logic after the printf) until one run lasts ~0.5s.
 */
int main(int argc, char **argv, char **envp)
{
	struct timespec start, end;
	unsigned long long duration;
	size_t loops, i;
	const char *mode;

	if (argc < 2) {
		printf("Usage: timing_test [MILLIONS_OF_ITERATIONS] MODE\n");
		return 1;
	}
	/* One argument: it is MODE and the loop count auto-scales.
	 * Two arguments: explicit count (in millions), then MODE. */
	mode = argv[2];
	if (!mode) {
		mode = argv[1];
		loops = 10*1000;
	} else {
		loops = (size_t)atol(argv[1]) * 1000000;
	}

 again:
	/* Baseline: empty loop body, measures pure loop overhead. */
	if (!strcmp(mode, "nothing")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile ("# nothing");
		}
	} else if (!strcmp(mode, "nop")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile ("nop");
		}
	/* Timestamp-counter reads, bare and with serializing fences. */
	} else if (!strcmp(mode, "rdtsc")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("rdtsc" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "lfence_rdtsc")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("lfence;rdtsc" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "lfence_rdtsc_lfence")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("");
			asm volatile ("lfence;rdtsc;lfence" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "mfence_rdtsc_mfence")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			unsigned int a, d;
			asm volatile ("mfence;rdtsc;mfence" : "=a" (a), "=d" (d));
		}
	} else if (!strcmp(mode, "rdtscp")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			unsigned int a, c, d;
			asm volatile ("rdtscp" : "=a" (a), "=c" (c), "=d" (d));
		}
	/* System call paths: vDSO-accelerated vs. full syscall. */
	} else if (!strcmp(mode, "gettimeofday")) {
		struct timeval tv;
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--)
			gettimeofday(&tv, 0);
	} else if (!strcmp(mode, "getpid")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--)
			syscall(SYS_getpid);
#if defined(__i386__)
	} else if (!strcmp(mode, "sysenter_getpid")) {
		get_sysenter_addr(envp);
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--)
			sysenter_getpid();
	/* Control-transfer instructions: iret/lret/call+ret/bare ret. */
	} else if (!strcmp(mode, "iret")) {
		/* "push cs" is itself a bit expensive, moving it out of loop */
		long saved_cs;
		asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	$0"	// flags
			"\n"   "	push	%0"	// cs
			"\n"   "	push	$1f"	// ip
			"\n"   "	iret"
			"\n"   "1:"
			:
			: "r" (saved_cs)
			);
		}
#endif
#if defined(__x86_64__)
	} else if (!strcmp(mode, "iret")) {
		/* 64-bit iretq pops ss:rsp as well, so push a 5-word frame. */
		long saved_cs;
		long saved_ss;
		asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
		asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	mov	%%rsp,%%rax"
			"\n"   "	push	%0"	// ss
			"\n"   "	push	%%rax"	// sp
			"\n"   "	push	$0"	// flags
			"\n"   "	push	%1"	// cs
			"\n"   "	push	$1f"	// ip
			"\n"   "	iretq"
			"\n"   "1:"
			:
			: "r" (saved_ss), "r" (saved_cs)
			: "ax"
			);
		}
#endif
	} else if (!strcmp(mode, "lret")) {
		/* "push cs" is itself a bit expensive, moving it out of loop */
		long saved_cs;
		asm volatile ("mov %%cs,%0" : "=r" (saved_cs));
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	%0"
			"\n"   "	push	$1f"
			"\n"   "	lret"L_or_Q
			"\n"   "1:"
			:
			: "r" (saved_cs)
			);
		}
	} else if (!strcmp(mode, "callret")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile ("call ret__");
		}
	} else if (!strcmp(mode, "ret")) {
		/* This is useful to measure delays due to
		 * return stack branch prediction not working
		 * (we aren't using paired call/rets here, as CPU expects).
		 * I observed "callret" test above being 4 times faster than this:
		 */
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	$1f"
			"\n"   "	ret"
			"\n"   "1:"
			);
		}
	/* Segment register load/read. */
	} else if (!strcmp(mode, "loadss")) {
		long saved_ss;
		asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile ("mov %0,%%ss" : : "r" (saved_ss));
		}
	} else if (!strcmp(mode, "readss")) {
		long saved_ss;
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile ("mov %%ss,%0" : "=r" (saved_ss));
		}
	/* LEAVE vs. its two-instruction equivalent. */
	} else if (!strcmp(mode, "leave")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	%"E_or_R"bp"
			"\n"   "	mov	%"E_or_R"sp,%"E_or_R"bp"
			"\n"   "	leave"
			);
		}
	} else if (!strcmp(mode, "noleave")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	%"E_or_R"bp"
			"\n"   "	mov	%"E_or_R"sp,%"E_or_R"bp"
			"\n"   "	mov	%"E_or_R"bp,%"E_or_R"sp"
			"\n"   "	pop	%"E_or_R"bp"
			);
		}
	/* Flags save/restore: POPF is what spin_unlock_irqrestore pays for. */
	} else if (!strcmp(mode, "pushf")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	pushf"
			"\n"   "	pop	%%"E_or_R"ax"
			:
			:
			: "ax"
			);
		}
	} else if (!strcmp(mode, "popf")) {
		long flags;
		asm volatile (
		"\n"   "	pushf"
		"\n"   "	pop	%0"
		: "=r" (flags)
		);
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	push	%0"
			"\n"   "	popf"
			:
			: "r" (flags)
			: "ax"
			);
		}
	/* Read-modify-write on memory, unlocked vs. LOCK-prefixed
	 * (the latter is what spin_lock_irqsave pays for). */
	} else if (!strcmp(mode, "or")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	orl $1,%0"
			:
			: "m" (memvar)
			);
		}
	} else if (!strcmp(mode, "lock_or")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	lock orl $1,%0"
			:
			: "m" (memvar)
			);
		}
	} else if (!strcmp(mode, "lock_xadd")) {
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--) {
			asm volatile (
			"\n"   "	lock xaddl %0,%1"
			:
			: "r" (0), "m" (memvar)
			);
		}
	} else if (!strcmp(mode, "rdpmc")) {
		// Unlikely to work.
		unsigned int eax, edx;
		unsigned int ecx = 0;
		clock_gettime(CLOCK_MONOTONIC, &start);
	        for (i = loops; i != 0; i--)
			asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
	} else {
		printf("Unknown mode %s\n", mode);
		return 1;
	}

	/* Total elapsed nanoseconds for the whole loop. */
	clock_gettime(CLOCK_MONOTONIC, &end);
	duration = (1000*1000*1000ULL * end.tv_sec + end.tv_nsec)
	         - (1000*1000*1000ULL * start.tv_sec + start.tv_nsec);
	printf("%lu loops in %.5fs = %.2f nsec/loop for %s\n",
		(unsigned long)loops, (double)duration * 1e-9,
		(double)duration / loops,
		mode
	);
	/* Auto-scaling (only when no explicit count was given): grow the
	 * loop count until a single run takes at least ~0.5 seconds. */
	if (!argv[2]) {
		if (duration < 90*1000*1000) {
			loops *= 10;
			goto again;
		}
		if (duration < 490*1000*1000) {
			loops *= 2;
			goto again;
		}
	}
	return 0;
}

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-05-20 10:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-18 19:02 [PATCH v2] radeon: Deinline indirect register accessor functions Denys Vlasenko
2015-05-18 19:09 ` Christian König
2015-05-18 22:50   ` Denys Vlasenko
2015-05-18 23:06     ` Deucher, Alexander
2015-05-20 10:47       ` Denys Vlasenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).