All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] radeon: Deinline indirect register accessor functions
@ 2015-05-18 17:59 Denys Vlasenko
  2015-05-18 17:59 ` [PATCH] radeon: Shrink radeon_ring_write() Denys Vlasenko
  2015-05-18 18:06 ` [PATCH] radeon: Deinline indirect register accessor functions Christian König
  0 siblings, 2 replies; 10+ messages in thread
From: Denys Vlasenko @ 2015-05-18 17:59 UTC (permalink / raw)
  To: Christian König; +Cc: Denys Vlasenko, Alex Deucher, linux-kernel

This patch deinlines indirect register accessor functions.

These functions perform two mmio accesses, framed by spinlock/unlock.
Spin lock/unlock by itself takes more than 50 cycles in ideal case.

With this .config: http://busybox.net/~vda/kernel_config,
after uninlining these functions have sizes and callsite counts
as follows:

r600_uvd_ctx_rreg: 111 bytes, 4 callsites
r600_uvd_ctx_wreg: 113 bytes, 5 callsites
eg_pif_phy0_rreg: 106 bytes, 13 callsites
eg_pif_phy0_wreg: 108 bytes, 13 callsites
eg_pif_phy1_rreg: 107 bytes, 13 callsites
eg_pif_phy1_wreg: 108 bytes, 13 callsites
rv370_pcie_rreg: 111 bytes, 21 callsites
rv370_pcie_wreg: 113 bytes, 24 callsites
r600_rcu_rreg: 111 bytes, 16 callsites
r600_rcu_wreg: 113 bytes, 25 callsites
cik_didt_rreg: 106 bytes, 10 callsites
cik_didt_wreg: 107 bytes, 10 callsites
r100_mm_rreg: 112 bytes, 2083 callsites
r100_mm_wreg: 116 bytes, 3570 callsites
tn_smc_rreg: 106 bytes, 126 callsites
tn_smc_wreg: 107 bytes, 116 callsites
eg_cg_rreg: 107 bytes, 20 callsites
eg_cg_wreg: 108 bytes, 52 callsites

Reduction in code size is more than 80,000 bytes:

    text     data      bss       dec     hex filename
85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
85657104 22294872 20627456 128579432 7a9f768 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: linux-kernel@vger.kernel.org
---
 drivers/gpu/drm/radeon/r100.c          |  34 +++++
 drivers/gpu/drm/radeon/radeon.h        | 229 +++------------------------------
 drivers/gpu/drm/radeon/radeon_device.c | 179 ++++++++++++++++++++++++++
 3 files changed, 233 insertions(+), 209 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 04f2514..95868c7 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -4090,6 +4090,40 @@ int r100_init(struct radeon_device *rdev)
 	return 0;
 }
 
+uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
+				    bool always_indirect)
+{
+	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
+	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
+		return readl(((void __iomem *)rdev->rmmio) + reg);
+	else {
+		unsigned long flags;
+		uint32_t ret;
+
+		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+
+		return ret;
+	}
+}
+
+void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
+				bool always_indirect)
+{
+	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
+		writel(v, ((void __iomem *)rdev->rmmio) + reg);
+	else {
+		unsigned long flags;
+
+		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+	}
+}
+
 u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
 {
 	if (reg < rdev->rio_mem_size)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5587603..bb6b25c 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2465,39 +2465,10 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
 
 #define RADEON_MIN_MMIO_SIZE 0x10000
 
-static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
-				    bool always_indirect)
-{
-	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
-	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
-		return readl(((void __iomem *)rdev->rmmio) + reg);
-	else {
-		unsigned long flags;
-		uint32_t ret;
-
-		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
-		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
-		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
-		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-
-		return ret;
-	}
-}
-
-static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
-				bool always_indirect)
-{
-	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
-		writel(v, ((void __iomem *)rdev->rmmio) + reg);
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
-		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
-		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
-		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-	}
-}
+uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
+		      bool always_indirect);
+void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
+		  bool always_indirect);
 
 u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
 void r100_io_wreg(struct radeon_device *rdev, u32 reg, u32 v);
@@ -2582,182 +2553,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
 /*
  * Indirect registers accessor
  */
-static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
-{
-	unsigned long flags;
-	uint32_t r;
-
-	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
-	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
-	r = RREG32(RADEON_PCIE_DATA);
-	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-	return r;
-}
-
-static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
-	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
-	WREG32(RADEON_PCIE_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-}
-
-static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
-	WREG32(TN_SMC_IND_INDEX_0, (reg));
-	r = RREG32(TN_SMC_IND_DATA_0);
-	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-	return r;
-}
-
-static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
-	WREG32(TN_SMC_IND_INDEX_0, (reg));
-	WREG32(TN_SMC_IND_DATA_0, (v));
-	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-}
-
-static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
-	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
-	r = RREG32(R600_RCU_DATA);
-	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-	return r;
-}
-
-static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
-	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
-	WREG32(R600_RCU_DATA, (v));
-	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-}
-
-static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
-	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_CG_IND_DATA);
-	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
-	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
-	WREG32(EVERGREEN_CG_IND_DATA, (v));
-	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
-	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
-	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-	return r;
-}
-
-static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
-	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
-	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
-	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
-	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
-	r = RREG32(R600_UVD_CTX_DATA);
-	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-	return r;
-}
-
-static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
-	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
-	WREG32(R600_UVD_CTX_DATA, (v));
-	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-}
-
-
-static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
-{
-	unsigned long flags;
-	u32 r;
-
-	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
-	WREG32(CIK_DIDT_IND_INDEX, (reg));
-	r = RREG32(CIK_DIDT_IND_DATA);
-	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-	return r;
-}
-
-static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
-	WREG32(CIK_DIDT_IND_INDEX, (reg));
-	WREG32(CIK_DIDT_IND_DATA, (v));
-	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-}
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
 
 void r100_pll_errata_after_index(struct radeon_device *rdev);
 
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index bd7519f..6712505 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
 		rdev->flags &= ~RADEON_IS_PX;
 }
 
+/*
+ * Indirect registers accessor
+ */
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
+{
+	unsigned long flags;
+	uint32_t r;
+
+	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+	r = RREG32(RADEON_PCIE_DATA);
+	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+	return r;
+}
+
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+	WREG32(RADEON_PCIE_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+}
+
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+	WREG32(TN_SMC_IND_INDEX_0, (reg));
+	r = RREG32(TN_SMC_IND_DATA_0);
+	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+	return r;
+}
+
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+	WREG32(TN_SMC_IND_INDEX_0, (reg));
+	WREG32(TN_SMC_IND_DATA_0, (v));
+	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+}
+
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+	r = RREG32(R600_RCU_DATA);
+	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+	return r;
+}
+
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+	WREG32(R600_RCU_DATA, (v));
+	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+}
+
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_CG_IND_DATA);
+	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+	return r;
+}
+
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+	WREG32(EVERGREEN_CG_IND_DATA, (v));
+	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+}
+
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+	return r;
+}
+
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+	return r;
+}
+
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
+	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+	r = RREG32(R600_UVD_CTX_DATA);
+	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+	return r;
+}
+
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+	WREG32(R600_UVD_CTX_DATA, (v));
+	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+}
+
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+	WREG32(CIK_DIDT_IND_INDEX, (reg));
+	r = RREG32(CIK_DIDT_IND_DATA);
+	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+	return r;
+}
+
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+	WREG32(CIK_DIDT_IND_INDEX, (reg));
+	WREG32(CIK_DIDT_IND_DATA, (v));
+	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+}
+
 /**
  * radeon_program_register_sequence - program an array of registers.
  *
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH] radeon: Shrink radeon_ring_write()
  2015-05-18 17:59 [PATCH] radeon: Deinline indirect register accessor functions Denys Vlasenko
@ 2015-05-18 17:59 ` Denys Vlasenko
  2015-05-18 18:11   ` Christian König
  2015-05-18 18:06 ` [PATCH] radeon: Deinline indirect register accessor functions Christian König
  1 sibling, 1 reply; 10+ messages in thread
From: Denys Vlasenko @ 2015-05-18 17:59 UTC (permalink / raw)
  To: Christian König; +Cc: Denys Vlasenko, Alex Deucher, linux-kernel

Inlined radeon_ring_write() has 729 callers, which amounts to about 50000
bytes of code. however, deinlining it is probably too much
of a performance impact.

This patch shrinks slow path a bit and optimizes fast path.
Comparison of generated machine code is below:

old___________________________________ new____________________________
55             push %rbp               55         push %rbp
4889e5         mov  %rsp,%rbp          ff4f38     decl 0x38(%rdi)
4154           push %r12               4889e5     mov  %rsp,%rbp
4189f4         mov  %esi,%r12d         4154       push %r12
53             push %rbx               4189f4     mov  %esi,%r12d
837f3800       cmpl $0x0,0x38(%rdi)    53         push %rbx
4889fb         mov  %rdi,%rbx          4889fb     mov  %rdi,%rbx
7f0e           jg   <.Lbl>             7905       jns  <.Lbl>
48c7c78f51a785 mov  $message,%rdi
31c0           xor  %eax,%eax
e89306f9ff     call <drm_err>          e8cbffffff call <radeon_ring_overflow>
    .Lbl:
8b4328         mov  0x28(%rbx),%eax    8b5328     mov  0x28(%rbx),%edx
488b5308       mov  0x8(%rbx),%rdx     488b4308   mov  0x8(%rbx),%rax
89c1           mov  %eax,%ecx          488d0490   lea  (%rax,%rdx,4),%rax
ffc0           inc  %eax
488d148a       lea  (%rdx,%rcx,4),%rdx 448920     mov  %r12d,(%rax)
448922         mov  %r12d,(%rdx)       8b4328     mov  0x28(%rbx),%eax
234354         and  0x54(%rbx),%eax    ff4b34     decl 0x34(%rbx)
ff4b38         decl 0x38(%rbx)         ffc0       inc  %eax
ff4b34         decl 0x34(%rbx)         234354     and  0x54(%rbx),%eax
894328         mov  %eax,0x28(%rbx)    894328     mov  %eax,0x28(%rbx)
5b             pop  %rbx               5b         pop  %rbx
415c           pop  %r12               415c       pop  %r12
5d             pop  %rbp               5d         pop  %rbp

This shaves off more than 10 kbytes of code off the kernel:

    text     data      bss       dec     hex filename
85657104 22294872 20627456 128579432 7a9f768 vmlinux.before
85646544 22294872 20627456 128568872 7a9ce28 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: linux-kernel@vger.kernel.org
---
 drivers/gpu/drm/radeon/radeon.h      | 11 +++++------
 drivers/gpu/drm/radeon/radeon_ring.c |  5 +++++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index bb6b25c..9106873 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2658,14 +2658,13 @@ void radeon_atombios_fini(struct radeon_device *rdev);
  *
  * Write a value to the requested ring buffer (all asics).
  */
+void radeon_ring_overflow(void);
 static inline void radeon_ring_write(struct radeon_ring *ring, uint32_t v)
 {
-	if (ring->count_dw <= 0)
-		DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
-
-	ring->ring[ring->wptr++] = v;
-	ring->wptr &= ring->ptr_mask;
-	ring->count_dw--;
+	if (--ring->count_dw < 0)
+		radeon_ring_overflow();
+	ring->ring[ring->wptr] = v;
+	ring->wptr = (ring->wptr + 1) & ring->ptr_mask;
 	ring->ring_free_dw--;
 }
 
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 2456f69..8204c23 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -126,6 +126,11 @@ int radeon_ring_alloc(struct radeon_device *rdev, struct radeon_ring *ring, unsi
 	return 0;
 }
 
+void radeon_ring_overflow(void)
+{
+	DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
+}
+
 /**
  * radeon_ring_lock - lock the ring and allocate space on it
  *
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Deinline indirect register accessor functions
  2015-05-18 17:59 [PATCH] radeon: Deinline indirect register accessor functions Denys Vlasenko
  2015-05-18 17:59 ` [PATCH] radeon: Shrink radeon_ring_write() Denys Vlasenko
@ 2015-05-18 18:06 ` Christian König
  2015-05-18 18:50   ` Denys Vlasenko
  1 sibling, 1 reply; 10+ messages in thread
From: Christian König @ 2015-05-18 18:06 UTC (permalink / raw)
  To: Denys Vlasenko; +Cc: Alex Deucher, linux-kernel

I'm actually surprised how often people come along with that. The last 
time we tried this it caused a noticeable performance drop.

Basic problem is that this line:
> +	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
optimizes away in most of the cases which reduces the call to a readl 
which is way faster than the spinlock path.

So this is a NAK,
Christian.

On 18.05.2015 19:59, Denys Vlasenko wrote:
> This patch deinlines indirect register accessor functions.
>
> These functions perform two mmio accesses, framed by spinlock/unlock.
> Spin lock/unlock by itself takes more than 50 cycles in ideal case.
>
> With this .config: http://busybox.net/~vda/kernel_config,
> after uninlining these functions have sizes and callsite counts
> as follows:
>
> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
> eg_pif_phy0_rreg: 106 bytes, 13 callsites
> eg_pif_phy0_wreg: 108 bytes, 13 callsites
> eg_pif_phy1_rreg: 107 bytes, 13 callsites
> eg_pif_phy1_wreg: 108 bytes, 13 callsites
> rv370_pcie_rreg: 111 bytes, 21 callsites
> rv370_pcie_wreg: 113 bytes, 24 callsites
> r600_rcu_rreg: 111 bytes, 16 callsites
> r600_rcu_wreg: 113 bytes, 25 callsites
> cik_didt_rreg: 106 bytes, 10 callsites
> cik_didt_wreg: 107 bytes, 10 callsites
> r100_mm_rreg: 112 bytes, 2083 callsites
> r100_mm_wreg: 116 bytes, 3570 callsites
> tn_smc_rreg: 106 bytes, 126 callsites
> tn_smc_wreg: 107 bytes, 116 callsites
> eg_cg_rreg: 107 bytes, 20 callsites
> eg_cg_wreg: 108 bytes, 52 callsites
>
> Reduction in code size is more than 80,000 bytes:
>
>      text     data      bss       dec     hex filename
> 85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
> 85657104 22294872 20627456 128579432 7a9f768 vmlinux
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: linux-kernel@vger.kernel.org
> ---
>   drivers/gpu/drm/radeon/r100.c          |  34 +++++
>   drivers/gpu/drm/radeon/radeon.h        | 229 +++------------------------------
>   drivers/gpu/drm/radeon/radeon_device.c | 179 ++++++++++++++++++++++++++
>   3 files changed, 233 insertions(+), 209 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 04f2514..95868c7 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -4090,6 +4090,40 @@ int r100_init(struct radeon_device *rdev)
>   	return 0;
>   }
>   
> +uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
> +				    bool always_indirect)
> +{
> +	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
> +	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> +		return readl(((void __iomem *)rdev->rmmio) + reg);
> +	else {
> +		unsigned long flags;
> +		uint32_t ret;
> +
> +		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> +		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> +		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> +		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> +
> +		return ret;
> +	}
> +}
> +
> +void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
> +				bool always_indirect)
> +{
> +	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> +		writel(v, ((void __iomem *)rdev->rmmio) + reg);
> +	else {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> +		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> +		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> +		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> +	}
> +}
> +
>   u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
>   {
>   	if (reg < rdev->rio_mem_size)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5587603..bb6b25c 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2465,39 +2465,10 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
>   
>   #define RADEON_MIN_MMIO_SIZE 0x10000
>   
> -static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
> -				    bool always_indirect)
> -{
> -	/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
> -	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> -		return readl(((void __iomem *)rdev->rmmio) + reg);
> -	else {
> -		unsigned long flags;
> -		uint32_t ret;
> -
> -		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> -		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> -		ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> -		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> -
> -		return ret;
> -	}
> -}
> -
> -static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
> -				bool always_indirect)
> -{
> -	if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> -		writel(v, ((void __iomem *)rdev->rmmio) + reg);
> -	else {
> -		unsigned long flags;
> -
> -		spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> -		writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> -		writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> -		spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> -	}
> -}
> +uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
> +		      bool always_indirect);
> +void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
> +		  bool always_indirect);
>   
>   u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
>   void r100_io_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> @@ -2582,182 +2553,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
>   /*
>    * Indirect registers accessor
>    */
> -static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> -{
> -	unsigned long flags;
> -	uint32_t r;
> -
> -	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> -	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> -	r = RREG32(RADEON_PCIE_DATA);
> -	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> -	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> -	WREG32(RADEON_PCIE_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> -}
> -
> -static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> -	WREG32(TN_SMC_IND_INDEX_0, (reg));
> -	r = RREG32(TN_SMC_IND_DATA_0);
> -	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> -	WREG32(TN_SMC_IND_INDEX_0, (reg));
> -	WREG32(TN_SMC_IND_DATA_0, (v));
> -	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> -}
> -
> -static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> -	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> -	r = RREG32(R600_RCU_DATA);
> -	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> -	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> -	WREG32(R600_RCU_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> -}
> -
> -static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> -	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_CG_IND_DATA);
> -	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> -	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_CG_IND_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> -	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> -	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> -	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> -	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> -	r = RREG32(R600_UVD_CTX_DATA);
> -	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> -	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> -	WREG32(R600_UVD_CTX_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> -}
> -
> -
> -static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> -{
> -	unsigned long flags;
> -	u32 r;
> -
> -	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> -	WREG32(CIK_DIDT_IND_INDEX, (reg));
> -	r = RREG32(CIK_DIDT_IND_DATA);
> -	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> -	return r;
> -}
> -
> -static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> -	WREG32(CIK_DIDT_IND_INDEX, (reg));
> -	WREG32(CIK_DIDT_IND_DATA, (v));
> -	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> -}
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
>   
>   void r100_pll_errata_after_index(struct radeon_device *rdev);
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index bd7519f..6712505 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
>   		rdev->flags &= ~RADEON_IS_PX;
>   }
>   
> +/*
> + * Indirect registers accessor
> + */
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> +{
> +	unsigned long flags;
> +	uint32_t r;
> +
> +	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> +	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> +	r = RREG32(RADEON_PCIE_DATA);
> +	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> +	return r;
> +}
> +
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> +	WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> +	WREG32(RADEON_PCIE_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> +}
> +
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> +	WREG32(TN_SMC_IND_INDEX_0, (reg));
> +	r = RREG32(TN_SMC_IND_DATA_0);
> +	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> +	return r;
> +}
> +
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> +	WREG32(TN_SMC_IND_INDEX_0, (reg));
> +	WREG32(TN_SMC_IND_DATA_0, (v));
> +	spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> +}
> +
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> +	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> +	r = RREG32(R600_RCU_DATA);
> +	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> +	return r;
> +}
> +
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> +	WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> +	WREG32(R600_RCU_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> +}
> +
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> +	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_CG_IND_DATA);
> +	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> +	WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_CG_IND_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> +	r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +	return r;
> +}
> +
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> +	WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> +	WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> +	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> +	r = RREG32(R600_UVD_CTX_DATA);
> +	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> +	return r;
> +}
> +
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> +	WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> +	WREG32(R600_UVD_CTX_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> +}
> +
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> +{
> +	unsigned long flags;
> +	u32 r;
> +
> +	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> +	WREG32(CIK_DIDT_IND_INDEX, (reg));
> +	r = RREG32(CIK_DIDT_IND_DATA);
> +	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> +	return r;
> +}
> +
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> +	WREG32(CIK_DIDT_IND_INDEX, (reg));
> +	WREG32(CIK_DIDT_IND_DATA, (v));
> +	spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> +}
> +
>   /**
>    * radeon_program_register_sequence - program an array of registers.
>    *


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Shrink radeon_ring_write()
  2015-05-18 17:59 ` [PATCH] radeon: Shrink radeon_ring_write() Denys Vlasenko
@ 2015-05-18 18:11   ` Christian König
  2015-05-18 18:25     ` Denys Vlasenko
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2015-05-18 18:11 UTC (permalink / raw)
  To: Denys Vlasenko; +Cc: Alex Deucher, linux-kernel

De-duplicating the error message is probably a good idea, but are the 
remaining code changes really necessary for the size reduction?

Regards,
Christian.

On 18.05.2015 19:59, Denys Vlasenko wrote:
> Inlined radeon_ring_write() has 729 callers, which amounts to about 50000
> bytes of code. however, deinlining it is probably too much
> of a performance impact.
>
> This patch shrinks slow path a bit and optimizes fast path.
> Comparison of generated machine code is below:
>
> old___________________________________ new____________________________
> 55             push %rbp               55         push %rbp
> 4889e5         mov  %rsp,%rbp          ff4f38     decl 0x38(%rdi)
> 4154           push %r12               4889e5     mov  %rsp,%rbp
> 4189f4         mov  %esi,%r12d         4154       push %r12
> 53             push %rbx               4189f4     mov  %esi,%r12d
> 837f3800       cmpl $0x0,0x38(%rdi)    53         push %rbx
> 4889fb         mov  %rdi,%rbx          4889fb     mov  %rdi,%rbx
> 7f0e           jg   <.Lbl>             7905       jns  <.Lbl>
> 48c7c78f51a785 mov  $message,%rdi
> 31c0           xor  %eax,%eax
> e89306f9ff     call <drm_err>          e8cbffffff call <radeon_ring_overflow>
>      .Lbl:
> 8b4328         mov  0x28(%rbx),%eax    8b5328     mov  0x28(%rbx),%edx
> 488b5308       mov  0x8(%rbx),%rdx     488b4308   mov  0x8(%rbx),%rax
> 89c1           mov  %eax,%ecx          488d0490   lea  (%rax,%rdx,4),%rax
> ffc0           inc  %eax
> 488d148a       lea  (%rdx,%rcx,4),%rdx 448920     mov  %r12d,(%rax)
> 448922         mov  %r12d,(%rdx)       8b4328     mov  0x28(%rbx),%eax
> 234354         and  0x54(%rbx),%eax    ff4b34     decl 0x34(%rbx)
> ff4b38         decl 0x38(%rbx)         ffc0       inc  %eax
> ff4b34         decl 0x34(%rbx)         234354     and  0x54(%rbx),%eax
> 894328         mov  %eax,0x28(%rbx)    894328     mov  %eax,0x28(%rbx)
> 5b             pop  %rbx               5b         pop  %rbx
> 415c           pop  %r12               415c       pop  %r12
> 5d             pop  %rbp               5d         pop  %rbp
>
> This shaves off more than 10 kbytes of code off the kernel:
>
>      text     data      bss       dec     hex filename
> 85657104 22294872 20627456 128579432 7a9f768 vmlinux.before
> 85646544 22294872 20627456 128568872 7a9ce28 vmlinux
>
> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> Cc: linux-kernel@vger.kernel.org
> ---
>   drivers/gpu/drm/radeon/radeon.h      | 11 +++++------
>   drivers/gpu/drm/radeon/radeon_ring.c |  5 +++++
>   2 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index bb6b25c..9106873 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2658,14 +2658,13 @@ void radeon_atombios_fini(struct radeon_device *rdev);
>    *
>    * Write a value to the requested ring buffer (all asics).
>    */
> +void radeon_ring_overflow(void);
>   static inline void radeon_ring_write(struct radeon_ring *ring, uint32_t v)
>   {
> -	if (ring->count_dw <= 0)
> -		DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
> -
> -	ring->ring[ring->wptr++] = v;
> -	ring->wptr &= ring->ptr_mask;
> -	ring->count_dw--;
> +	if (--ring->count_dw < 0)
> +		radeon_ring_overflow();
> +	ring->ring[ring->wptr] = v;
> +	ring->wptr = (ring->wptr + 1) & ring->ptr_mask;
>   	ring->ring_free_dw--;
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
> index 2456f69..8204c23 100644
> --- a/drivers/gpu/drm/radeon/radeon_ring.c
> +++ b/drivers/gpu/drm/radeon/radeon_ring.c
> @@ -126,6 +126,11 @@ int radeon_ring_alloc(struct radeon_device *rdev, struct radeon_ring *ring, unsi
>   	return 0;
>   }
>   
> +void radeon_ring_overflow(void)
> +{
> +	DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
> +}
> +
>   /**
>    * radeon_ring_lock - lock the ring and allocate space on it
>    *


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Shrink radeon_ring_write()
  2015-05-18 18:11   ` Christian König
@ 2015-05-18 18:25     ` Denys Vlasenko
  2015-05-18 19:01       ` Christian König
  0 siblings, 1 reply; 10+ messages in thread
From: Denys Vlasenko @ 2015-05-18 18:25 UTC (permalink / raw)
  To: Christian König; +Cc: Alex Deucher, linux-kernel

On 05/18/2015 08:11 PM, Christian König wrote:
> De-duplicating the error message is probably a good idea,
> but are the remaining code changes really necessary for the size reduction?

The conversion from

   if (ring->count_dw <= 0)
        error;
   ...
   ring->count_dw--;

to

   if (--ring->count_dw < 0)
        error;
   ...

eliminates one access to ring->count_dw.


Conversion from

    ring->ring[ring->wptr++] = v;
    ring->wptr &= ring->ptr_mask;

to

    ring->ring[ring->wptr] = v;
    ring->wptr = (ring->wptr + 1) & ring->ptr_mask;

ideally (with infinitely smart compiler)
shouldn't be necessary, but at least for my gcc
version it eliminates one additional insn:
gcc copies ring->wptr to a scratch reg, then increments it,
then uses scratch reg for addressing...


> On 18.05.2015 19:59, Denys Vlasenko wrote:
>> Inlined radeon_ring_write() has 729 callers, which amounts to about 50000
>> bytes of code. however, deinlining it is probably too much
>> of a performance impact.
>>
>> This patch shrinks slow path a bit and optimizes fast path.
>> Comparison of generated machine code is below:
>>
>> old___________________________________ new____________________________
>> 55             push %rbp               55         push %rbp
>> 4889e5         mov  %rsp,%rbp          ff4f38     decl 0x38(%rdi)
>> 4154           push %r12               4889e5     mov  %rsp,%rbp
>> 4189f4         mov  %esi,%r12d         4154       push %r12
>> 53             push %rbx               4189f4     mov  %esi,%r12d
>> 837f3800       cmpl $0x0,0x38(%rdi)    53         push %rbx
>> 4889fb         mov  %rdi,%rbx          4889fb     mov  %rdi,%rbx
>> 7f0e           jg   <.Lbl>             7905       jns  <.Lbl>
>> 48c7c78f51a785 mov  $message,%rdi
>> 31c0           xor  %eax,%eax
>> e89306f9ff     call <drm_err>          e8cbffffff call <radeon_ring_overflow>
>>      .Lbl:
>> 8b4328         mov  0x28(%rbx),%eax    8b5328     mov  0x28(%rbx),%edx
>> 488b5308       mov  0x8(%rbx),%rdx     488b4308   mov  0x8(%rbx),%rax
>> 89c1           mov  %eax,%ecx          488d0490   lea  (%rax,%rdx,4),%rax
>> ffc0           inc  %eax
>> 488d148a       lea  (%rdx,%rcx,4),%rdx 448920     mov  %r12d,(%rax)
>> 448922         mov  %r12d,(%rdx)       8b4328     mov  0x28(%rbx),%eax
>> 234354         and  0x54(%rbx),%eax    ff4b34     decl 0x34(%rbx)
>> ff4b38         decl 0x38(%rbx)         ffc0       inc  %eax
>> ff4b34         decl 0x34(%rbx)         234354     and  0x54(%rbx),%eax
>> 894328         mov  %eax,0x28(%rbx)    894328     mov  %eax,0x28(%rbx)
>> 5b             pop  %rbx               5b         pop  %rbx
>> 415c           pop  %r12               415c       pop  %r12
>> 5d             pop  %rbp               5d         pop  %rbp
>>
>> This shaves off more than 10 kbytes of code off the kernel:
>>
>>      text     data      bss       dec     hex filename
>> 85657104 22294872 20627456 128579432 7a9f768 vmlinux.before
>> 85646544 22294872 20627456 128568872 7a9ce28 vmlinux
>>
>> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
>> Cc: Christian König <christian.koenig@amd.com>
>> Cc: Alex Deucher <alexander.deucher@amd.com>
>> Cc: linux-kernel@vger.kernel.org
>> ---
>>   drivers/gpu/drm/radeon/radeon.h      | 11 +++++------
>>   drivers/gpu/drm/radeon/radeon_ring.c |  5 +++++
>>   2 files changed, 10 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>> index bb6b25c..9106873 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -2658,14 +2658,13 @@ void radeon_atombios_fini(struct radeon_device *rdev);
>>    *
>>    * Write a value to the requested ring buffer (all asics).
>>    */
>> +void radeon_ring_overflow(void);
>>   static inline void radeon_ring_write(struct radeon_ring *ring, uint32_t v)
>>   {
>> -    if (ring->count_dw <= 0)
>> -        DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
>> -
>> -    ring->ring[ring->wptr++] = v;
>> -    ring->wptr &= ring->ptr_mask;
>> -    ring->count_dw--;
>> +    if (--ring->count_dw < 0)
>> +        radeon_ring_overflow();
>> +    ring->ring[ring->wptr] = v;
>> +    ring->wptr = (ring->wptr + 1) & ring->ptr_mask;
>>       ring->ring_free_dw--;
>>   }
>>   diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
>> index 2456f69..8204c23 100644
>> --- a/drivers/gpu/drm/radeon/radeon_ring.c
>> +++ b/drivers/gpu/drm/radeon/radeon_ring.c
>> @@ -126,6 +126,11 @@ int radeon_ring_alloc(struct radeon_device *rdev, struct radeon_ring *ring, unsi
>>       return 0;
>>   }
>>   +void radeon_ring_overflow(void)
>> +{
>> +    DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
>> +}
>> +
>>   /**
>>    * radeon_ring_lock - lock the ring and allocate space on it
>>    *
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Deinline indirect register accessor functions
  2015-05-18 18:06 ` [PATCH] radeon: Deinline indirect register accessor functions Christian König
@ 2015-05-18 18:50   ` Denys Vlasenko
  2015-05-18 19:04     ` Christian König
  0 siblings, 1 reply; 10+ messages in thread
From: Denys Vlasenko @ 2015-05-18 18:50 UTC (permalink / raw)
  To: Christian König; +Cc: Alex Deucher, linux-kernel

On 05/18/2015 08:06 PM, Christian König wrote:
> I'm actually surprised how often people come along with that. The last time we tried this it caused a noticeable performance drop.
> 
> Basic problem is that this line:
>> +    if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> optimizes away in most of the cases which reduces the call to a readl which is way faster than the spinlock path.
> 
> So this is a NAK,


Fair enough.

I'm preparing a v2 where the fast branch of r100_mm_{r,w}reg() will stay inlined.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Shrink radeon_ring_write()
  2015-05-18 18:25     ` Denys Vlasenko
@ 2015-05-18 19:01       ` Christian König
  0 siblings, 0 replies; 10+ messages in thread
From: Christian König @ 2015-05-18 19:01 UTC (permalink / raw)
  To: Denys Vlasenko; +Cc: Alex Deucher, linux-kernel

Well thinking about this we could actually get completely ride of 
decrementing count_dw if we precalculate the last valid wptr instead.

That would also reduce the number of error messages to once every ring 
submit, but I would say this is actually a rather positive side effect.

Regards,
Christian.

On 18.05.2015 20:25, Denys Vlasenko wrote:
> On 05/18/2015 08:11 PM, Christian König wrote:
>> De-duplicating the error message is probably a good idea,
>> but are the remaining code changes really necessary for the size reduction?
> The conversion from
>
>     if (ring->count_dw <= 0)
>          error;
>     ...
>     ring->count_dw--;
>
> to
>
>     if (--ring->count_dw < 0)
>          error;
>     ...
>
> eliminates one access to ring->count_dw.
>
>
> Conversion from
>
>      ring->ring[ring->wptr++] = v;
>      ring->wptr &= ring->ptr_mask;
>
> to
>
>      ring->ring[ring->wptr] = v;
>      ring->wptr = (ring->wptr + 1) & ring->ptr_mask;
>
> ideally (with infinitely smart compiler)
> shouldn't be necessary, but at least for my gcc
> version it eliminates one additional insn:
> gcc copies ring->wptr to a scratch reg, then increments it,
> then uses scratch reg for addressing...
>
>
>> On 18.05.2015 19:59, Denys Vlasenko wrote:
>>> Inlined radeon_ring_write() has 729 callers, which amounts to about 50000
>>> bytes of code. however, deinlining it is probably too much
>>> of a performance impact.
>>>
>>> This patch shrinks slow path a bit and optimizes fast path.
>>> Comparison of generated machine code is below:
>>>
>>> old___________________________________ new____________________________
>>> 55             push %rbp               55         push %rbp
>>> 4889e5         mov  %rsp,%rbp          ff4f38     decl 0x38(%rdi)
>>> 4154           push %r12               4889e5     mov  %rsp,%rbp
>>> 4189f4         mov  %esi,%r12d         4154       push %r12
>>> 53             push %rbx               4189f4     mov  %esi,%r12d
>>> 837f3800       cmpl $0x0,0x38(%rdi)    53         push %rbx
>>> 4889fb         mov  %rdi,%rbx          4889fb     mov  %rdi,%rbx
>>> 7f0e           jg   <.Lbl>             7905       jns  <.Lbl>
>>> 48c7c78f51a785 mov  $message,%rdi
>>> 31c0           xor  %eax,%eax
>>> e89306f9ff     call <drm_err>          e8cbffffff call <radeon_ring_overflow>
>>>       .Lbl:
>>> 8b4328         mov  0x28(%rbx),%eax    8b5328     mov  0x28(%rbx),%edx
>>> 488b5308       mov  0x8(%rbx),%rdx     488b4308   mov  0x8(%rbx),%rax
>>> 89c1           mov  %eax,%ecx          488d0490   lea  (%rax,%rdx,4),%rax
>>> ffc0           inc  %eax
>>> 488d148a       lea  (%rdx,%rcx,4),%rdx 448920     mov  %r12d,(%rax)
>>> 448922         mov  %r12d,(%rdx)       8b4328     mov  0x28(%rbx),%eax
>>> 234354         and  0x54(%rbx),%eax    ff4b34     decl 0x34(%rbx)
>>> ff4b38         decl 0x38(%rbx)         ffc0       inc  %eax
>>> ff4b34         decl 0x34(%rbx)         234354     and  0x54(%rbx),%eax
>>> 894328         mov  %eax,0x28(%rbx)    894328     mov  %eax,0x28(%rbx)
>>> 5b             pop  %rbx               5b         pop  %rbx
>>> 415c           pop  %r12               415c       pop  %r12
>>> 5d             pop  %rbp               5d         pop  %rbp
>>>
>>> This shaves off more than 10 kbytes of code off the kernel:
>>>
>>>       text     data      bss       dec     hex filename
>>> 85657104 22294872 20627456 128579432 7a9f768 vmlinux.before
>>> 85646544 22294872 20627456 128568872 7a9ce28 vmlinux
>>>
>>> Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
>>> Cc: Christian König <christian.koenig@amd.com>
>>> Cc: Alex Deucher <alexander.deucher@amd.com>
>>> Cc: linux-kernel@vger.kernel.org
>>> ---
>>>    drivers/gpu/drm/radeon/radeon.h      | 11 +++++------
>>>    drivers/gpu/drm/radeon/radeon_ring.c |  5 +++++
>>>    2 files changed, 10 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>>> index bb6b25c..9106873 100644
>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>> @@ -2658,14 +2658,13 @@ void radeon_atombios_fini(struct radeon_device *rdev);
>>>     *
>>>     * Write a value to the requested ring buffer (all asics).
>>>     */
>>> +void radeon_ring_overflow(void);
>>>    static inline void radeon_ring_write(struct radeon_ring *ring, uint32_t v)
>>>    {
>>> -    if (ring->count_dw <= 0)
>>> -        DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
>>> -
>>> -    ring->ring[ring->wptr++] = v;
>>> -    ring->wptr &= ring->ptr_mask;
>>> -    ring->count_dw--;
>>> +    if (--ring->count_dw < 0)
>>> +        radeon_ring_overflow();
>>> +    ring->ring[ring->wptr] = v;
>>> +    ring->wptr = (ring->wptr + 1) & ring->ptr_mask;
>>>        ring->ring_free_dw--;
>>>    }
>>>    diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
>>> index 2456f69..8204c23 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_ring.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_ring.c
>>> @@ -126,6 +126,11 @@ int radeon_ring_alloc(struct radeon_device *rdev, struct radeon_ring *ring, unsi
>>>        return 0;
>>>    }
>>>    +void radeon_ring_overflow(void)
>>> +{
>>> +    DRM_ERROR("radeon: writing more dwords to the ring than expected!\n");
>>> +}
>>> +
>>>    /**
>>>     * radeon_ring_lock - lock the ring and allocate space on it
>>>     *


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Deinline indirect register accessor functions
  2015-05-18 18:50   ` Denys Vlasenko
@ 2015-05-18 19:04     ` Christian König
  2015-05-18 19:22         ` Ilia Mirkin
  0 siblings, 1 reply; 10+ messages in thread
From: Christian König @ 2015-05-18 19:04 UTC (permalink / raw)
  To: Denys Vlasenko; +Cc: Alex Deucher, linux-kernel

On 18.05.2015 20:50, Denys Vlasenko wrote:
> On 05/18/2015 08:06 PM, Christian König wrote:
>> I'm actually surprised how often people come along with that. The last time we tried this it caused a noticeable performance drop.
>>
>> Basic problem is that this line:
>>> +    if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
>> optimizes away in most of the cases which reduces the call to a readl which is way faster than the spinlock path.
>>
>> So this is a NAK,
>
> Fair enough.
>
> I'm preparing a v2 where the fast branch of r100_mm_{r,w}reg() will stay inlined.
>
Sounds good to be, but IIRC that was suggested the last time this came 
up as well. You might just want to google a bit why it wasn't done like 
this before submitting the patch for review.

BTW: Please CC the dri-devel list as well, cause not everybody is 
reading on linux-kernel.

Regards,
Christian.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Deinline indirect register accessor functions
  2015-05-18 19:04     ` Christian König
@ 2015-05-18 19:22         ` Ilia Mirkin
  0 siblings, 0 replies; 10+ messages in thread
From: Ilia Mirkin @ 2015-05-18 19:22 UTC (permalink / raw)
  To: Christian König
  Cc: Denys Vlasenko, Alex Deucher, linux-kernel, Lauri Kasanen, dri-devel

On Mon, May 18, 2015 at 3:04 PM, Christian König
<christian.koenig@amd.com> wrote:
> On 18.05.2015 20:50, Denys Vlasenko wrote:
>>
>> On 05/18/2015 08:06 PM, Christian König wrote:
>>>
>>> I'm actually surprised how often people come along with that. The last
>>> time we tried this it caused a noticeable performance drop.
>>>
>>> Basic problem is that this line:
>>>>
>>>> +    if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) &&
>>>> !always_indirect)
>>>
>>> optimizes away in most of the cases which reduces the call to a readl
>>> which is way faster than the spinlock path.
>>>
>>> So this is a NAK,
>>
>>
>> Fair enough.
>>
>> I'm preparing a v2 where the fast branch of r100_mm_{r,w}reg() will stay
>> inlined.
>>
> Sounds good to be, but IIRC that was suggested the last time this came up as
> well. You might just want to google a bit why it wasn't done like this
> before submitting the patch for review.
>
> BTW: Please CC the dri-devel list as well, cause not everybody is reading on
> linux-kernel.

http://lists.freedesktop.org/archives/dri-devel/2014-April/057349.html
...
http://lists.freedesktop.org/archives/dri-devel/2014-April/057520.html

Actually Lauri was *inlining* the function, not out-of-lining. I made
the suggestion (and you agreed at the time) that the slow-path should
be kept out of line, but apparently there was still high CPU overhead
as a result?

Cheers,

  -ilia

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] radeon: Deinline indirect register accessor functions
@ 2015-05-18 19:22         ` Ilia Mirkin
  0 siblings, 0 replies; 10+ messages in thread
From: Ilia Mirkin @ 2015-05-18 19:22 UTC (permalink / raw)
  To: Christian König
  Cc: Alex Deucher, Lauri Kasanen, Denys Vlasenko, linux-kernel, dri-devel

On Mon, May 18, 2015 at 3:04 PM, Christian König
<christian.koenig@amd.com> wrote:
> On 18.05.2015 20:50, Denys Vlasenko wrote:
>>
>> On 05/18/2015 08:06 PM, Christian König wrote:
>>>
>>> I'm actually surprised how often people come along with that. The last
>>> time we tried this it caused a noticeable performance drop.
>>>
>>> Basic problem is that this line:
>>>>
>>>> +    if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) &&
>>>> !always_indirect)
>>>
>>> optimizes away in most of the cases which reduces the call to a readl
>>> which is way faster than the spinlock path.
>>>
>>> So this is a NAK,
>>
>>
>> Fair enough.
>>
>> I'm preparing a v2 where the fast branch of r100_mm_{r,w}reg() will stay
>> inlined.
>>
> Sounds good to be, but IIRC that was suggested the last time this came up as
> well. You might just want to google a bit why it wasn't done like this
> before submitting the patch for review.
>
> BTW: Please CC the dri-devel list as well, cause not everybody is reading on
> linux-kernel.

http://lists.freedesktop.org/archives/dri-devel/2014-April/057349.html
...
http://lists.freedesktop.org/archives/dri-devel/2014-April/057520.html

Actually Lauri was *inlining* the function, not out-of-lining. I made
the suggestion (and you agreed at the time) that the slow-path should
be kept out of line, but apparently there was still high CPU overhead
as a result?

Cheers,

  -ilia
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-05-18 19:39 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-18 17:59 [PATCH] radeon: Deinline indirect register accessor functions Denys Vlasenko
2015-05-18 17:59 ` [PATCH] radeon: Shrink radeon_ring_write() Denys Vlasenko
2015-05-18 18:11   ` Christian König
2015-05-18 18:25     ` Denys Vlasenko
2015-05-18 19:01       ` Christian König
2015-05-18 18:06 ` [PATCH] radeon: Deinline indirect register accessor functions Christian König
2015-05-18 18:50   ` Denys Vlasenko
2015-05-18 19:04     ` Christian König
2015-05-18 19:22       ` Ilia Mirkin
2015-05-18 19:22         ` Ilia Mirkin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.