linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Mark Hairgrove <mhairgrove@nvidia.com>
To: <linuxppc-dev@lists.ozlabs.org>
Cc: Alistair Popple <alistair@popple.id.au>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Reza Arbab <arbab@us.ibm.com>
Subject: [PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates
Date: Thu, 27 Sep 2018 16:23:09 -0700	[thread overview]
Message-ID: <1538090591-28519-2-git-send-email-mhairgrove@nvidia.com> (raw)
In-Reply-To: <1538090591-28519-1-git-send-email-mhairgrove@nvidia.com>

There are two types of ATSDs issued to the NPU: invalidates targeting a
specific virtual address and invalidates targeting the whole address
space. In both cases prior to this change, the sequence was:

    for each NPU
        - Write the target address to the XTS_ATSD_AVA register
        - EIEIO
        - Write the launch value to issue the ATSD

First, a target address is not required when invalidating the whole
address space, so that write and the EIEIO have been removed. The AP
(size) field in the launch is not needed either.

Second, for per-address invalidates the above sequence is inefficient in
the common case of multiple NPUs because an EIEIO is issued per NPU. This
unnecessarily forces the launches of later ATSDs to be ordered with the
launches of earlier ones. The new sequence only issues a single EIEIO:

    for each NPU
        - Write the target address to the XTS_ATSD_AVA register
    EIEIO
    for each NPU
        - Write the launch value to issue the ATSD

Performance results were gathered using a microbenchmark which creates a
1G allocation then uses mprotect with PROT_NONE to trigger invalidates in
strides across the allocation.

With only a single NPU active (one GPU) the difference is in the noise for
both types of invalidates (+/-1%).

With two NPUs active (on a 6-GPU system) the effect is more noticeable:

         mprotect rate (GB/s)
Stride   Before      After      Speedup
64K         5.9        6.5          10%
1M         31.2       33.4           7%
2M         36.3       38.7           7%
4M        322.6      356.7          11%

Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
---
 arch/powerpc/platforms/powernv/npu-dma.c |   99 ++++++++++++++---------------
 1 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 8006c54..c8f438a 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
 }
 
 /* MMIO ATSD register offsets */
-#define XTS_ATSD_AVA  1
-#define XTS_ATSD_STAT 2
+#define XTS_ATSD_LAUNCH 0
+#define XTS_ATSD_AVA    1
+#define XTS_ATSD_STAT   2
 
-static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
-				unsigned long launch, unsigned long va)
+static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
+					bool flush)
 {
-	struct npu *npu = mmio_atsd_reg->npu;
-	int reg = mmio_atsd_reg->reg;
+	unsigned long launch = 0;
 
-	__raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
-	eieio();
-	__raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
+	if (psize == MMU_PAGE_COUNT) {
+		/* IS set to invalidate entire matching PID */
+		launch |= PPC_BIT(12);
+	} else {
+		/* AP set to invalidate region of psize */
+		launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
+	}
+
+	/* PRS set to process-scoped */
+	launch |= PPC_BIT(13);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	/* No flush */
+	launch |= !flush << PPC_BITLSHIFT(39);
+
+	return launch;
 }
 
-static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
-				unsigned long pid, bool flush)
+static void mmio_atsd_regs_write(struct mmio_atsd_reg
+			mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
+			unsigned long val)
 {
-	int i;
-	unsigned long launch;
+	struct npu *npu;
+	int i, reg;
 
 	for (i = 0; i <= max_npu2_index; i++) {
-		if (mmio_atsd_reg[i].reg < 0)
+		reg = mmio_atsd_reg[i].reg;
+		if (reg < 0)
 			continue;
 
-		/* IS set to invalidate matching PID */
-		launch = PPC_BIT(12);
-
-		/* PRS set to process-scoped */
-		launch |= PPC_BIT(13);
-
-		/* AP */
-		launch |= (u64)
-			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
-
-		/* PID */
-		launch |= pid << PPC_BITLSHIFT(38);
+		npu = mmio_atsd_reg[i].npu;
+		__raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
+	}
+}
 
-		/* No flush */
-		launch |= !flush << PPC_BITLSHIFT(39);
+static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+				unsigned long pid, bool flush)
+{
+	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
 
-		/* Invalidating the entire process doesn't use a va */
-		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
-	}
+	/* Invalidating the entire process doesn't use a va */
+	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
 }
 
 static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
 			unsigned long va, unsigned long pid, bool flush)
 {
-	int i;
 	unsigned long launch;
 
-	for (i = 0; i <= max_npu2_index; i++) {
-		if (mmio_atsd_reg[i].reg < 0)
-			continue;
-
-		/* IS set to invalidate target VA */
-		launch = 0;
+	launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
 
-		/* PRS set to process scoped */
-		launch |= PPC_BIT(13);
+	/* Write all VAs first */
+	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
 
-		/* AP */
-		launch |= (u64)
-			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
-
-		/* PID */
-		launch |= pid << PPC_BITLSHIFT(38);
-
-		/* No flush */
-		launch |= !flush << PPC_BITLSHIFT(39);
+	/* Issue one barrier for all address writes */
+	eieio();
 
-		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
-	}
+	/* Launch */
+	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
 }
 
 #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
-- 
1.7.2.5


  reply	other threads:[~2018-09-27 23:35 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-09-27 23:23 [PATCH 0/3] powerpc/powernv/npu: Improve ATSD invalidation overhead Mark Hairgrove
2018-09-27 23:23 ` Mark Hairgrove [this message]
2018-10-02  7:23   ` [PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates Alistair Popple
2018-09-27 23:23 ` [PATCH 2/3] powerpc/powernv/npu: Use size-based " Mark Hairgrove
2018-10-02  7:11   ` Alistair Popple
2018-10-03  1:10     ` Mark Hairgrove
2018-10-03  2:27       ` Alistair Popple
2018-10-03 18:33         ` Mark Hairgrove
2018-09-27 23:23 ` [PATCH 3/3] powerpc/powernv/npu: Remove atsd_threshold debugfs setting Mark Hairgrove
2018-10-02  7:12   ` Alistair Popple

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1538090591-28519-2-git-send-email-mhairgrove@nvidia.com \
    --to=mhairgrove@nvidia.com \
    --cc=alistair@popple.id.au \
    --cc=arbab@us.ibm.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).