All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-18 21:51 ` Gary R Hook
  0 siblings, 0 replies; 8+ messages in thread
From: Gary R Hook @ 2018-05-18 21:51 UTC (permalink / raw)
  To: iommu; +Cc: joro, linux-kernel

Improve the performance of the PPR log polling function (i.e. the
task of emptying the log) by minimizing MMIO operations and more
efficiently processing groups of log entries. Cache the head
pointer, as there's never a reason to read it. Ensure the head
pointer register is updated every so often, to inform the IOMMU
that space is available in the log.

Finally, since a single pass may leave logged events unread, use
an outer loop to repeat until head has caught up to tail.

Signed-off-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/iommu/amd_iommu.c       |   69 +++++++++++++++++++++------------------
 drivers/iommu/amd_iommu_init.c  |    1 +
 drivers/iommu/amd_iommu_types.h |    1 +
 3 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 77c056ae082c..13a550fb7d47 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -660,51 +660,56 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
 
 static void iommu_poll_ppr_log(struct amd_iommu *iommu)
 {
-	u32 head, tail;
+	u32 tail;
 
 	if (iommu->ppr_log == NULL)
 		return;
 
-	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 
-	while (head != tail) {
-		volatile u64 *raw;
-		u64 entry[2];
-		int i;
+	while (iommu->ppr_log_head != tail) {
+		uint count = PPR_LOG_ENTRIES / 8;
+
+		while (iommu->ppr_log_head != tail && count--) {
+			volatile u64 *raw;
+			u64 entry[2];
+			int i;
+
+			raw = (u64 *)(iommu->ppr_log + iommu->ppr_log_head);
+
+			/*
+			 * Hardware bug: Interrupt may arrive before the
+			 * entry is written to memory. If this happens we
+			 * need to wait for the entry to arrive.
+			 */
+			for (i = 0; i < LOOP_TIMEOUT; ++i) {
+				if (PPR_REQ_TYPE(raw[0]) != 0)
+					break;
+				udelay(1);
+			}
 
-		raw = (u64 *)(iommu->ppr_log + head);
+			/* Avoid memcpy function-call overhead */
+			entry[0] = raw[0];
+			entry[1] = raw[1];
 
-		/*
-		 * Hardware bug: Interrupt may arrive before the entry is
-		 * written to memory. If this happens we need to wait for the
-		 * entry to arrive.
-		 */
-		for (i = 0; i < LOOP_TIMEOUT; ++i) {
-			if (PPR_REQ_TYPE(raw[0]) != 0)
-				break;
-			udelay(1);
-		}
+			/*
+			 * To detect the hardware bug we need to clear the
+			 * entry back to zero.
+			 */
+			raw[0] = raw[1] = 0UL;
 
-		/* Avoid memcpy function-call overhead */
-		entry[0] = raw[0];
-		entry[1] = raw[1];
+			/* Handle PPR entry */
+			iommu_handle_ppr_entry(iommu, entry);
 
-		/*
-		 * To detect the hardware bug we need to clear the entry
-		 * back to zero.
-		 */
-		raw[0] = raw[1] = 0UL;
+			iommu->ppr_log_head += PPR_ENTRY_SIZE;
+			iommu->ppr_log_head %= PPR_LOG_SIZE;
+		}
 
 		/* Update head pointer of hardware ring-buffer */
-		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
-		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
-
-		/* Handle PPR entry */
-		iommu_handle_ppr_entry(iommu, entry);
+		writel(iommu->ppr_log_head,
+		       iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 
-		/* Refresh ring-buffer information */
-		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+		/* Get the current value of tail */
 		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 	}
 }
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 904c575d1677..227a9887feb1 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -721,6 +721,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
 	/* set head and tail to zero manually */
 	writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 	writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+	iommu->ppr_log_head = 0;
 
 	iommu_feature_enable(iommu, CONTROL_PPFLOG_EN);
 	iommu_feature_enable(iommu, CONTROL_PPR_EN);
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 986cbe0cc189..31afcc54434c 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -548,6 +548,7 @@ struct amd_iommu {
 
 	/* Base of the PPR log, if present */
 	u8 *ppr_log;
+	u32 ppr_log_head;
 
 	/* Base of the GA log, if present */
 	u8 *ga_log;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-18 21:51 ` Gary R Hook
  0 siblings, 0 replies; 8+ messages in thread
From: Gary R Hook @ 2018-05-18 21:51 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA

Improve the performance of the PPR log polling function (i.e. the
task of emptying the log) by minimizing MMIO operations and more
efficiently processing groups of log entries. Cache the head
pointer, as there's never a reason to read it. Ensure the head
pointer register is updated every so often, to inform the IOMMU
that space is available in the log.

Finally, since a single pass may leave logged events unread, use
an outer loop to repeat until head has caught up to tail.

Signed-off-by: Gary R Hook <gary.hook-5C7GfCeVMHo@public.gmane.org>
---
 drivers/iommu/amd_iommu.c       |   69 +++++++++++++++++++++------------------
 drivers/iommu/amd_iommu_init.c  |    1 +
 drivers/iommu/amd_iommu_types.h |    1 +
 3 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 77c056ae082c..13a550fb7d47 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -660,51 +660,56 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
 
 static void iommu_poll_ppr_log(struct amd_iommu *iommu)
 {
-	u32 head, tail;
+	u32 tail;
 
 	if (iommu->ppr_log == NULL)
 		return;
 
-	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 
-	while (head != tail) {
-		volatile u64 *raw;
-		u64 entry[2];
-		int i;
+	while (iommu->ppr_log_head != tail) {
+		uint count = PPR_LOG_ENTRIES / 8;
+
+		while (iommu->ppr_log_head != tail && count--) {
+			volatile u64 *raw;
+			u64 entry[2];
+			int i;
+
+			raw = (u64 *)(iommu->ppr_log + iommu->ppr_log_head);
+
+			/*
+			 * Hardware bug: Interrupt may arrive before the
+			 * entry is written to memory. If this happens we
+			 * need to wait for the entry to arrive.
+			 */
+			for (i = 0; i < LOOP_TIMEOUT; ++i) {
+				if (PPR_REQ_TYPE(raw[0]) != 0)
+					break;
+				udelay(1);
+			}
 
-		raw = (u64 *)(iommu->ppr_log + head);
+			/* Avoid memcpy function-call overhead */
+			entry[0] = raw[0];
+			entry[1] = raw[1];
 
-		/*
-		 * Hardware bug: Interrupt may arrive before the entry is
-		 * written to memory. If this happens we need to wait for the
-		 * entry to arrive.
-		 */
-		for (i = 0; i < LOOP_TIMEOUT; ++i) {
-			if (PPR_REQ_TYPE(raw[0]) != 0)
-				break;
-			udelay(1);
-		}
+			/*
+			 * To detect the hardware bug we need to clear the
+			 * entry back to zero.
+			 */
+			raw[0] = raw[1] = 0UL;
 
-		/* Avoid memcpy function-call overhead */
-		entry[0] = raw[0];
-		entry[1] = raw[1];
+			/* Handle PPR entry */
+			iommu_handle_ppr_entry(iommu, entry);
 
-		/*
-		 * To detect the hardware bug we need to clear the entry
-		 * back to zero.
-		 */
-		raw[0] = raw[1] = 0UL;
+			iommu->ppr_log_head += PPR_ENTRY_SIZE;
+			iommu->ppr_log_head %= PPR_LOG_SIZE;
+		}
 
 		/* Update head pointer of hardware ring-buffer */
-		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
-		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
-
-		/* Handle PPR entry */
-		iommu_handle_ppr_entry(iommu, entry);
+		writel(iommu->ppr_log_head,
+		       iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 
-		/* Refresh ring-buffer information */
-		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+		/* Get the current value of tail */
 		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 	}
 }
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 904c575d1677..227a9887feb1 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -721,6 +721,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
 	/* set head and tail to zero manually */
 	writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 	writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+	iommu->ppr_log_head = 0;
 
 	iommu_feature_enable(iommu, CONTROL_PPFLOG_EN);
 	iommu_feature_enable(iommu, CONTROL_PPR_EN);
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 986cbe0cc189..31afcc54434c 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -548,6 +548,7 @@ struct amd_iommu {
 
 	/* Base of the PPR log, if present */
 	u8 *ppr_log;
+	u32 ppr_log_head;
 
 	/* Base of the GA log, if present */
 	u8 *ga_log;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-29 14:54   ` Joerg Roedel
  0 siblings, 0 replies; 8+ messages in thread
From: Joerg Roedel @ 2018-05-29 14:54 UTC (permalink / raw)
  To: Gary R Hook; +Cc: iommu, linux-kernel

Hey Gary,

On Fri, May 18, 2018 at 04:51:56PM -0500, Gary R Hook wrote:
> Improve the performance of the PPR log polling function (i.e. the
> task of emptying the log) by minimizing MMIO operations and more
> efficiently processing groups of log entries. Cache the head
> pointer, as there's never a reason to read it. Ensure the head
> pointer register is updated every so often, to inform the IOMMU
> that space is available in the log.
> 
> Finally, since a single pass may leave logged events unread, use
> an outer loop to repeat until head has caught up to tail.
> 
> Signed-off-by: Gary R Hook <gary.hook@amd.com>

Do you have numbers for the performance improvement? How did you test
this patch?


Regards,

	Joerg

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-29 14:54   ` Joerg Roedel
  0 siblings, 0 replies; 8+ messages in thread
From: Joerg Roedel @ 2018-05-29 14:54 UTC (permalink / raw)
  To: Gary R Hook
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Hey Gary,

On Fri, May 18, 2018 at 04:51:56PM -0500, Gary R Hook wrote:
> Improve the performance of the PPR log polling function (i.e. the
> task of emptying the log) by minimizing MMIO operations and more
> efficiently processing groups of log entries. Cache the head
> pointer, as there's never a reason to read it. Ensure the head
> pointer register is updated every so often, to inform the IOMMU
> that space is available in the log.
> 
> Finally, since a single pass may leave logged events unread, use
> an outer loop to repeat until head has caught up to tail.
> 
> Signed-off-by: Gary R Hook <gary.hook-5C7GfCeVMHo@public.gmane.org>

Do you have numbers for the performance improvement? How did you test
this patch?


Regards,

	Joerg

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-29 17:28     ` Gary R Hook
  0 siblings, 0 replies; 8+ messages in thread
From: Gary R Hook @ 2018-05-29 17:28 UTC (permalink / raw)
  To: Joerg Roedel, Gary R Hook; +Cc: iommu, linux-kernel

On 05/29/2018 09:54 AM, Joerg Roedel wrote:
> Hey Gary,
> 
> On Fri, May 18, 2018 at 04:51:56PM -0500, Gary R Hook wrote:
>> Improve the performance of the PPR log polling function (i.e. the
>> task of emptying the log) by minimizing MMIO operations and more
>> efficiently processing groups of log entries. Cache the head
>> pointer, as there's never a reason to read it. Ensure the head
>> pointer register is updated every so often, to inform the IOMMU
>> that space is available in the log.
>>
>> Finally, since a single pass may leave logged events unread, use
>> an outer loop to repeat until head has caught up to tail.
>>
>> Signed-off-by: Gary R Hook <gary.hook@amd.com>
> 
> Do you have numbers for the performance improvement? How did you test
> this patch?

No, no numbers. We're still working out how best to test this, and 
suggestions/strategies are welcome.

The change is modeled after the function iommu_poll_events(), which is 
much cleaner. The GA log handling should be changed, as well (there are 
superfluous writes in the loop), but I figured, "one thing at a time". 
This is admittedly a minor optimization, but discussions with Tom 
Lendacky have led us down this path.

Your feedback is appreciated.

Gary

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-29 17:28     ` Gary R Hook
  0 siblings, 0 replies; 8+ messages in thread
From: Gary R Hook @ 2018-05-29 17:28 UTC (permalink / raw)
  To: Joerg Roedel, Gary R Hook
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 05/29/2018 09:54 AM, Joerg Roedel wrote:
> Hey Gary,
> 
> On Fri, May 18, 2018 at 04:51:56PM -0500, Gary R Hook wrote:
>> Improve the performance of the PPR log polling function (i.e. the
>> task of emptying the log) by minimizing MMIO operations and more
>> efficiently processing groups of log entries. Cache the head
>> pointer, as there's never a reason to read it. Ensure the head
>> pointer register is updated every so often, to inform the IOMMU
>> that space is available in the log.
>>
>> Finally, since a single pass may leave logged events unread, use
>> an outer loop to repeat until head has caught up to tail.
>>
>> Signed-off-by: Gary R Hook <gary.hook-5C7GfCeVMHo@public.gmane.org>
> 
> Do you have numbers for the performance improvement? How did you test
> this patch?

No, no numbers. We're still working out how best to test this, and 
suggestions/strategies are welcome.

The change is modeled after the function iommu_poll_events(), which is 
much cleaner. The GA log handling should be changed, as well (there are 
superfluous writes in the loop), but I figured, "one thing at a time". 
This is admittedly a minor optimization, but discussions with Tom 
Lendacky have led us down this path.

Your feedback is appreciated.

Gary

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-30  4:52       ` Joerg Roedel
  0 siblings, 0 replies; 8+ messages in thread
From: Joerg Roedel @ 2018-05-30  4:52 UTC (permalink / raw)
  To: Gary R Hook; +Cc: Gary R Hook, iommu, linux-kernel

On Tue, May 29, 2018 at 12:28:54PM -0500, Gary R Hook wrote:
> No, no numbers. We're still working out how best to test this, and
> suggestions/strategies are welcome.

Maybe run a simple kernel on the CPU that does a memcpy on a larger
portion of mmapped (but not yet mapped) process address space and measure
the time it takes for the kernel to run. The page-fault path in the
iommu-driver is only a small part of the involved code here, but maybe
you already see a difference. Doing a micro-benchmark only for that code is
probably a bit more challenging.

> The change is modeled after the function iommu_poll_events(), which is much
> cleaner. The GA log handling should be changed, as well (there are
> superfluous writes in the loop), but I figured, "one thing at a time". This
> is admittedly a minor optimization, but discussions with Tom Lendacky have
> led us down this path.
> 
> Your feedback is appreciated.

Yeah, the patch looks good to me from my first review. But since I can't
test that code myself I was wondering if you did any tests and can share
something with me to run my own tests :)



	Joerg

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] iommu/amd - Optimize PPR log handling
@ 2018-05-30  4:52       ` Joerg Roedel
  0 siblings, 0 replies; 8+ messages in thread
From: Joerg Roedel @ 2018-05-30  4:52 UTC (permalink / raw)
  To: Gary R Hook
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Tue, May 29, 2018 at 12:28:54PM -0500, Gary R Hook wrote:
> No, no numbers. We're still working out how best to test this, and
> suggestions/strategies are welcome.

Maybe run a simple kernel on the CPU that does a memcpy on a larger
portion of mmapped (but not yet mapped) process address space and measure
the time it takes for the kernel to run. The page-fault path in the
iommu-driver is only a small part of the involved code here, but maybe
you already see a difference. Doing a micro-benchmark only for that code is
probably a bit more challenging.

> The change is modeled after the function iommu_poll_events(), which is much
> cleaner. The GA log handling should be changed, as well (there are
> superfluous writes in the loop), but I figured, "one thing at a time". This
> is admittedly a minor optimization, but discussions with Tom Lendacky have
> led us down this path.
> 
> Your feedback is appreciated.

Yeah, the patch looks good to me from my first review. But since I can't
test that code myself I was wondering if you did any tests and can share
something with me to run my own tests :)



	Joerg

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-05-30  4:52 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-18 21:51 [PATCH] iommu/amd - Optimize PPR log handling Gary R Hook
2018-05-18 21:51 ` Gary R Hook
2018-05-29 14:54 ` Joerg Roedel
2018-05-29 14:54   ` Joerg Roedel
2018-05-29 17:28   ` Gary R Hook
2018-05-29 17:28     ` Gary R Hook
2018-05-30  4:52     ` Joerg Roedel
2018-05-30  4:52       ` Joerg Roedel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.