All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
@ 2021-08-29 13:54 ` Leo Yan
  0 siblings, 0 replies; 6+ messages in thread
From: Leo Yan @ 2021-08-29 13:54 UTC (permalink / raw)
  To: Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel
  Cc: Leo Yan

The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
low level's architecture code, e.g. for Arm64, it maps the memory with
the attribution "Normal non-cacheable"; this can be concluded from the
definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.

Later when access the AUX bounce buffer, since the memory mapping is
non-cacheable, it's low efficiency due to every load instruction must
reach out DRAM.

This patch changes to allocate pages with alloc_pages_node(), thus the
driver can access the memory with cacheable mapping in the kernel linear
virtual address; therefore, because load instructions can fetch data
from cache lines rather than always read data from DRAM, the driver can
boost memory coping performance.  After using the cacheable mapping, the
driver uses dma_sync_single_for_cpu() to invalidate cacheline prior to
read bounce buffer so can avoid read stale trace data.

By measurement the duration for function tmc_update_etr_buffer() with
ftrace function_graph tracer, it shows the performance significant
improvement for copying 4MiB data from bounce buffer:

  # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
  # echo tmc_update_etr_buffer > set_graph_function
  # echo function_graph > current_tracer

  before:

  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
  2)               |    tmc_update_etr_buffer() {
  ...
  2) # 8148.320 us |    }

  after:

  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
  2)               |  tmc_update_etr_buffer() {
  ...
  2) # 2463.980 us |  }

Signed-off-by: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---

Changes from v2:
Sync the entire buffer in one go when the tracing is wrap around
(Suzuki);
Add Suzuki's review tage.

Changes from v1:
Set "flat_buf->daddr" to 0 when fails to map DMA region; and dropped the
unexpected if condition change in tmc_etr_free_flat_buf().

 .../hwtracing/coresight/coresight-tmc-etr.c   | 47 ++++++++++++++++---
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 13fd1fc730ed..ac37e9376d2b 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -21,6 +21,7 @@
 
 struct etr_flat_buf {
 	struct device	*dev;
+	struct page	*pages;
 	dma_addr_t	daddr;
 	void		*vaddr;
 	size_t		size;
@@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 {
 	struct etr_flat_buf *flat_buf;
 	struct device *real_dev = drvdata->csdev->dev.parent;
+	ssize_t	aligned_size;
 
 	/* We cannot reuse existing pages for flat buf */
 	if (pages)
@@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 	if (!flat_buf)
 		return -ENOMEM;
 
-	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
-					     &flat_buf->daddr, GFP_KERNEL);
-	if (!flat_buf->vaddr) {
-		kfree(flat_buf);
-		return -ENOMEM;
+	aligned_size = PAGE_ALIGN(etr_buf->size);
+	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+					   get_order(aligned_size));
+	if (!flat_buf->pages)
+		goto fail_alloc_pages;
+
+	flat_buf->vaddr = page_address(flat_buf->pages);
+	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
+				       aligned_size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(real_dev, flat_buf->daddr)) {
+		flat_buf->daddr = 0;
+		goto fail_dma_map_page;
 	}
 
 	flat_buf->size = etr_buf->size;
@@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 	etr_buf->mode = ETR_MODE_FLAT;
 	etr_buf->private = flat_buf;
 	return 0;
+
+fail_dma_map_page:
+	__free_pages(flat_buf->pages, get_order(aligned_size));
+fail_alloc_pages:
+	kfree(flat_buf);
+	return -ENOMEM;
 }
 
 static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
@@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
 
 	if (flat_buf && flat_buf->daddr) {
 		struct device *real_dev = flat_buf->dev->parent;
+		ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
 
-		dma_free_coherent(real_dev, flat_buf->size,
-				  flat_buf->vaddr, flat_buf->daddr);
+		dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
+			       DMA_FROM_DEVICE);
+		__free_pages(flat_buf->pages, get_order(aligned_size));
 	}
 	kfree(flat_buf);
 }
 
 static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
 {
+	struct etr_flat_buf *flat_buf = etr_buf->private;
+	struct device *real_dev = flat_buf->dev->parent;
+
 	/*
 	 * Adjust the buffer to point to the beginning of the trace data
 	 * and update the available trace data.
@@ -648,6 +668,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
 		etr_buf->len = etr_buf->size;
 	else
 		etr_buf->len = rwp - rrp;
+
+	/*
+	 * The driver always starts tracing at the beginning of the buffer,
+	 * the only reason why we would get a wrap around is when the buffer
+	 * is full.  Sync the entire buffer in one go for this case.
+	 */
+	if (etr_buf->offset + etr_buf->len > etr_buf->size)
+		dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
+					etr_buf->size, DMA_FROM_DEVICE);
+	else
+		dma_sync_single_for_cpu(real_dev,
+					flat_buf->daddr + etr_buf->offset,
+					etr_buf->len, DMA_FROM_DEVICE);
 }
 
 static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
@ 2021-08-29 13:54 ` Leo Yan
  0 siblings, 0 replies; 6+ messages in thread
From: Leo Yan @ 2021-08-29 13:54 UTC (permalink / raw)
  To: Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel
  Cc: Leo Yan

The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
low level's architecture code, e.g. for Arm64, it maps the memory with
the attribution "Normal non-cacheable"; this can be concluded from the
definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.

Later when access the AUX bounce buffer, since the memory mapping is
non-cacheable, it's low efficiency due to every load instruction must
reach out DRAM.

This patch changes to allocate pages with alloc_pages_node(), thus the
driver can access the memory with cacheable mapping in the kernel linear
virtual address; therefore, because load instructions can fetch data
from cache lines rather than always read data from DRAM, the driver can
boost memory coping performance.  After using the cacheable mapping, the
driver uses dma_sync_single_for_cpu() to invalidate cacheline prior to
read bounce buffer so can avoid read stale trace data.

By measurement the duration for function tmc_update_etr_buffer() with
ftrace function_graph tracer, it shows the performance significant
improvement for copying 4MiB data from bounce buffer:

  # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
  # echo tmc_update_etr_buffer > set_graph_function
  # echo function_graph > current_tracer

  before:

  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
  2)               |    tmc_update_etr_buffer() {
  ...
  2) # 8148.320 us |    }

  after:

  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
  2)               |  tmc_update_etr_buffer() {
  ...
  2) # 2463.980 us |  }

Signed-off-by: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---

Changes from v2:
Sync the entire buffer in one go when the tracing is wrap around
(Suzuki);
Add Suzuki's review tage.

Changes from v1:
Set "flat_buf->daddr" to 0 when fails to map DMA region; and dropped the
unexpected if condition change in tmc_etr_free_flat_buf().

 .../hwtracing/coresight/coresight-tmc-etr.c   | 47 ++++++++++++++++---
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 13fd1fc730ed..ac37e9376d2b 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -21,6 +21,7 @@
 
 struct etr_flat_buf {
 	struct device	*dev;
+	struct page	*pages;
 	dma_addr_t	daddr;
 	void		*vaddr;
 	size_t		size;
@@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 {
 	struct etr_flat_buf *flat_buf;
 	struct device *real_dev = drvdata->csdev->dev.parent;
+	ssize_t	aligned_size;
 
 	/* We cannot reuse existing pages for flat buf */
 	if (pages)
@@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 	if (!flat_buf)
 		return -ENOMEM;
 
-	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
-					     &flat_buf->daddr, GFP_KERNEL);
-	if (!flat_buf->vaddr) {
-		kfree(flat_buf);
-		return -ENOMEM;
+	aligned_size = PAGE_ALIGN(etr_buf->size);
+	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+					   get_order(aligned_size));
+	if (!flat_buf->pages)
+		goto fail_alloc_pages;
+
+	flat_buf->vaddr = page_address(flat_buf->pages);
+	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
+				       aligned_size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(real_dev, flat_buf->daddr)) {
+		flat_buf->daddr = 0;
+		goto fail_dma_map_page;
 	}
 
 	flat_buf->size = etr_buf->size;
@@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
 	etr_buf->mode = ETR_MODE_FLAT;
 	etr_buf->private = flat_buf;
 	return 0;
+
+fail_dma_map_page:
+	__free_pages(flat_buf->pages, get_order(aligned_size));
+fail_alloc_pages:
+	kfree(flat_buf);
+	return -ENOMEM;
 }
 
 static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
@@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
 
 	if (flat_buf && flat_buf->daddr) {
 		struct device *real_dev = flat_buf->dev->parent;
+		ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
 
-		dma_free_coherent(real_dev, flat_buf->size,
-				  flat_buf->vaddr, flat_buf->daddr);
+		dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
+			       DMA_FROM_DEVICE);
+		__free_pages(flat_buf->pages, get_order(aligned_size));
 	}
 	kfree(flat_buf);
 }
 
 static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
 {
+	struct etr_flat_buf *flat_buf = etr_buf->private;
+	struct device *real_dev = flat_buf->dev->parent;
+
 	/*
 	 * Adjust the buffer to point to the beginning of the trace data
 	 * and update the available trace data.
@@ -648,6 +668,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
 		etr_buf->len = etr_buf->size;
 	else
 		etr_buf->len = rwp - rrp;
+
+	/*
+	 * The driver always starts tracing at the beginning of the buffer,
+	 * the only reason why we would get a wrap around is when the buffer
+	 * is full.  Sync the entire buffer in one go for this case.
+	 */
+	if (etr_buf->offset + etr_buf->len > etr_buf->size)
+		dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
+					etr_buf->size, DMA_FROM_DEVICE);
+	else
+		dma_sync_single_for_cpu(real_dev,
+					flat_buf->daddr + etr_buf->offset,
+					etr_buf->len, DMA_FROM_DEVICE);
 }
 
 static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
-- 
2.25.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
  2021-08-29 13:54 ` Leo Yan
@ 2021-09-01 20:03   ` Robin Murphy
  -1 siblings, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2021-09-01 20:03 UTC (permalink / raw)
  To: Leo Yan, Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel

On 2021-08-29 14:54, Leo Yan wrote:
> The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
> low level's architecture code, e.g. for Arm64, it maps the memory with
> the attribution "Normal non-cacheable"; this can be concluded from the
> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
> 
> Later when access the AUX bounce buffer, since the memory mapping is
> non-cacheable, it's low efficiency due to every load instruction must
> reach out DRAM.
> 
> This patch changes to allocate pages with alloc_pages_node(), thus the
> driver can access the memory with cacheable mapping in the kernel linear
> virtual address; therefore, because load instructions can fetch data
> from cache lines rather than always read data from DRAM, the driver can
> boost memory coping performance.  After using the cacheable mapping, the
> driver uses dma_sync_single_for_cpu() to invalidate cacheline prior to
> read bounce buffer so can avoid read stale trace data.
> 
> By measurement the duration for function tmc_update_etr_buffer() with
> ftrace function_graph tracer, it shows the performance significant
> improvement for copying 4MiB data from bounce buffer:
> 
>    # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
>    # echo tmc_update_etr_buffer > set_graph_function
>    # echo function_graph > current_tracer
> 
>    before:
> 
>    # CPU  DURATION                  FUNCTION CALLS
>    # |     |   |                     |   |   |   |
>    2)               |    tmc_update_etr_buffer() {
>    ...
>    2) # 8148.320 us |    }
> 
>    after:
> 
>    # CPU  DURATION                  FUNCTION CALLS
>    # |     |   |                     |   |   |   |
>    2)               |  tmc_update_etr_buffer() {
>    ...
>    2) # 2463.980 us |  }
> 
> Signed-off-by: Leo Yan <leo.yan@linaro.org>
> Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> ---
> 
> Changes from v2:
> Sync the entire buffer in one go when the tracing is wrap around
> (Suzuki);
> Add Suzuki's review tage.
> 
> Changes from v1:
> Set "flat_buf->daddr" to 0 when fails to map DMA region; and dropped the
> unexpected if condition change in tmc_etr_free_flat_buf().
> 
>   .../hwtracing/coresight/coresight-tmc-etr.c   | 47 ++++++++++++++++---
>   1 file changed, 40 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index 13fd1fc730ed..ac37e9376d2b 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -21,6 +21,7 @@
>   
>   struct etr_flat_buf {
>   	struct device	*dev;
> +	struct page	*pages;
>   	dma_addr_t	daddr;
>   	void		*vaddr;
>   	size_t		size;
> @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   {
>   	struct etr_flat_buf *flat_buf;
>   	struct device *real_dev = drvdata->csdev->dev.parent;
> +	ssize_t	aligned_size;
>   
>   	/* We cannot reuse existing pages for flat buf */
>   	if (pages)
> @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   	if (!flat_buf)
>   		return -ENOMEM;
>   
> -	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> -					     &flat_buf->daddr, GFP_KERNEL);
> -	if (!flat_buf->vaddr) {
> -		kfree(flat_buf);
> -		return -ENOMEM;
> +	aligned_size = PAGE_ALIGN(etr_buf->size);
> +	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> +					   get_order(aligned_size));
> +	if (!flat_buf->pages)
> +		goto fail_alloc_pages;
> +
> +	flat_buf->vaddr = page_address(flat_buf->pages);
> +	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> +				       aligned_size, DMA_FROM_DEVICE);

Use dma_alloc_noncoherent() rather than open-coding this - bare 
alloc_pages() has no understanding of DMA masks, and you wouldn't want 
to end up in the worst case of dma_map_page() bounce-buffering your 
bounce buffer...

Robin.

> +	if (dma_mapping_error(real_dev, flat_buf->daddr)) {
> +		flat_buf->daddr = 0;
> +		goto fail_dma_map_page;
>   	}
>   
>   	flat_buf->size = etr_buf->size;
> @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   	etr_buf->mode = ETR_MODE_FLAT;
>   	etr_buf->private = flat_buf;
>   	return 0;
> +
> +fail_dma_map_page:
> +	__free_pages(flat_buf->pages, get_order(aligned_size));
> +fail_alloc_pages:
> +	kfree(flat_buf);
> +	return -ENOMEM;
>   }
>   
>   static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
>   
>   	if (flat_buf && flat_buf->daddr) {
>   		struct device *real_dev = flat_buf->dev->parent;
> +		ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
>   
> -		dma_free_coherent(real_dev, flat_buf->size,
> -				  flat_buf->vaddr, flat_buf->daddr);
> +		dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
> +			       DMA_FROM_DEVICE);
> +		__free_pages(flat_buf->pages, get_order(aligned_size));
>   	}
>   	kfree(flat_buf);
>   }
>   
>   static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>   {
> +	struct etr_flat_buf *flat_buf = etr_buf->private;
> +	struct device *real_dev = flat_buf->dev->parent;
> +
>   	/*
>   	 * Adjust the buffer to point to the beginning of the trace data
>   	 * and update the available trace data.
> @@ -648,6 +668,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>   		etr_buf->len = etr_buf->size;
>   	else
>   		etr_buf->len = rwp - rrp;
> +
> +	/*
> +	 * The driver always starts tracing at the beginning of the buffer,
> +	 * the only reason why we would get a wrap around is when the buffer
> +	 * is full.  Sync the entire buffer in one go for this case.
> +	 */
> +	if (etr_buf->offset + etr_buf->len > etr_buf->size)
> +		dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> +					etr_buf->size, DMA_FROM_DEVICE);
> +	else
> +		dma_sync_single_for_cpu(real_dev,
> +					flat_buf->daddr + etr_buf->offset,
> +					etr_buf->len, DMA_FROM_DEVICE);
>   }
>   
>   static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
@ 2021-09-01 20:03   ` Robin Murphy
  0 siblings, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2021-09-01 20:03 UTC (permalink / raw)
  To: Leo Yan, Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel

On 2021-08-29 14:54, Leo Yan wrote:
> The AUX bounce buffer is allocated with API dma_alloc_coherent(), in the
> low level's architecture code, e.g. for Arm64, it maps the memory with
> the attribution "Normal non-cacheable"; this can be concluded from the
> definition for pgprot_dmacoherent() in arch/arm64/include/asm/pgtable.h.
> 
> Later when access the AUX bounce buffer, since the memory mapping is
> non-cacheable, it's low efficiency due to every load instruction must
> reach out DRAM.
> 
> This patch changes to allocate pages with alloc_pages_node(), thus the
> driver can access the memory with cacheable mapping in the kernel linear
> virtual address; therefore, because load instructions can fetch data
> from cache lines rather than always read data from DRAM, the driver can
> boost memory coping performance.  After using the cacheable mapping, the
> driver uses dma_sync_single_for_cpu() to invalidate cacheline prior to
> read bounce buffer so can avoid read stale trace data.
> 
> By measurement the duration for function tmc_update_etr_buffer() with
> ftrace function_graph tracer, it shows the performance significant
> improvement for copying 4MiB data from bounce buffer:
> 
>    # echo tmc_etr_get_data_flat_buf > set_graph_notrace // avoid noise
>    # echo tmc_update_etr_buffer > set_graph_function
>    # echo function_graph > current_tracer
> 
>    before:
> 
>    # CPU  DURATION                  FUNCTION CALLS
>    # |     |   |                     |   |   |   |
>    2)               |    tmc_update_etr_buffer() {
>    ...
>    2) # 8148.320 us |    }
> 
>    after:
> 
>    # CPU  DURATION                  FUNCTION CALLS
>    # |     |   |                     |   |   |   |
>    2)               |  tmc_update_etr_buffer() {
>    ...
>    2) # 2463.980 us |  }
> 
> Signed-off-by: Leo Yan <leo.yan@linaro.org>
> Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
> ---
> 
> Changes from v2:
> Sync the entire buffer in one go when the tracing is wrap around
> (Suzuki);
> Add Suzuki's review tage.
> 
> Changes from v1:
> Set "flat_buf->daddr" to 0 when fails to map DMA region; and dropped the
> unexpected if condition change in tmc_etr_free_flat_buf().
> 
>   .../hwtracing/coresight/coresight-tmc-etr.c   | 47 ++++++++++++++++---
>   1 file changed, 40 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> index 13fd1fc730ed..ac37e9376d2b 100644
> --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
> +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
> @@ -21,6 +21,7 @@
>   
>   struct etr_flat_buf {
>   	struct device	*dev;
> +	struct page	*pages;
>   	dma_addr_t	daddr;
>   	void		*vaddr;
>   	size_t		size;
> @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   {
>   	struct etr_flat_buf *flat_buf;
>   	struct device *real_dev = drvdata->csdev->dev.parent;
> +	ssize_t	aligned_size;
>   
>   	/* We cannot reuse existing pages for flat buf */
>   	if (pages)
> @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   	if (!flat_buf)
>   		return -ENOMEM;
>   
> -	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> -					     &flat_buf->daddr, GFP_KERNEL);
> -	if (!flat_buf->vaddr) {
> -		kfree(flat_buf);
> -		return -ENOMEM;
> +	aligned_size = PAGE_ALIGN(etr_buf->size);
> +	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> +					   get_order(aligned_size));
> +	if (!flat_buf->pages)
> +		goto fail_alloc_pages;
> +
> +	flat_buf->vaddr = page_address(flat_buf->pages);
> +	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> +				       aligned_size, DMA_FROM_DEVICE);

Use dma_alloc_noncoherent() rather than open-coding this - bare 
alloc_pages() has no understanding of DMA masks, and you wouldn't want 
to end up in the worst case of dma_map_page() bounce-buffering your 
bounce buffer...

Robin.

> +	if (dma_mapping_error(real_dev, flat_buf->daddr)) {
> +		flat_buf->daddr = 0;
> +		goto fail_dma_map_page;
>   	}
>   
>   	flat_buf->size = etr_buf->size;
> @@ -622,6 +631,12 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
>   	etr_buf->mode = ETR_MODE_FLAT;
>   	etr_buf->private = flat_buf;
>   	return 0;
> +
> +fail_dma_map_page:
> +	__free_pages(flat_buf->pages, get_order(aligned_size));
> +fail_alloc_pages:
> +	kfree(flat_buf);
> +	return -ENOMEM;
>   }
>   
>   static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
> @@ -630,15 +645,20 @@ static void tmc_etr_free_flat_buf(struct etr_buf *etr_buf)
>   
>   	if (flat_buf && flat_buf->daddr) {
>   		struct device *real_dev = flat_buf->dev->parent;
> +		ssize_t aligned_size = PAGE_ALIGN(etr_buf->size);
>   
> -		dma_free_coherent(real_dev, flat_buf->size,
> -				  flat_buf->vaddr, flat_buf->daddr);
> +		dma_unmap_page(real_dev, flat_buf->daddr, aligned_size,
> +			       DMA_FROM_DEVICE);
> +		__free_pages(flat_buf->pages, get_order(aligned_size));
>   	}
>   	kfree(flat_buf);
>   }
>   
>   static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>   {
> +	struct etr_flat_buf *flat_buf = etr_buf->private;
> +	struct device *real_dev = flat_buf->dev->parent;
> +
>   	/*
>   	 * Adjust the buffer to point to the beginning of the trace data
>   	 * and update the available trace data.
> @@ -648,6 +668,19 @@ static void tmc_etr_sync_flat_buf(struct etr_buf *etr_buf, u64 rrp, u64 rwp)
>   		etr_buf->len = etr_buf->size;
>   	else
>   		etr_buf->len = rwp - rrp;
> +
> +	/*
> +	 * The driver always starts tracing at the beginning of the buffer,
> +	 * the only reason why we would get a wrap around is when the buffer
> +	 * is full.  Sync the entire buffer in one go for this case.
> +	 */
> +	if (etr_buf->offset + etr_buf->len > etr_buf->size)
> +		dma_sync_single_for_cpu(real_dev, flat_buf->daddr,
> +					etr_buf->size, DMA_FROM_DEVICE);
> +	else
> +		dma_sync_single_for_cpu(real_dev,
> +					flat_buf->daddr + etr_buf->offset,
> +					etr_buf->len, DMA_FROM_DEVICE);
>   }
>   
>   static ssize_t tmc_etr_get_data_flat_buf(struct etr_buf *etr_buf,
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
  2021-09-01 20:03   ` Robin Murphy
@ 2021-09-02 12:54     ` Leo Yan
  -1 siblings, 0 replies; 6+ messages in thread
From: Leo Yan @ 2021-09-02 12:54 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel

Hi Robin,

On Wed, Sep 01, 2021 at 09:03:30PM +0100, Robin Murphy wrote:

[...]

> > @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> >   {
> >   	struct etr_flat_buf *flat_buf;
> >   	struct device *real_dev = drvdata->csdev->dev.parent;
> > +	ssize_t	aligned_size;
> >   	/* We cannot reuse existing pages for flat buf */
> >   	if (pages)
> > @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> >   	if (!flat_buf)
> >   		return -ENOMEM;
> > -	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> > -					     &flat_buf->daddr, GFP_KERNEL);
> > -	if (!flat_buf->vaddr) {
> > -		kfree(flat_buf);
> > -		return -ENOMEM;
> > +	aligned_size = PAGE_ALIGN(etr_buf->size);
> > +	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> > +					   get_order(aligned_size));
> > +	if (!flat_buf->pages)
> > +		goto fail_alloc_pages;
> > +
> > +	flat_buf->vaddr = page_address(flat_buf->pages);
> > +	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> > +				       aligned_size, DMA_FROM_DEVICE);
> 
> Use dma_alloc_noncoherent() rather than open-coding this - bare
> alloc_pages() has no understanding of DMA masks, and you wouldn't want to
> end up in the worst case of dma_map_page() bounce-buffering your bounce
> buffer...

Will refine the code with dma_alloc_noncoherent(); it's much reliable
than the self writing code.

Thanks a lot for the suggestion!

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode
@ 2021-09-02 12:54     ` Leo Yan
  0 siblings, 0 replies; 6+ messages in thread
From: Leo Yan @ 2021-09-02 12:54 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Mathieu Poirier, Suzuki K Poulose, Mike Leach,
	Alexander Shishkin, coresight, linux-arm-kernel, linux-kernel

Hi Robin,

On Wed, Sep 01, 2021 at 09:03:30PM +0100, Robin Murphy wrote:

[...]

> > @@ -600,6 +601,7 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> >   {
> >   	struct etr_flat_buf *flat_buf;
> >   	struct device *real_dev = drvdata->csdev->dev.parent;
> > +	ssize_t	aligned_size;
> >   	/* We cannot reuse existing pages for flat buf */
> >   	if (pages)
> > @@ -609,11 +611,18 @@ static int tmc_etr_alloc_flat_buf(struct tmc_drvdata *drvdata,
> >   	if (!flat_buf)
> >   		return -ENOMEM;
> > -	flat_buf->vaddr = dma_alloc_coherent(real_dev, etr_buf->size,
> > -					     &flat_buf->daddr, GFP_KERNEL);
> > -	if (!flat_buf->vaddr) {
> > -		kfree(flat_buf);
> > -		return -ENOMEM;
> > +	aligned_size = PAGE_ALIGN(etr_buf->size);
> > +	flat_buf->pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
> > +					   get_order(aligned_size));
> > +	if (!flat_buf->pages)
> > +		goto fail_alloc_pages;
> > +
> > +	flat_buf->vaddr = page_address(flat_buf->pages);
> > +	flat_buf->daddr = dma_map_page(real_dev, flat_buf->pages, 0,
> > +				       aligned_size, DMA_FROM_DEVICE);
> 
> Use dma_alloc_noncoherent() rather than open-coding this - bare
> alloc_pages() has no understanding of DMA masks, and you wouldn't want to
> end up in the worst case of dma_map_page() bounce-buffering your bounce
> buffer...

Will refine the code with dma_alloc_noncoherent(); it's much reliable
than the self writing code.

Thanks a lot for the suggestion!

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-09-02 12:56 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-29 13:54 [PATCH v3] coresight: tmc-etr: Speed up for bounce buffer in flat mode Leo Yan
2021-08-29 13:54 ` Leo Yan
2021-09-01 20:03 ` Robin Murphy
2021-09-01 20:03   ` Robin Murphy
2021-09-02 12:54   ` Leo Yan
2021-09-02 12:54     ` Leo Yan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.