From: Douglas Anderson <dianders@chromium.org>
To: Russell King <linux@arm.linux.org.uk>
Cc: Robin Murphy <robin.murphy@arm.com>,
	Tomasz Figa <tfiga@chromium.org>,
	Marek Szyprowski <m.szyprowski@samsung.com>,
	Pawel Osciak <pawel@osciak.com>,
	Dmitry Torokhov <dmitry.torokhov@gmail.com>,
	Douglas Anderson <dianders@chromium.org>,
	will.deacon@arm.com, akpm@linux-foundation.org, carlo@caione.org,
	laurent.pinchart+renesas@ideasonboard.com,
	mike.looijmans@topic.nl, penguin-kernel@i-love.sakura.ne.jp,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH v2 2/2] ARM: dma-mapping: sort the pages after allocation
Date: Fri, 18 Dec 2015 14:27:02 -0800
Message-ID: <1450477622-30948-2-git-send-email-dianders@chromium.org>
In-Reply-To: <1450477622-30948-1-git-send-email-dianders@chromium.org>

After doing the allocation, make one last-ditch effort to get contiguous
regions of pages to optimize TLB usage.  This is a rather simplistic
approach that could later be refined, but it doesn't hurt and can only
help.

From my testing, the sort took less than 400us for a 4MB allocation.
That's much faster than the actual allocation, which took more than a
millisecond even in the fastest case (and often took several hundred ms).
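
To illustrate why the sort pays off, here is a minimal userspace-style
sketch (not kernel code; the helper names cmp_pfn() and count_runs() are
made up for illustration).  It sorts an array of page-frame numbers and
counts how many physically contiguous runs exist before and after:

  #include <stdio.h>
  #include <stdlib.h>

  static int cmp_pfn(const void *a, const void *b)
  {
          unsigned long pa = *(const unsigned long *)a;
          unsigned long pb = *(const unsigned long *)b;

          return (pa > pb) - (pa < pb);
  }

  /* Count maximal runs of consecutive PFNs in the array. */
  static int count_runs(const unsigned long *pfns, int n)
  {
          int runs = 1;
          int i;

          for (i = 1; i < n; i++)
                  if (pfns[i] != pfns[i - 1] + 1)
                          runs++;
          return runs;
  }

  int main(void)
  {
          /* Order-zero pages as the allocator might return them under pressure. */
          unsigned long pfns[] = { 1005, 1002, 1003, 1000, 1001, 1004 };
          int n = sizeof(pfns) / sizeof(pfns[0]);

          printf("contiguous runs before sort: %d\n", count_runs(pfns, n)); /* 4 */
          qsort(pfns, n, sizeof(pfns[0]), cmp_pfn);
          printf("contiguous runs after sort:  %d\n", count_runs(pfns, n)); /* 1 */
          return 0;
  }

The patch below does the same thing with the kernel's sort() and
page_to_pfn(), restricted to the order-zero tail of the pages[] array.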

Signed-off-by: Douglas Anderson <dianders@chromium.org>
---
Changes in v2:
- Sort patch new for v2 (and optional if people hate it).

 arch/arm/mm/dma-mapping.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 9887d432cf1f..d1b3d3e6fe47 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/memblock.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
 #include <linux/iommu.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
@@ -1122,6 +1123,21 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
 	spin_unlock_irqrestore(&mapping->lock, flags);
 }
 
+static int cmp_pfns(const void *a, const void *b)
+{
+	unsigned long a_pfn;
+	unsigned long b_pfn;
+
+	a_pfn = page_to_pfn(*(struct page **)a);
+	b_pfn = page_to_pfn(*(struct page **)b);
+
+	if (a_pfn < b_pfn)
+		return -1;
+	else if (a_pfn > b_pfn)
+		return 1;
+	return 0;
+}
+
 /* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */
 static const int iommu_order_array[] = { 9, 8, 4, 0 };
 
@@ -1133,6 +1149,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 	int array_size = count * sizeof(struct page *);
 	int i = 0;
 	int order_idx = 0;
+	int first_order_zero = -1;
 
 	if (array_size <= PAGE_SIZE)
 		pages = kzalloc(array_size, GFP_KERNEL);
@@ -1171,6 +1188,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 		/* Drop down when we get small */
 		if (__fls(count) < order) {
 			order_idx++;
+			/* Don't update first_order_zero; no need to sort end */
 			continue;
 		}
 
@@ -1181,6 +1199,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 			/* Go down a notch at first sign of pressure */
 			if (!pages[i]) {
 				order_idx++;
+				if (iommu_order_array[order_idx] == 0)
+					first_order_zero = i;
 				continue;
 			}
 		} else {
@@ -1201,6 +1221,26 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 		count -= 1 << order;
 	}
 
+	/*
+	 * If we folded under memory pressure, make one last-ditch effort to get
+	 * contiguous pages via sorting.  Under testing this sometimes helped
+	 * get a few more contiguous pages and didn't cost much compared to
+	 * the above allocations.
+	 *
+	 * Note that we only sort the order zero pages so that we don't mess
+	 * up the higher order allocations by sticking small pages in between
+	 * them.
+	 *
+	 * If someone wanted to optimize this further, they could insert extra
+	 * (out-of-order) single pages in places to help keep virtual and
+	 * physical pages aligned with each other.  As it is, we often get
+	 * lucky and get the needed alignment, but it's not guaranteed.
+	 */
+	if (first_order_zero >= 0)
+		sort(pages + first_order_zero,
+		     (size >> PAGE_SHIFT) - first_order_zero, sizeof(*pages),
+		     cmp_pfns, NULL);
+
 	return pages;
 error:
 	while (i--)
-- 
2.6.0.rc2.230.g3dd15c0
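
As a rough sketch of the alignment optimization hinted at in the closing
comment of __iommu_alloc_buffer() (illustrative only, not part of the
patch; the helper name pad_pages_for_alignment() and the fixed order-9 /
2MB block size are assumptions), one could compute how many filler pages
a sorted order-zero run would need so that its index in the buffer and
its starting PFN share the same alignment within a large IOMMU block:

  #include <stdio.h>

  #define LARGE_ORDER	9
  #define LARGE_NR	(1UL << LARGE_ORDER)

  /*
   * Pages of padding so that (iova_index + pad) and first_pfn agree
   * modulo LARGE_NR, letting an IOMMU that supports order-9 block
   * mappings cover the run with large pages.
   */
  static unsigned long pad_pages_for_alignment(unsigned long iova_index,
                                               unsigned long first_pfn)
  {
          unsigned long iova_off = iova_index & (LARGE_NR - 1);
          unsigned long phys_off = first_pfn & (LARGE_NR - 1);

          return (phys_off - iova_off) & (LARGE_NR - 1);
  }

  int main(void)
  {
          /* e.g. a run starting at pages[3] in the buffer but at PFN 0x10007 */
          printf("padding pages needed: %lu\n",
                 pad_pages_for_alignment(3, 0x10007));	/* 4 */
          return 0;
  }

A real implementation would also have to allocate pages to fill those
padding slots and account for them when sizing the buffer, which is why
the patch leaves this as a possible future optimization.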

