linux-pci.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Bjorn Helgaas <bhelgaas@google.com>,
	David Miller <davem@davemloft.net>,
	David Ahern <david.ahern@oracle.com>,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	Wei Yang <weiyang@linux.vnet.ibm.com>, TJ <linux@iam.tj>,
	Yijing Wang <wangyijing@huawei.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org,
	Yinghai Lu <yinghai@kernel.org>
Subject: [PATCH 04/36] PCI: Optimize bus align/size calculation during sizing
Date: Mon,  6 Jul 2015 16:38:54 -0700	[thread overview]
Message-ID: <1436225966-27247-5-git-send-email-yinghai@kernel.org> (raw)
In-Reply-To: <1436225966-27247-1-git-send-email-yinghai@kernel.org>

The current code tries to make the alignment as small as possible and
uses that to align the final size. But it does not optimally handle
resources whose size is bigger than their alignment; the kernel only
uses the max alignment for them.

For example:
 when we have resources with align/size: 1M/1M, 512M/512M,
   bus resource min_align/size0 will be 256M/768M.
 when we have resources with align/size: 1M/2M, 512M/512M,
   bus resource min_align/size0 will be 512M/1024M,
   but optimal value should be 256M/768M.

A resource's size is bigger than its alignment in the following
cases:
1. SR-IOV BARs.
2. PCI bridges with several bridges or devices as children.

We can try to allocate the child devices' resources within the range
[half_align, half_align + aligned_size).
If that succeeds, we can use half_align as the new min_align.

After this patch, we get:
 align/size: 1M/2M, 2M/4M, 4M/8M, 8M/16M
 new min_align/min_size: 4M/32M, and old is 8M/32M

 align/size: 1M/2M, 2M/4M, 4M/8M
 new min_align/min_size: 2M/14M, and old is 4M/16M

 align/size: 1M/2M, 512M/512M
 new min_align/min_size: 256M/768M, and old is 512M/1024M

A real result from a system with one PCIe card that has
four functions supporting SR-IOV:
 align/size:
   00800000/00800000
   00800000/00800000
   00800000/00800000
   00800000/00800000
   00010000/00200000
   00010000/00200000
   00010000/00200000
   00010000/00200000
   00008000/00008000
   00008000/00008000
   00008000/00008000
   00008000/00008000
   00004000/00080000
   00004000/00080000
   00004000/00080000
   00004000/00080000
 old min_align/min_size: 00400000/02c00000
 new min_align/min_size: 00100000/02b00000

So align will be 1M instead of 4M.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Reported-by: TJ <linux@iam.tj>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 drivers/pci/setup-bus.c | 195 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 157 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9f4c477..87cf431 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -30,6 +30,34 @@
 
 unsigned int pci_flags;
 
+static inline bool is_before(resource_size_t align1, resource_size_t size1,
+			     resource_size_t align2, resource_size_t size2)
+{
+	resource_size_t size1_left, size2_left;
+
+	/* big align is before small align */
+	if (align1 > align2)
+		return true;
+
+	/*
+	 * for same align:
+	 *   aligned is before not aligned
+	 *   for not aligned, big remainder is before small remainder
+	 */
+	if (align1 == align2) {
+		size1_left = size1 & (align1 - 1);
+		if (!size1_left)
+			size1_left = align1;
+		size2_left = size2 & (align2 - 1);
+		if (!size2_left)
+			size2_left = align2;
+		if (size1_left > size2_left)
+			return true;
+	}
+
+	return false;
+}
+
 struct pci_dev_resource {
 	struct list_head list;
 	struct resource *res;
@@ -994,26 +1022,125 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 	}
 }
 
-static inline resource_size_t calculate_mem_align(resource_size_t *aligns,
-						  int max_order)
+struct align_test_res {
+	struct list_head list;
+	struct resource res;
+	resource_size_t size;
+	resource_size_t align;
+};
+
+static void free_align_test_list(struct list_head *head)
 {
-	resource_size_t align = 0;
-	resource_size_t min_align = 0;
-	int order;
+	struct align_test_res *p, *tmp;
 
-	for (order = 0; order <= max_order; order++) {
-		resource_size_t align1 = 1;
+	list_for_each_entry_safe(p, tmp, head, list) {
+		list_del(&p->list);
+		kfree(p);
+	}
+}
 
-		align1 <<= (order + 20);
+static int add_to_align_test_list(struct list_head *head,
+				  resource_size_t align, resource_size_t size)
+{
+	struct align_test_res *tmp;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	tmp->align = align;
+	tmp->size = size;
+
+	list_add_tail(&tmp->list, head);
+
+	return 0;
+}
+
+static void __sort_align_test(struct list_head *head)
+{
+	struct align_test_res *res1, *tmp_res, *res2;
 
-		if (!align)
-			min_align = align1;
-		else if (ALIGN(align + min_align, min_align) < align1)
-			min_align = align1 >> 1;
-		align += aligns[order];
+	list_for_each_entry_safe(res1, tmp_res, head, list) {
+		/* reorder it */
+		list_for_each_entry(res2, head, list) {
+			if (res2 == res1)
+				break;
+
+			if (is_before(res1->align, res1->size,
+				      res2->align, res2->size)) {
+				list_move_tail(&res1->list, &res2->list);
+				break;
+			}
+		}
+	}
+}
+
+static bool is_align_size_good(struct list_head *head,
+			resource_size_t min_align, resource_size_t size,
+			resource_size_t start)
+{
+	struct align_test_res *p;
+	struct resource root;
+
+	memset(&root, 0, sizeof(root));
+	root.start = start;
+	root.end = start + size - 1;
+
+	list_for_each_entry(p, head, list)
+		memset(&p->res, 0, sizeof(p->res));
+
+	list_for_each_entry(p, head, list)
+		if (allocate_resource(&root, &p->res, p->size,
+				0, (resource_size_t)-1ULL,
+				p->align, NULL, NULL))
+			return false;
+
+	return true;
+}
+
+static resource_size_t calculate_mem_align(struct list_head *head,
+				resource_size_t max_align, resource_size_t size,
+				resource_size_t align_low)
+{
+	struct align_test_res *p;
+	resource_size_t min_align, good_align, aligned_size, start;
+	int count = 0;
+
+	if (max_align <= align_low) {
+		good_align = align_low;
+		goto out;
 	}
 
-	return min_align;
+	good_align = max_align;
+
+	list_for_each_entry(p, head, list)
+		count++;
+
+	if (count <= 1)
+		goto out;
+
+	__sort_align_test(head);
+
+	do {
+		/* check if we can use smaller align */
+		min_align = good_align >> 1;
+		aligned_size = ALIGN(size, min_align);
+
+		/* need to make sure every offset work */
+		for (start = min_align; start < max_align; start += min_align) {
+			/* checked already with last align ? */
+			if (!(start & (good_align - 1)))
+				continue;
+
+			if (!is_align_size_good(head, min_align, aligned_size,
+					       start))
+				goto out;
+		}
+		good_align = min_align;
+	} while (min_align > align_low);
+
+out:
+	return good_align;
 }
 
 /**
@@ -1043,19 +1170,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 {
 	struct pci_dev *dev;
 	resource_size_t min_align, align, size, size0, size1;
-	resource_size_t aligns[18];	/* Alignments from 1Mb to 128Gb */
-	int order, max_order;
+	resource_size_t max_align = 0;
 	struct resource *b_res = find_free_bus_resource(bus,
 					mask | IORESOURCE_PREFETCH, type);
 	resource_size_t children_add_size = 0;
 	resource_size_t children_add_align = 0;
 	resource_size_t add_align = 0;
+	LIST_HEAD(align_test_list);
 
 	if (!b_res)
 		return -ENOSPC;
 
-	memset(aligns, 0, sizeof(aligns));
-	max_order = 0;
 	size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
@@ -1081,29 +1206,20 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 				continue;
 			}
 #endif
-			/*
-			 * aligns[0] is for 1MB (since bridge memory
-			 * windows are always at least 1MB aligned), so
-			 * keep "order" from being negative for smaller
-			 * resources.
-			 */
 			align = pci_resource_alignment(dev, r);
-			order = __ffs(align) - 20;
-			if (order < 0)
-				order = 0;
-			if (order >= ARRAY_SIZE(aligns)) {
+			if (align > (1ULL<<37)) { /*128 Gb*/
 				dev_warn(&dev->dev, "disabling BAR %d: %pR (bad alignment %#llx)\n",
-					 i, r, (unsigned long long) align);
+					i, r, (unsigned long long) align);
 				r->flags = 0;
 				continue;
 			}
+
+			if (r_size > 1)
+				add_to_align_test_list(&align_test_list,
+							align, r_size);
 			size += r_size;
-			/* Exclude ranges with size > align from
-			   calculation of the alignment. */
-			if (r_size == align)
-				aligns[order] += align;
-			if (order > max_order)
-				max_order = order;
+			if (align > max_align)
+				max_align = align;
 
 			if (realloc_head) {
 				children_add_size += get_res_add_size(realloc_head, r);
@@ -1113,9 +1229,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 		}
 	}
 
-	min_align = calculate_mem_align(aligns, max_order);
-	min_align = max(min_align, window_alignment(bus, b_res->flags));
-	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
+	max_align = max(max_align, window_alignment(bus, b_res->flags));
+	min_align = calculate_mem_align(&align_test_list, max_align, size,
+					window_alignment(bus, b_res->flags));
+	size0 = calculate_memsize(size, min_size, 0,
+				  resource_size(b_res), min_align);
+	free_align_test_list(&align_test_list);
 	add_align = max(min_align, add_align);
 	if (children_add_size > add_size)
 		add_size = children_add_size;
-- 
1.8.4.5


  parent reply	other threads:[~2015-07-06 23:41 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-07-06 23:38 [PATCH 00/36] PCI: Resource allocation cleanup for v4.3 Yinghai Lu
2015-07-06 23:38 ` [PATCH 01/36] PCI: Cleanup res_to_dev_res() printout for addon resources Yinghai Lu
2015-07-06 23:38 ` [PATCH 02/36] PCI: Reuse res_to_dev_res in reassign_resources_sorted Yinghai Lu
2015-07-06 23:38 ` [PATCH 03/36] PCI: Use correct align for optional only resources during sorting for resource allocation Yinghai Lu
2015-07-06 23:38 ` Yinghai Lu [this message]
2015-07-06 23:38 ` [PATCH 05/36] PCI: Optimize bus align/size calculation for optional during sizing Yinghai Lu
2015-07-06 23:38 ` [PATCH 06/36] PCI: Reorder resources list for must/optional resources Yinghai Lu
2015-07-06 23:38 ` [PATCH 07/36] PCI: Remove duplicated code for resource sorting Yinghai Lu
2015-07-06 23:38 ` [PATCH 08/36] PCI: Rename pdev_sort_resources to pdev_check_resources Yinghai Lu
2015-07-06 23:38 ` [PATCH 09/36] PCI: Treat ROM resource as optional during realloc Yinghai Lu
2015-07-06 23:39 ` [PATCH 10/36] PCI: Add debug printout during releasing partial must/optinal assigned resources Yinghai Lu
2015-07-06 23:39 ` [PATCH 11/36] PCI: Simplify res reference using in __assign_resourcs_sorted Yinghai Lu
2015-07-06 23:39 ` [PATCH 12/36] PCI: Separate realloc list checking after allocation Yinghai Lu
2015-07-06 23:39 ` [PATCH 13/36] PCI: Add __add_to_list() Yinghai Lu
2015-07-06 23:39 ` [PATCH 14/36] PCI: Separate must_add assigning to another function Yinghai Lu
2015-07-06 23:39 ` [PATCH 15/36] PCI: Bail out early if there is no addon Yinghai Lu
2015-07-06 23:39 ` [PATCH 16/36] PCI: Add alt_size allocation support Yinghai Lu
2015-07-06 23:39 ` [PATCH 17/36] PCI: Add support for more than two alt_size under same bridge Yinghai Lu
2015-07-15  3:07   ` Yijing Wang
2015-07-15  5:08     ` Yinghai Lu
2015-07-15  5:16       ` Yijing Wang
2015-07-06 23:39 ` [PATCH 18/36] PCI: Better support for two alt_size Yinghai Lu
2015-07-06 23:39 ` [PATCH 19/36] resources: Split out __allocate_resource() Yinghai Lu
2015-07-06 23:39 ` [PATCH 20/36] resources: Make allocate_resource return just fit resource Yinghai Lu
2015-07-06 23:39 ` [PATCH 21/36] PCI: Check pref compatible bit for mem64 resource of pcie device Yinghai Lu
2015-07-06 23:39 ` [PATCH 22/36] PCI: Only treat non-pef mmio64 as pref if all bridges has MEM_64 Yinghai Lu
2015-07-06 23:39 ` [PATCH 23/36] PCI: Add has_mem64 for host_bridge Yinghai Lu
2015-07-06 23:39 ` [PATCH 24/36] PCI: Only treat non-pef mmio64 as pref if host-bridge has_mem64 Yinghai Lu
2015-07-06 23:39 ` [PATCH 25/36] PCI: Restore pref mmio allocation logic for hostbridge without mmio64 Yinghai Lu
2015-07-06 23:39 ` [PATCH 26/36] sparc/PCI: Add mem64 resource parsing for root bus Yinghai Lu
2015-07-06 23:39 ` [PATCH 27/36] sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in of parsing Yinghai Lu
2015-07-06 23:39 ` [PATCH 28/36] powerpc/PCI: " Yinghai Lu
2015-07-06 23:39 ` [PATCH 29/36] of/PCI: Add IORESOURCE_MEM_64 for 64-bit resource Yinghai Lu
2015-07-10 20:21   ` Rob Herring
2015-07-06 23:39 ` [PATCH 30/36] PCI: Treat optional as must in first try for bridge rescan Yinghai Lu
2015-07-06 23:39 ` [PATCH 31/36] PCI: Get new realloc size for bridge for last try Yinghai Lu
2015-07-06 23:39 ` [PATCH 32/36] PCI: Don't release sibiling bridge resources during hotplug Yinghai Lu
2015-07-06 23:39 ` [PATCH 33/36] PCI: Don't release fixed resource for realloc Yinghai Lu
2015-07-06 23:39 ` [PATCH 34/36] PCI: Set resource to FIXED for lsi devices Yinghai Lu
2015-07-06 23:39 ` [PATCH 35/36] PCI, x86: Add pci=assign_pref_bars to re-allocate pref bars Yinghai Lu
2015-07-06 23:39 ` [PATCH 36/36] PCI: Don't set flags to 0 when assign resource fail Yinghai Lu
2015-07-09  3:30   ` Wei Yang
2015-07-09  5:01     ` Yinghai Lu
2015-07-09  6:04       ` Wei Yang
2015-07-09 16:20         ` Yinghai Lu
2015-07-10  2:30           ` Wei Yang
2015-07-10  2:48             ` Yinghai Lu
2015-07-10  5:49               ` Yinghai Lu
2015-07-11  0:03                 ` Wei Yang
2015-07-11  0:42                   ` Yinghai Lu
2015-07-11  1:37                     ` Wei Yang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1436225966-27247-5-git-send-email-yinghai@kernel.org \
    --to=yinghai@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=benh@kernel.crashing.org \
    --cc=bhelgaas@google.com \
    --cc=davem@davemloft.net \
    --cc=david.ahern@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux@iam.tj \
    --cc=wangyijing@huawei.com \
    --cc=weiyang@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).