linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yinghai Lu <yinghai@kernel.org>
To: Bjorn Helgaas <bhelgaas@google.com>,
	David Miller <davem@davemloft.net>,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	Linus Torvalds <torvalds@linux-foundation.org>
Cc: Wei Yang <weiyang@linux.vnet.ibm.com>, TJ <linux@iam.tj>,
	Yijing Wang <wangyijing@huawei.com>,
	Khalid Aziz <khalid.aziz@oracle.com>,
	linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org,
	Yinghai Lu <yinghai@kernel.org>
Subject: [PATCH v11 27/60] PCI: Optimize bus min_align/size calculation during sizing
Date: Thu,  7 Apr 2016 17:15:40 -0700	[thread overview]
Message-ID: <1460074573-7481-28-git-send-email-yinghai@kernel.org> (raw)
In-Reply-To: <1460074573-7481-1-git-send-email-yinghai@kernel.org>

During the bus MMIO resource sizing stage, the current code tries to get the
alignment as small as possible and uses that to align the size to get the
final size. But it does not handle resources whose size is bigger than their
alignment in an optimal way; the kernel only uses the max alignment for them.

For example:
 When we have resources with align/size: 1M/2M, 512M/512M,
 current code will have bus resource min_align/size: 512M/1024M,
 but optimal value should be 256M/768M, as we can fit them into
 [256M,1024M) or [512M,1280M) instead of [512M,1536M).

 0M        256M        512M       768M       1024M      1280M
 |----------|-----------|----------|----------|----------|----------|
when we have [256M,1024M)
            |---------------------------------|
            |-2M-|      |---512M--------------|
when we have [512M,1280M)
                        |--------------------------------|
                        |---512M--------------|-2M-|

The following cases have a resource size that is bigger than the
resource alignment:
1. SRIOV bar.
2. PCI bridges with children that need several MMIOs that are more than 1M.

We can keep trying to allocate the children devices' resources from the range
[offset, offset + aligned_size), where offset is aligned to half of min_align.
If that succeeds, we can use that half min_align as the new min_align.

After this patch, we get:
 align/size: 1M/2M, 2M/4M, 4M/8M, 8M/16M
 new min_align/min_size: 4M/32M, and original is 8M/32M

 align/size: 1M/2M, 2M/4M, 4M/8M
 new min_align/min_size: 2M/14M, and original is 4M/16M

 align/size: 1M/2M, 512M/512M
 new min_align/min_size: 256M/768M, and original is 512M/1024M

The real result from one system with one PCIe card that has
four functions that support SR-IOV:
 children resources with align/size:
   00800000/00800000, 00800000/00800000, 00800000/00800000,
   00800000/00800000, 00010000/00200000, 00010000/00200000,
   00010000/00200000, 00010000/00200000, 00008000/00008000,
   00008000/00008000, 00008000/00008000, 00008000/00008000,
   00004000/00080000, 00004000/00080000, 00004000/00080000,
   00004000/00080000
for the bridge:
With original code we have min_align/min_size: 00400000/02c00000,
and with this patch we have min_align/min_size: 00100000/02b00000
So min_align will be 1M instead of 4M and we even have smaller size.

-v2: Need to check more offset with every min_alignment.
-v3: skip r_size <= 1 for optional only bridge resources.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=81431
Reported-by: TJ <linux@iam.tj>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
---
 drivers/pci/setup-bus.c | 195 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 157 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 544f518..3051bb7 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -29,6 +29,34 @@
 
 unsigned int pci_flags;
 
+static inline bool is_before(resource_size_t align1, resource_size_t size1,
+			     resource_size_t align2, resource_size_t size2)
+{
+	resource_size_t size1_left, size2_left;
+
+	/* big align is before small align */
+	if (align1 > align2)
+		return true;
+
+	/*
+	 * for same align:
+	 *   aligned is before not aligned
+	 *   for not aligned, big remainder is before small remainder
+	 */
+	if (align1 == align2) {
+		size1_left = size1 & (align1 - 1);
+		if (!size1_left)
+			size1_left = align1;
+		size2_left = size2 & (align2 - 1);
+		if (!size2_left)
+			size2_left = align2;
+		if (size1_left > size2_left)
+			return true;
+	}
+
+	return false;
+}
+
 struct pci_dev_resource {
 	struct list_head list;
 	struct resource *res;
@@ -1041,26 +1069,125 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 	}
 }
 
-static inline resource_size_t calculate_mem_align(resource_size_t *aligns,
-						  int max_order)
+struct align_test_res {
+	struct list_head list;
+	struct resource res;
+	resource_size_t size;
+	resource_size_t align;
+};
+
+static void free_align_test_list(struct list_head *head)
 {
-	resource_size_t align = 0;
-	resource_size_t min_align = 0;
-	int order;
+	struct align_test_res *p, *tmp;
 
-	for (order = 0; order <= max_order; order++) {
-		resource_size_t align1 = 1;
+	list_for_each_entry_safe(p, tmp, head, list) {
+		list_del(&p->list);
+		kfree(p);
+	}
+}
 
-		align1 <<= (order + 20);
+static int add_to_align_test_list(struct list_head *head,
+				  resource_size_t align, resource_size_t size)
+{
+	struct align_test_res *tmp;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	tmp->align = align;
+	tmp->size = size;
+
+	list_add_tail(&tmp->list, head);
+
+	return 0;
+}
+
+static void sort_align_test(struct list_head *head)
+{
+	struct align_test_res *res1, *tmp_res, *res2;
 
-		if (!align)
-			min_align = align1;
-		else if (ALIGN(align + min_align, min_align) < align1)
-			min_align = align1 >> 1;
-		align += aligns[order];
+	list_for_each_entry_safe(res1, tmp_res, head, list) {
+		/* reorder it */
+		list_for_each_entry(res2, head, list) {
+			if (res2 == res1)
+				break;
+
+			if (is_before(res1->align, res1->size,
+				      res2->align, res2->size)) {
+				list_move_tail(&res1->list, &res2->list);
+				break;
+			}
+		}
+	}
+}
+
+static bool is_align_size_good(struct list_head *head,
+			resource_size_t min_align, resource_size_t size,
+			resource_size_t start)
+{
+	struct align_test_res *p;
+	struct resource root;
+
+	memset(&root, 0, sizeof(root));
+	root.start = start;
+	root.end = start + size - 1;
+
+	list_for_each_entry(p, head, list)
+		memset(&p->res, 0, sizeof(p->res));
+
+	list_for_each_entry(p, head, list)
+		if (allocate_resource(&root, &p->res, p->size,
+				0, (resource_size_t)-1ULL,
+				p->align, NULL, NULL))
+			return false;
+
+	return true;
+}
+
+static resource_size_t calculate_mem_align(struct list_head *head,
+				resource_size_t max_align, resource_size_t size,
+				resource_size_t align_low)
+{
+	struct align_test_res *p;
+	resource_size_t min_align, good_align, aligned_size, start;
+	int count = 0;
+
+	if (max_align <= align_low) {
+		good_align = align_low;
+		goto out;
 	}
 
-	return min_align;
+	good_align = max_align;
+
+	list_for_each_entry(p, head, list)
+		count++;
+
+	if (count <= 1)
+		goto out;
+
+	sort_align_test(head);
+
+	do {
+		/* check if we can use smaller align */
+		min_align = good_align >> 1;
+		aligned_size = ALIGN(size, min_align);
+
+		/* need to make sure every offset work */
+		for (start = min_align; start < max_align; start += min_align) {
+			/* checked already with last align ? */
+			if (!(start & (good_align - 1)))
+				continue;
+
+			if (!is_align_size_good(head, min_align, aligned_size,
+					       start))
+				goto out;
+		}
+		good_align = min_align;
+	} while (min_align > align_low);
+
+out:
+	return good_align;
 }
 
 /**
@@ -1090,19 +1217,17 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 {
 	struct pci_dev *dev;
 	resource_size_t min_align, align, size, size0, size1;
-	resource_size_t aligns[18];	/* Alignments from 1Mb to 128Gb */
-	int order, max_order;
+	resource_size_t max_align = 0;
 	struct resource *b_res = find_free_bus_resource(bus,
 					mask | IORESOURCE_PREFETCH, type);
 	resource_size_t children_add_size = 0;
 	resource_size_t children_add_align = 0;
 	resource_size_t add_align = 0;
+	LIST_HEAD(align_test_list);
 
 	if (!b_res)
 		return -ENOSPC;
 
-	memset(aligns, 0, sizeof(aligns));
-	max_order = 0;
 	size = 0;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
@@ -1130,29 +1255,20 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 				continue;
 			}
 #endif
-			/*
-			 * aligns[0] is for 1MB (since bridge memory
-			 * windows are always at least 1MB aligned), so
-			 * keep "order" from being negative for smaller
-			 * resources.
-			 */
 			align = pci_resource_alignment(dev, r);
-			order = __ffs(align) - 20;
-			if (order < 0)
-				order = 0;
-			if (order >= ARRAY_SIZE(aligns)) {
+			if (align > (1ULL<<37)) { /*128 Gb*/
 				dev_warn(&dev->dev, "disabling BAR %d: %pR (bad alignment %#llx)\n",
-					 i, r, (unsigned long long) align);
+					i, r, (unsigned long long) align);
 				r->flags = 0;
 				continue;
 			}
+
+			if (r_size > 1)
+				add_to_align_test_list(&align_test_list,
+							align, r_size);
 			size += r_size;
-			/* Exclude ranges with size > align from
-			   calculation of the alignment. */
-			if (r_size == align)
-				aligns[order] += align;
-			if (order > max_order)
-				max_order = order;
+			if (align > max_align)
+				max_align = align;
 
 			if (realloc_head) {
 				children_add_size += get_res_add_size(realloc_head, r);
@@ -1162,9 +1278,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 		}
 	}
 
-	min_align = calculate_mem_align(aligns, max_order);
-	min_align = max(min_align, window_alignment(bus, b_res->flags));
-	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
+	max_align = max(max_align, window_alignment(bus, b_res->flags));
+	min_align = calculate_mem_align(&align_test_list, max_align, size,
+					window_alignment(bus, b_res->flags));
+	size0 = calculate_memsize(size, min_size, 0,
+				  resource_size(b_res), min_align);
+	free_align_test_list(&align_test_list);
 	add_align = max(min_align, add_align);
 	if (children_add_size > add_size)
 		add_size = children_add_size;
-- 
1.8.4.5

  parent reply	other threads:[~2016-04-08  0:29 UTC|newest]

Thread overview: 86+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-04-08  0:15 [PATCH v11 00/60] PCI: Resource allocation cleanup for v4.7 Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 01/60] PCI: Fix iomem_is_exclusive() checking in pci_mmap_resource() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 02/60] alpha/PCI: Only check iomem_is_exclusive() for IORESOURCE_MEM, not IORESOURCE_IO Yinghai Lu
2016-04-25 21:01   ` Bjorn Helgaas
2016-04-08  0:15 ` [PATCH v11 03/60] PCI: Add pci_find_bus_resource() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 04/60] sparc/PCI: Use correct offset for bus address to resource Yinghai Lu
2016-04-22 20:49   ` Bjorn Helgaas
2016-04-28  4:55     ` Yinghai Lu
2016-04-28 13:56       ` Bjorn Helgaas
2016-04-29  7:19         ` Yinghai Lu
2016-05-03 22:52           ` Yinghai Lu
2016-05-04  0:37             ` Benjamin Herrenschmidt
2016-05-04  1:25               ` Bjorn Helgaas
2016-05-04  5:08                 ` Yinghai Lu
2016-05-04  5:52                   ` Yinghai Lu
2016-05-04 15:17                     ` Bjorn Helgaas
2016-05-04 18:46                       ` Yinghai Lu
2016-05-05  0:25                         ` Yinghai Lu
2016-05-05 15:53                           ` Yinghai Lu
2016-05-05 22:02                             ` Benjamin Herrenschmidt
2016-05-06  0:56                               ` Yinghai Lu
2016-05-06  4:18                                 ` Yinghai Lu
2016-05-06 18:26                             ` Bjorn Helgaas
2016-05-10  6:18                               ` Yinghai Lu
2016-05-04  4:17               ` David Miller
2016-04-08  0:15 ` [PATCH v11 05/60] sparc/PCI: Reserve legacy mmio after PCI mmio Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 06/60] sparc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in OF parsing Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 07/60] sparc/PCI: Keep resource idx order with bridge register number Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 08/60] PCI: Kill wrong quirk about M7101 Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 09/60] powerpc/PCI: Keep resource idx order with bridge register number Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 10/60] powerpc/PCI: Add IORESOURCE_MEM_64 for 64-bit resource in OF parsing Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 11/60] OF/PCI: Add IORESOURCE_MEM_64 for 64-bit resource Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 12/60] PCI: Check pref compatible bit for mem64 resource of PCIe device Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 13/60] PCI: Only treat non-pref mmio64 as pref if all bridges have MEM_64 Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 14/60] PCI: Add has_mem64 for struct host_bridge Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 15/60] PCI: Only treat non-pref mmio64 as pref if host bridge has mmio64 Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 16/60] PCI: Restore pref MMIO allocation logic for host bridge without mmio64 Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 17/60] PCI: Don't release fixed resource for realloc Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 18/60] PCI: Claim fixed resource during remove/rescan path Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 19/60] PCI: Set resource to FIXED for LSI devices Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 20/60] PCI: Separate realloc list checking after allocation Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 21/60] PCI: Treat optional as required in first try for bridge rescan Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 22/60] PCI: Get new realloc size for bridge for last try Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 23/60] PCI: Don't release sibling bridge resources during hotplug Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 24/60] PCI: Cleanup res_to_dev_res() printout Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 25/60] PCI: Reuse res_to_dev_res() in reassign_resources_sorted() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 26/60] PCI: Use correct align for optional only resources during sorting Yinghai Lu
2016-04-08  0:15 ` Yinghai Lu [this message]
2016-04-08  0:15 ` [PATCH v11 28/60] PCI: Optimize bus align/size calculation for optional during sizing Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 29/60] PCI: Don't add too much optional size for hotplug bridge MMIO Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 30/60] PCI: Reorder resources list for required/optional resources Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 31/60] PCI: Remove duplicated code for resource sorting Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 32/60] PCI: Rename pdev_sort_resources() to pdev_assign_resources_prepare() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 33/60] PCI: Treat ROM resource as optional during realloc Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 34/60] PCI: Add debug printout during releasing partial assigned resources Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 35/60] PCI: Simplify res reference using in __assign_resources_sorted() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 36/60] PCI: Add __add_to_list() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 37/60] PCI: Cache window alignment value during bus sizing Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 38/60] PCI: Check if resource is allocated before trying to assign one Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 39/60] PCI: Separate out save_resources()/restore_resources() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 40/60] PCI: Move comment to pci_need_to_release() Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 41/60] PCI: Separate required+optional assigning to another function Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 42/60] PCI: Skip required+optional if there is no optional Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 43/60] PCI: Move saved required resource list out of required+optional assigning Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 44/60] PCI: Add alt_size ressource allocation support Yinghai Lu
2016-04-08  0:56   ` Linus Torvalds
2016-04-08  5:50     ` Yinghai Lu
2016-04-08  6:24     ` Benjamin Herrenschmidt
2016-04-08  0:15 ` [PATCH v11 45/60] PCI: Add support for more than two alt_size entries under same bridge Yinghai Lu
2016-04-08  0:15 ` [PATCH v11 46/60] PCI: Fix size calculation with old_size on rescan path Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 47/60] PCI: Don't add too much optional size for hotplug bridge io Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 48/60] PCI: Move ISA io port align out of calculate_iosize() Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 49/60] PCI: Don't add too much io port for hotplug bridge with old size Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 50/60] PCI: Unify calculate_size() for io port and MMIO Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 51/60] PCI: Allow bridge optional only io port resource required size to be 0 Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 52/60] PCI: Unify skip_ioresource_align() Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 53/60] PCI: Kill macro checking for bus io port sizing Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 54/60] resources: Make allocate_resource() return best fit resource Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 55/60] PCI, x86: Allocate from high in available window for MMIO Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 56/60] PCI: Add debug print out for min_align and alt_size Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 57/60] PCI, x86: Add pci=assign_pref_bars to reallocate pref BARs Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 58/60] PCI: Introduce resource_disabled() Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 59/60] PCI: Don't set flags to 0 when assign resource fail Yinghai Lu
2016-04-08  0:16 ` [PATCH v11 60/60] PCI: Only try to assign io port only for root bus that support it Yinghai Lu
2016-04-08  0:51 ` [PATCH v11 00/60] PCI: Resource allocation cleanup for v4.7 Linus Torvalds
2016-04-09  5:29   ` Yinghai Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1460074573-7481-28-git-send-email-yinghai@kernel.org \
    --to=yinghai@kernel.org \
    --cc=benh@kernel.crashing.org \
    --cc=bhelgaas@google.com \
    --cc=davem@davemloft.net \
    --cc=khalid.aziz@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux@iam.tj \
    --cc=torvalds@linux-foundation.org \
    --cc=wangyijing@huawei.com \
    --cc=weiyang@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).