All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>,
	Alistair Popple <alistair@popple.id.au>,
	Oliver O'Halloran <oohall@gmail.com>,
	Sam Bobroff <sbobroff@linux.ibm.com>,
	David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH kernel v3 2/3] powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window
Date: Thu, 30 May 2019 17:03:54 +1000	[thread overview]
Message-ID: <20190530070355.121802-3-aik@ozlabs.ru> (raw)
In-Reply-To: <20190530070355.121802-1-aik@ozlabs.ru>

We allocate only the first level of multilevel TCE tables for KVM
already (alloc_userspace_copy==true), and the rest is allocated on demand.
This is not enabled though for baremetal.

This removes the KVM limitation (implicit, via the alloc_userspace_copy
parameter) and always allocates just the first level. The on-demand
allocation of missing levels is already implemented.

As from now on DMA map might happen with disabled interrupts, this
allocates TCEs with GFP_ATOMIC.

To save time when creating a new clean table, this skips non-allocated
indirect TCE entries in pnv_tce_free just like we already do in
the VFIO IOMMU TCE driver.

This changes the default level number from 1 to 2 to reduce the amount
of memory required for the default 32bit DMA window at the boot time.
The default window size is up to 2GB which requires 4MB of TCEs which is
unlikely to be used entirely or at all as most devices these days are
64bit capable so by switching to 2 levels by default we save 4032KB of
RAM per a device.

While at this, add __GFP_NOWARN to alloc_pages_node() as the userspace
can trigger this path via VFIO, see the failure and try creating a table
again with different parameters which might succeed.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* added __GFP_NOWARN to alloc_pages_node
---
 arch/powerpc/platforms/powernv/pci.h          |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 1a51e7bfc541..a55dabc8d057 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -225,7 +225,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
 		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_DEFAULT_LEVELS	2
 #define POWERNV_IOMMU_MAX_LEVELS	5
 
 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..c75ec37bf0cd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
 	struct page *tce_mem = NULL;
 	__be64 *addr;
 
-	tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+			shift - PAGE_SHIFT);
 	if (!tce_mem) {
 		pr_err("Failed to allocate a TCE memory, level shift=%d\n",
 				shift);
@@ -161,6 +162,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 		if (ptce)
 			*ptce = cpu_to_be64(0);
+		else
+			/* Skip the rest of the level */
+			i |= tbl->it_level_size - 1;
 	}
 }
 
@@ -260,7 +264,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
 			PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
-	unsigned int tmplevels = levels;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
@@ -268,9 +271,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
-	if (alloc_userspace_copy && (window_size > (1ULL << 32)))
-		tmplevels = 1;
-
 	/* Adjust direct table size from window_size and levels */
 	entries_shift = (entries_shift + levels - 1) / levels;
 	level_shift = entries_shift + 3;
@@ -281,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	/* Allocate TCE table */
 	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-			tmplevels, tce_table_size, &offset, &total_allocated);
+			1, tce_table_size, &offset, &total_allocated);
 
 	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
@@ -292,18 +292,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	 * we did not allocate as much as we wanted,
 	 * release partially allocated table.
 	 */
-	if (tmplevels == levels && offset < tce_table_size)
+	if (levels == 1 && offset < tce_table_size)
 		goto free_tces_exit;
 
 	/* Allocate userspace view of the TCE table */
 	if (alloc_userspace_copy) {
 		offset = 0;
 		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-				tmplevels, tce_table_size, &offset,
+				1, tce_table_size, &offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
-		if (tmplevels == levels && (offset < tce_table_size ||
+		if (levels == 1 && (offset < tce_table_size ||
 				total_allocated_uas != total_allocated))
 			goto free_uas_exit;
 	}
@@ -318,7 +318,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 
 	pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
 			window_size, tce_table_size, bus_offset, tbl->it_base,
-			tbl->it_userspace, tmplevels, levels);
+			tbl->it_userspace, 1, levels);
 
 	return 0;
 
-- 
2.17.1


  parent reply	other threads:[~2019-05-30  7:08 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-30  7:03 [PATCH kernel v3 0/3] powerpc/ioda2: Yet another attempt to allow DMA masks between 32 and 59 Alexey Kardashevskiy
2019-05-30  7:03 ` [PATCH kernel v3 1/3] powerpc/iommu: Allow bypass-only for DMA Alexey Kardashevskiy
2019-06-03  2:03   ` David Gibson
2019-05-30  7:03 ` Alexey Kardashevskiy [this message]
2019-07-08  7:01   ` [PATCH kernel v3 2/3] powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window alistair
2019-05-30  7:03 ` [PATCH kernel v3 3/3] powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages Alexey Kardashevskiy
2019-07-08  7:01   ` alistair
2019-07-09  0:57     ` Alexey Kardashevskiy
2019-06-06  4:11 ` [PATCH kernel v3 0/3] powerpc/ioda2: Yet another attempt to allow DMA masks between 32 and 59 Shawn Anastasio
2019-06-06  7:17   ` Alistair Popple
2019-06-06 12:07     ` Oliver
2019-06-07  1:41       ` Alistair Popple
2019-06-10  5:19         ` Alexey Kardashevskiy
2019-06-12  5:05   ` Shawn Anastasio
2019-06-12  6:16     ` Oliver O'Halloran
2019-06-12 19:14       ` Shawn Anastasio
2019-06-12  7:07     ` Alexey Kardashevskiy
2019-06-12 19:15       ` Shawn Anastasio
2019-06-18  4:26         ` Shawn Anastasio
2019-06-18  6:39           ` Alexey Kardashevskiy
2019-06-18  7:00             ` Shawn Anastasio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190530070355.121802-3-aik@ozlabs.ru \
    --to=aik@ozlabs.ru \
    --cc=alistair@popple.id.au \
    --cc=david@gibson.dropbear.id.au \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=oohall@gmail.com \
    --cc=sbobroff@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.