IOMMU Archive on lore.kernel.org
 help / color / Atom feed
From: Lu Baolu <baolu.lu@linux.intel.com>
To: Joerg Roedel <joro@8bytes.org>,
	David Woodhouse <dwmw2@infradead.org>,
	Alex Williamson <alex.williamson@redhat.com>
Cc: kevin.tian@intel.com, Yi Sun <yi.y.sun@linux.intel.com>,
	ashok.raj@intel.com, kvm@vger.kernel.org,
	sanjay.k.kumar@intel.com, iommu@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, yi.y.sun@intel.com
Subject: [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces
Date: Mon, 23 Sep 2019 20:24:52 +0800
Message-ID: <20190923122454.9888-3-baolu.lu@linux.intel.com> (raw)
In-Reply-To: <20190923122454.9888-1-baolu.lu@linux.intel.com>

This adds functions to manipulate first level page tables
which could be used by a scalale mode capable IOMMU unit.

intel_mmmap_range(domain, addr, end, phys_addr, prot)
 - Map an iova range of [addr, end) to the physical memory
   started at @phys_addr with the @prot permissions.

intel_mmunmap_range(domain, addr, end)
 - Tear down the map of an iova range [addr, end). A page
   list will be returned which will be freed after iotlb
   flushing.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Liu Yi L <yi.l.liu@intel.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/Makefile             |   2 +-
 drivers/iommu/intel-pgtable.c      | 342 +++++++++++++++++++++++++++++
 include/linux/intel-iommu.h        |  24 +-
 include/trace/events/intel_iommu.h |  60 +++++
 4 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/intel-pgtable.c

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 4f405f926e73..dc550e14cc58 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c
new file mode 100644
index 000000000000..8e95978cd381
--- /dev/null
+++ b/drivers/iommu/intel-pgtable.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pgtable.c - Intel IOMMU page table manipulation library
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt)     "DMAR: " fmt
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/intel-iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <trace/events/intel_iommu.h>
+
+#ifdef CONFIG_X86
+/*
+ * mmmap: Map a range of IO virtual address to physical addresses.
+ */
+#define pgtable_populate(domain, nm)					\
+do {									\
+	void *__new = alloc_pgtable_page(domain->nid);			\
+	if (!__new)							\
+		return -ENOMEM;						\
+	smp_wmb();							\
+	spin_lock(&(domain)->page_table_lock);				\
+	if (nm ## _present(*nm)) {					\
+		free_pgtable_page(__new);				\
+	} else {							\
+		set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE));	\
+		domain_flush_cache(domain, nm, sizeof(nm##_t));		\
+	}								\
+	spin_unlock(&(domain)->page_table_lock);			\
+} while(0);
+
+static int
+mmmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	pte_t *pte, *first_pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	if (unlikely(pmd_none(*pmd)))
+		pgtable_populate(domain, pmd);
+
+	first_pte = pte = pte_offset_kernel(pmd, addr);
+
+	do {
+		set_pte(pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	return 0;
+}
+
+static int
+mmmap_pmd_range(struct dmar_domain *domain, pud_t *pud, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pmd_t *pmd;
+
+	if (unlikely(pud_none(*pud)))
+		pgtable_populate(domain, pud);
+	pmd = pmd_offset(pud, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (mmmap_pte_range(domain, pmd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	if (unlikely(p4d_none(*p4d)))
+		pgtable_populate(domain, p4d);
+
+	pud = pud_offset(p4d, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pud_addr_end(addr, end);
+		if (mmmap_pmd_range(domain, pud, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+mmmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	p4d_t *p4d;
+
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && unlikely(pgd_none(*pgd)))
+		pgtable_populate(domain, pgd);
+
+	p4d = p4d_offset(pgd, addr);
+
+	phys_addr -= addr;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (mmmap_pud_range(domain, p4d, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
+
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	unsigned long next;
+	pgprot_t prot;
+	pgd_t *pgd;
+
+	trace_domain_mm_map(domain, addr, end, phys_addr);
+
+	/*
+	 * There is no PAGE_KERNEL_WO for a pte entry, so let's use RW
+	 * for a pte that requires write operation.
+	 */
+	prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO;
+	BUG_ON(addr >= end);
+
+	phys_addr -= addr;
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (mmmap_p4d_range(domain, pgd, addr, next,
+				    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pgd++, addr = next, addr != end);
+
+	return 0;
+}
+
+/*
+ * mmunmap: Unmap an existing mapping between a range of IO vitual address
+ *          and physical addresses.
+ */
+static struct page *
+mmunmap_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	unsigned long start;
+	pte_t *pte, *first_pte;
+
+	start = addr;
+	pte = pte_offset_kernel(pmd, addr);
+	first_pte = pte;
+	do {
+		set_pte(pte, __pte(0));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pte_page;
+
+		pte = (pte_t *)pmd_page_vaddr(*pmd);
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			if (!pte || !pte_none(pte[i]))
+				goto pte_out;
+
+		pte_page = pmd_page(*pmd);
+		pte_page->freelist = freelist;
+		freelist = pte_page;
+		pmd_clear(pmd);
+		domain_flush_cache(domain, pmd, sizeof(pmd_t));
+	}
+
+pte_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pmd_range(struct dmar_domain *domain, pud_t *pud,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long start, next;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		freelist = mmunmap_pte_range(domain, pmd, addr, next,
+					     freelist, reclaim);
+	} while (pmd++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pmd_page;
+
+		pmd = (pmd_t *)pud_page_vaddr(*pud);
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			if (!pmd || !pmd_none(pmd[i]))
+				goto pmd_out;
+
+		pmd_page = pud_page(*pud);
+		pmd_page->freelist = freelist;
+		freelist = pmd_page;
+		pud_clear(pud);
+		domain_flush_cache(domain, pud, sizeof(pud_t));
+	}
+
+pmd_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	int i;
+	pud_t *pud;
+	unsigned long start, next;
+
+	start = addr;
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		freelist = mmunmap_pmd_range(domain, pud, addr, next,
+					     freelist, reclaim);
+	} while (pud++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (reclaim) {
+		struct page *pud_page;
+
+		pud = (pud_t *)p4d_page_vaddr(*p4d);
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			if (!pud || !pud_none(pud[i]))
+				goto pud_out;
+
+		pud_page = p4d_page(*p4d);
+		pud_page->freelist = freelist;
+		freelist = pud_page;
+		p4d_clear(p4d);
+		domain_flush_cache(domain, p4d, sizeof(p4d_t));
+	}
+
+pud_out:
+	return freelist;
+}
+
+static struct page *
+mmunmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+		  unsigned long addr, unsigned long end,
+		  struct page *freelist, bool reclaim)
+{
+	p4d_t *p4d;
+	unsigned long start, next;
+
+	start = addr;
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		freelist = mmunmap_pud_range(domain, p4d, addr, next,
+					     freelist, reclaim);
+	} while (p4d++, addr = next, addr != end);
+
+	/* Add page to free list if all entries are empty. */
+	if (cpu_feature_enabled(X86_FEATURE_LA57) && reclaim) {
+		struct page *p4d_page;
+		int i;
+
+		p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+		for (i = 0; i < PTRS_PER_P4D; i++)
+			if (!p4d || !p4d_none(p4d[i]))
+				goto p4d_out;
+
+		p4d_page = pgd_page(*pgd);
+		p4d_page->freelist = freelist;
+		freelist = p4d_page;
+		pgd_clear(pgd);
+		domain_flush_cache(domain, pgd, sizeof(pgd_t));
+	}
+
+p4d_out:
+	return freelist;
+}
+
+struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	struct page *freelist = NULL;
+
+	trace_domain_mm_unmap(domain, addr, end);
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		freelist = mmunmap_p4d_range(domain, pgd, addr, next,
+					     freelist, !addr);
+	} while (pgd++, addr = next, addr != end);
+
+	return freelist;
+}
+#endif /* CONFIG_X86 */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3ee694d4f361..044a91fa5431 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -489,7 +489,8 @@ struct dmar_domain {
 	struct list_head auxd;		/* link to device's auxiliary list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
 
-	struct dma_pte	*pgd;		/* virtual address */
+	void		*pgd;		/* virtual address */
+	spinlock_t page_table_lock;	/* Protects page tables */
 	int		gaw;		/* max guest address width */
 
 	/* adjusted guest address width, 0 is level 2 30-bit */
@@ -662,6 +663,27 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
 
+#ifdef CONFIG_X86
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		      unsigned long end, phys_addr_t phys_addr, int dma_prot);
+struct page *intel_mmunmap_range(struct dmar_domain *domain,
+				 unsigned long addr, unsigned long end);
+#else
+static inline int
+intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+		  unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	return -ENODEV;
+}
+
+static inline struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+		    unsigned long addr, unsigned long end)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 int intel_svm_init(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index 54e61d456cdf..e8c95290fd13 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single,
 	TP_ARGS(dev, dev_addr, size)
 );
 
+DECLARE_EVENT_CLASS(domain_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+		__field(phys_addr_t, phys_addr)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+		__entry->phys_addr = phys_addr;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx",
+		  __entry->domain, __entry->addr, __entry->end,
+		  (unsigned long long)__entry->phys_addr)
+);
+
+DEFINE_EVENT(domain_map, domain_mm_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr)
+);
+
+DECLARE_EVENT_CLASS(domain_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx",
+		  __entry->domain, __entry->addr, __entry->end)
+);
+
+DEFINE_EVENT(domain_unmap, domain_mm_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end)
+);
 #endif /* _TRACE_INTEL_IOMMU_H */
 
 /* This part must be outside protection */
-- 
2.17.1

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

  parent reply index

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-23 12:24 [RFC PATCH 0/4] Use 1st-level for DMA remapping in guest Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 1/4] iommu/vt-d: Move domain_flush_cache helper into header Lu Baolu
2019-09-23 12:24 ` Lu Baolu [this message]
2019-09-23 20:31   ` [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces Raj, Ashok
2019-09-24  1:38     ` Lu Baolu
2019-09-25  4:30       ` Peter Xu
2019-09-25  4:38         ` Tian, Kevin
2019-09-25  5:24           ` Peter Xu
2019-09-25  6:52             ` Lu Baolu
2019-09-25  7:32               ` Tian, Kevin
2019-09-25  8:35                 ` Peter Xu
2019-09-26  1:42                 ` Lu Baolu
2019-09-25  5:21   ` Peter Xu
2019-09-26  2:35     ` Lu Baolu
2019-09-26  3:49       ` Peter Xu
2019-09-27  2:27         ` Lu Baolu
2019-09-27  5:34           ` Peter Xu
2019-09-28  8:23             ` Lu Baolu
2019-09-29  5:25               ` Peter Xu
2019-10-08  2:20                 ` Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 3/4] iommu/vt-d: Map/unmap domain with mmmap/mmunmap Lu Baolu
2019-09-25  5:00   ` Tian, Kevin
2019-09-25  7:06     ` Lu Baolu
2019-09-23 12:24 ` [RFC PATCH 4/4] iommu/vt-d: Identify domains using first level page table Lu Baolu
2019-09-25  6:50   ` Peter Xu
2019-09-25  7:35     ` Tian, Kevin
2019-09-23 19:27 ` [RFC PATCH 0/4] Use 1st-level for DMA remapping in guest Jacob Pan
2019-09-23 20:25   ` Raj, Ashok
2019-09-24  4:40     ` Lu Baolu
2019-09-24  7:00     ` Tian, Kevin
2019-09-25  2:48       ` Lu Baolu
2019-09-25  6:56         ` Peter Xu
2019-09-25  7:21           ` Tian, Kevin
2019-09-25  7:45             ` Peter Xu
2019-09-25  8:02               ` Tian, Kevin
2019-09-25  8:52                 ` Peter Xu
2019-09-26  1:37                   ` Lu Baolu
2019-09-24  4:27   ` Lu Baolu

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190923122454.9888-3-baolu.lu@linux.intel.com \
    --to=baolu.lu@linux.intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=ashok.raj@intel.com \
    --cc=dwmw2@infradead.org \
    --cc=iommu@lists.linux-foundation.org \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sanjay.k.kumar@intel.com \
    --cc=yi.y.sun@intel.com \
    --cc=yi.y.sun@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

IOMMU Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-iommu/0 linux-iommu/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-iommu linux-iommu/ https://lore.kernel.org/linux-iommu \
		iommu@lists.linux-foundation.org
	public-inbox-index linux-iommu

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.linux-foundation.lists.iommu


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git