On Tue, Nov 13, 2018 at 07:28:05PM +1100, Alexey Kardashevskiy wrote:
> This new memory does not have page structs as it is not plugged into
> the host, so gup() will fail on it anyway.
>
> This adds 2 helpers:
> - mm_iommu_newdev() to preregister the "memory device" memory so
> the rest of the API can still be used;
> - mm_iommu_is_devmem() to know if a physical address is one of these
> new regions, which we must avoid unpinning.
>
> This adds @mm to tce_page_is_contained() and iommu_tce_xchg() to test
> if the memory is device memory, to avoid pfn_to_page().
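
Just to make sure I understand the intended use of the new API: a
caller (the VFIO driver eventually, I assume) would do roughly the
below?  Untested sketch only - preregister_devmem() is a name I made
up for illustration, it's not something in this patch:

	#include <linux/mm_types.h>	/* struct mm_struct */
	#include <asm/mmu_context.h>	/* mm_iommu_newdev(), mm_iommu_is_devmem() */

	static long preregister_devmem(struct mm_struct *mm, unsigned long ua,
			unsigned long entries, unsigned long dev_hpa)
	{
		struct mm_iommu_table_group_mem_t *mem;
		long ret;

		/*
		 * There are no page structs behind dev_hpa, so no gup(),
		 * no pinning and no locked_vm accounting; this only
		 * records the ua -> dev_hpa translation.
		 */
		ret = mm_iommu_newdev(mm, ua, entries, dev_hpa, &mem);
		if (ret)
			return ret;

		/*
		 * From then on, anywhere a host physical address from a
		 * TCE would be turned into a page struct, we ask first
		 * whether it is device memory, e.g.:
		 *
		 *	if (!mm_iommu_is_devmem(mm, hpa, shift))
		 *		SetPageDirty(pfn_to_page(hpa >> PAGE_SHIFT));
		 */
		return 0;
	}

That matches what the iommu_tce_xchg() and tce_page_is_contained()
hunks below do, as far as I can see.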
>
> Signed-off-by: Alexey Kardashevskiy
> ---
>  arch/powerpc/include/asm/iommu.h       |  5 +-
>  arch/powerpc/include/asm/mmu_context.h |  5 ++
>  arch/powerpc/kernel/iommu.c            |  9 ++-
>  arch/powerpc/kvm/book3s_64_vio.c       | 18 +++---
>  arch/powerpc/mm/mmu_context_iommu.c    | 83 +++++++++++++++++++++++---
>  drivers/vfio/vfio_iommu_spapr_tce.c    | 28 +++++----
>  6 files changed, 116 insertions(+), 32 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 35db0cb..a8aeac0 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -218,8 +218,9 @@ extern void iommu_register_group(struct iommu_table_group *table_group,
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
> -extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction);
> +extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry, unsigned long *hpa,
> +		enum dma_data_direction *direction);
>  #else
>  static inline void iommu_register_group(struct iommu_table_group *table_group,
>  		int pci_domain_number,
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 2d6b00d..f0f9f3d 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -24,6 +24,9 @@ extern bool mm_iommu_preregistered(struct mm_struct *mm);
>  extern long mm_iommu_new(struct mm_struct *mm,
>  		unsigned long ua, unsigned long entries,
>  		struct mm_iommu_table_group_mem_t **pmem);
> +extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
> +		struct mm_iommu_table_group_mem_t **pmem);
>  extern long mm_iommu_put(struct mm_struct *mm,
>  		struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_init(struct mm_struct *mm);
> @@ -39,6 +42,8 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
>  extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
> +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int pageshift);
>  extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
>  #endif
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index f0dc680..8ccfdd9 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -47,6 +47,7 @@
>  #include
>  #include
>  #include
> +#include
>
>  #define DBG(...)
>
> @@ -993,15 +994,17 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
>  }
>  EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
>
> -long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction)
> +long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry, unsigned long *hpa,
> +		enum dma_data_direction *direction)
>  {
>  	long ret;
>
>  	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
>
>  	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> -			(*direction == DMA_BIDIRECTIONAL)))
> +			(*direction == DMA_BIDIRECTIONAL)) &&
> +			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift))
>  		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));

What about the equivalent real mode paths?  I guess they won't ever
be called for this case, since they're only used on POWER8.  However
some checks or WARN_ON() or something to make that clear would be
nice.
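Something like the below in the real mode path(s) in
book3s_64_vio_hv.c, say.  This is an untested sketch of the idea
only - in particular I haven't checked whether mm_iommu_is_devmem()
is actually safe to call in real mode, which it would need to be:

	/*
	 * Sketch: device memory should never reach the real mode
	 * handlers, since those are only used on POWER8 and this new
	 * memory type is POWER9-only.  WARN_ON_ONCE_RM() is the
	 * real-mode-safe WARN_ON() variant already defined in
	 * book3s_64_vio_hv.c; fall back to the virtual mode handler
	 * if it ever fires.
	 */
	if (WARN_ON_ONCE_RM(mm_iommu_is_devmem(kvm->mm, hpa,
			tbl->it_page_shift)))
		return H_TOO_HARD;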
>  	/* if (unlikely(ret))
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 62a8d03..532ab797 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -397,12 +397,13 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
>  	return H_SUCCESS;
>  }
>
> -static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
> +static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry)
>  {
>  	unsigned long hpa = 0;
>  	enum dma_data_direction dir = DMA_NONE;
>
> -	iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +	iommu_tce_xchg(mm, tbl, entry, &hpa, &dir);
>  }
>
>  static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
> @@ -433,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
>  	unsigned long hpa = 0;
>  	long ret;
>
> -	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
> +	if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir)))
>  		return H_TOO_HARD;
>
>  	if (dir == DMA_NONE)
> @@ -441,7 +442,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
>
>  	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
>  	if (ret != H_SUCCESS)
> -		iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +		iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
>
>  	return ret;
>  }
> @@ -487,7 +488,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
>  	if (mm_iommu_mapped_inc(mem))
>  		return H_TOO_HARD;
>
> -	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +	ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
>  	if (WARN_ON_ONCE(ret)) {
>  		mm_iommu_mapped_dec(mem);
>  		return H_TOO_HARD;
> @@ -566,7 +567,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  				entry, ua, dir);
>
>  		if (ret != H_SUCCESS) {
> -			kvmppc_clear_tce(stit->tbl, entry);
> +			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
>  			goto unlock_exit;
>  		}
>  	}
> @@ -655,7 +656,8 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  					iommu_tce_direction(tce));
>
>  			if (ret != H_SUCCESS) {
> -				kvmppc_clear_tce(stit->tbl, entry);
> +				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
> +						entry);
>  				goto unlock_exit;
>  			}
>  		}
> @@ -704,7 +706,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
>  			return ret;
>
>  		WARN_ON_ONCE(1);
> -		kvmppc_clear_tce(stit->tbl, entry);
> +		kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
>  	}
>  }
>
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
> index 580d89e..62fe5fe 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -47,6 +47,8 @@ struct mm_iommu_table_group_mem_t {
>  		struct page **hpages;	/* vmalloc'ed */
>  		phys_addr_t *hpas;
>  	};
> +#define MM_IOMMU_TABLE_INVALID_HPA	((uint64_t)-1)
> +	u64 dev_hpa;		/* Device memory base address */
>  };
>
>  static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
> @@ -89,7 +91,8 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
>
> -long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
>  		struct mm_iommu_table_group_mem_t **pmem)
>  {
>  	struct mm_iommu_table_group_mem_t *mem;
> @@ -112,11 +115,13 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>
>  	}
>
> -	ret = mm_iommu_adjust_locked_vm(mm, entries, true);
> -	if (ret)
> -		goto unlock_exit;
> +	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
> +		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
> +		if (ret)
> +			goto unlock_exit;
>
> -	locked_entries = entries;
> +		locked_entries = entries;
> +	}
>
>  	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>  	if (!mem) {
> @@ -124,6 +129,13 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		goto unlock_exit;
>  	}
>
> +	if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
> +		mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
> +		mem->dev_hpa = dev_hpa;
> +		goto good_exit;
> +	}
> +	mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
> +
>  	/*
>  	 * For a starting point for a maximum page size calculation
>  	 * we use @ua and @entries natural alignment to allow IOMMU pages
> @@ -180,6 +192,7 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>
>  	}
>
> +good_exit:
>  	atomic64_set(&mem->mapped, 1);
>  	mem->used = 1;
>  	mem->ua = ua;
> @@ -196,13 +209,31 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>
>  	return ret;
>  }
> +
> +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> +		struct mm_iommu_table_group_mem_t **pmem)
> +{
> +	return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
> +			pmem);
> +}
>  EXPORT_SYMBOL_GPL(mm_iommu_new);
>
> +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
> +		struct mm_iommu_table_group_mem_t **pmem)
> +{
> +	return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
> +}
> +EXPORT_SYMBOL_GPL(mm_iommu_newdev);
> +
>  static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>  {
>  	long i;
>  	struct page *page = NULL;
>
> +	if (!mem->hpas)
> +		return;
> +
>  	for (i = 0; i < mem->entries; ++i) {
>  		if (!mem->hpas[i])
>  			continue;
> @@ -244,6 +275,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
>  long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
>  {
>  	long ret = 0;
> +	unsigned long entries, dev_hpa;
>
>  	mutex_lock(&mem_list_mutex);
>
> @@ -265,9 +297,12 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
>  	}
>
>  	/* @mapped became 0 so now mappings are disabled, release the region */
> +	entries = mem->entries;
> +	dev_hpa = mem->dev_hpa;
>  	mm_iommu_release(mem);
>
> -	mm_iommu_adjust_locked_vm(mm, mem->entries, false);
> +	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
> +		mm_iommu_adjust_locked_vm(mm, entries, false);
>
>  unlock_exit:
>  	mutex_unlock(&mem_list_mutex);
> @@ -337,7 +372,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> -	u64 *va = &mem->hpas[entry];
> +	u64 *va;
>
>  	if (entry >= mem->entries)
>  		return -EFAULT;
> @@ -345,6 +380,12 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  	if (pageshift > mem->pageshift)
>  		return -EFAULT;
>
> +	if (!mem->hpas) {
> +		*hpa = mem->dev_hpa + (ua - mem->ua);
> +		return 0;
> +	}
> +
> +	va = &mem->hpas[entry];
>  	*hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
>
>  	return 0;
> @@ -355,7 +396,6 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> -	void *va = &mem->hpas[entry];
>  	unsigned long *pa;
>
>  	if (entry >= mem->entries)
> @@ -364,7 +404,12 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  	if (pageshift > mem->pageshift)
>  		return -EFAULT;
>
> -	pa = (void *) vmalloc_to_phys(va);
> +	if (!mem->hpas) {
> +		*hpa = mem->dev_hpa + (ua - mem->ua);
> +		return 0;
> +	}
> +
> +	pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
>  	if (!pa)
>  		return -EFAULT;
>
> @@ -394,6 +439,26 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
>  	*pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
>  }
>
> +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int pageshift)
> +{
> +	struct mm_iommu_table_group_mem_t *mem;
> +	const unsigned long pagesize = 1UL << pageshift;
> +
> +	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
> +		if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
> +			continue;
> +
> +		if ((mem->dev_hpa <= hpa) &&
> +				(hpa + pagesize <= mem->dev_hpa +
> +				 (mem->entries << PAGE_SHIFT)))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
> +
>  long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
>  {
>  	if (atomic64_inc_not_zero(&mem->mapped))
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 56db071..ed89137 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -222,8 +222,15 @@ static long tce_iommu_register_pages(struct tce_container *container,
>  	return ret;
>  }
>
> -static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> +static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int page_shift)
>  {
> +	struct page *page;
> +
> +	if (mm_iommu_is_devmem(mm, hpa, page_shift))
> +		return true;
> +
> +	page = pfn_to_page(hpa >> PAGE_SHIFT);
>  	/*
>  	 * Check that the TCE table granularity is not bigger than the size of
>  	 * a page we just found. Otherwise the hardware can get access to
> @@ -499,7 +506,8 @@ static int tce_iommu_clear(struct tce_container *container,
>
>  		direction = DMA_NONE;
>  		oldhpa = 0;
> -		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
> +				&direction);
>  		if (ret)
>  			continue;
>
> @@ -537,7 +545,6 @@ static long tce_iommu_build(struct tce_container *container,
>  		enum dma_data_direction direction)
>  {
>  	long i, ret = 0;
> -	struct page *page;
>  	unsigned long hpa;
>  	enum dma_data_direction dirtmp;
>
> @@ -548,15 +555,16 @@ static long tce_iommu_build(struct tce_container *container,
>  		if (ret)
>  			break;
>
> -		page = pfn_to_page(hpa >> PAGE_SHIFT);
> -		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> +		if (!tce_page_is_contained(container->mm, hpa,
> +				tbl->it_page_shift)) {
>  			ret = -EPERM;
>  			break;
>  		}
>
>  		hpa |= offset;
>  		dirtmp = direction;
> -		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
> +				&dirtmp);
>  		if (ret) {
>  			tce_iommu_unuse_page(container, hpa);
>  			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
> @@ -583,7 +591,6 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		enum dma_data_direction direction)
>  {
>  	long i, ret = 0;
> -	struct page *page;
>  	unsigned long hpa;
>  	enum dma_data_direction dirtmp;
>
> @@ -596,8 +603,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		if (ret)
>  			break;
>
> -		page = pfn_to_page(hpa >> PAGE_SHIFT);
> -		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> +		if (!tce_page_is_contained(container->mm, hpa,
> +				tbl->it_page_shift)) {
>  			ret = -EPERM;
>  			break;
>  		}
> @@ -610,7 +617,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		if (mm_iommu_mapped_inc(mem))
>  			break;
>
> -		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
> +				&dirtmp);
>  		if (ret) {
>  			/* dirtmp cannot be DMA_NONE here */
>  			tce_iommu_unuse_page_v2(container, tbl, entry + i);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson