> From: Song Bao Hua (Barry Song)
> Sent: Monday, June 1, 2020 11:32 PM
> To: hch@lst.de; m.szyprowski@samsung.com; robin.murphy@arm.com;
> will@kernel.org
> Cc: linux-arm-kernel@lists.infradead.org;
> iommu@lists.linux-foundation.org; Linuxarm; Song Bao Hua (Barry Song)
> Subject: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in
> local numa node
>
> dmam_alloc_coherent() will usually allocate memory from the default
> CMA. For a common arm64 defconfig without reserved memory in the
> device tree, there is only one CMA, close to address 0.
> dma_alloc_contiguous() allocates memory without any idea of NUMA, and
> the SMMU has no customized per-numa cma_area:
>
> struct page *dma_alloc_contiguous(struct device *dev, size_t size,
>                                   gfp_t gfp)
> {
>         size_t count = size >> PAGE_SHIFT;
>         struct page *page = NULL;
>         struct cma *cma = NULL;
>
>         if (dev && dev->cma_area)
>                 cma = dev->cma_area;
>         else if (count > 1)
>                 cma = dma_contiguous_default_area;
>
>         ...
>         return page;
> }
>
> So if there are N numa nodes, devices on N-1 of them will put their
> command/evt queues etc. in a remote node that the default CMA belongs
> to, probably node 0. Tests show that, after sending CMD_SYNC in an
> empty command queue on node 2, waiting for the completion of CMD_SYNC
> takes 550ns without this patch and 250ns with it.

Sorry for missing the RFC in the subject. On the tested platform, the
hardware handles the sync between CPU and SMMU, so there are no sync
operations. Please consider this patch as an RFC; it is only for the
concept.

The other option to fix this is creating a per-numa CMA area for the
SMMU and assigning that per-numa cma_area to it. Maybe
cma_declare_contiguous_nid(), used by mm/hugetlb.c, can be used by
AArch64/SMMU. Or we could change CMA completely to create default
per-numa CMA areas, like this:

struct page *dma_alloc_contiguous(struct device *dev, size_t size,
                                  gfp_t gfp)
{
        size_t count = size >> PAGE_SHIFT;
        struct page *page = NULL;
        struct cma *cma = NULL;
+       int nid = dev_to_node(dev);

        if (dev && dev->cma_area)
                cma = dev->cma_area;
        else if (count > 1)
-               cma = dma_contiguous_default_area;
+               cma = dma_contiguous_default_areas[nid];

        ...
        return page;
}

Then there would be no need to assign a per-numa cma_area to
smmu->dev.cma_area.
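For the concept, here is a rough sketch of how those default per-node
areas could be reserved at boot. Everything in it is illustrative, not
part of the patch: the names dma_per_numa_cma_reserve() and
dma_contiguous_default_areas[] are made up, the 16MB per-node size is
arbitrary, and the reservation would have to run while memblock is
still up:

#include <linux/cma.h>
#include <linux/init.h>
#include <linux/nodemask.h>
#include <linux/printk.h>
#include <linux/sizes.h>

/* one default CMA area per node instead of a single global one */
static struct cma *dma_contiguous_default_areas[MAX_NUMNODES];

static void __init dma_per_numa_cma_reserve(void)
{
        int nid;

        for_each_online_node(nid) {
                int ret;

                /* let CMA pick the base, but pin the area to this node */
                ret = cma_declare_contiguous_nid(0, SZ_16M, 0, 0, 0, false,
                                "dma-numa",
                                &dma_contiguous_default_areas[nid], nid);
                if (ret)
                        pr_warn("no per-numa CMA on node %d (%d)\n",
                                nid, ret);
        }
}

A node whose reservation fails, or a device whose dev_to_node() is
NUMA_NO_NODE, would still need a fallback to the single global default
area in dma_alloc_contiguous().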
-barry

> Signed-off-by: Barry Song
> ---
>  drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
>  1 file changed, 48 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 82508730feb7..58295423e1d7 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -3157,21 +3157,23 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
>                                     size_t dwords, const char *name)
>  {
>         size_t qsz;
> +       struct page *page;
>
> -       do {
> -               qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> -               q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
> -                                             GFP_KERNEL);
> -               if (q->base || qsz < PAGE_SIZE)
> -                       break;
> -
> -               q->llq.max_n_shift--;
> -       } while (1);
> +       qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> +       page = alloc_pages_node(dev_to_node(smmu->dev), GFP_KERNEL,
> +                               get_order(qsz));
> +       if (!page) {
> +               dev_err(smmu->dev,
> +                       "failed to allocate queue (0x%zx bytes) for %s\n",
> +                       qsz, name);
> +               return -ENOMEM;
> +       }
>
> -       if (!q->base) {
> +       q->base = page_address(page);
> +       q->base_dma = dma_map_single(smmu->dev, q->base, qsz,
> +                                    DMA_BIDIRECTIONAL);
> +       if (dma_mapping_error(smmu->dev, q->base_dma)) {
>                 dev_err(smmu->dev,
> -                       "failed to allocate queue (0x%zx bytes) for %s\n",
> -                       qsz, name);
> +                       "failed to dma map for %s\n", name);
>                 return -ENOMEM;
>         }
>
> @@ -3192,6 +3194,18 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
>         return 0;
>  }
>
> +static int arm_smmu_deinit_one_queue(struct arm_smmu_device *smmu,
> +                                    struct arm_smmu_queue *q,
> +                                    size_t dwords)
> +{
> +       size_t qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> +
> +       dma_unmap_single(smmu->dev, q->base_dma, qsz, DMA_BIDIRECTIONAL);
> +       free_pages((unsigned long)q->base, get_order(qsz));
> +
> +       return 0;
> +}
> +
>  static void arm_smmu_cmdq_free_bitmap(void *data)
>  {
>         unsigned long *bitmap = data;
> @@ -3233,22 +3247,40 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
>
>         ret = arm_smmu_cmdq_init(smmu);
>         if (ret)
> -               return ret;
> +               goto deinit_cmdq;
>
>         /* evtq */
>         ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
>                                       ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
>                                       "evtq");
>         if (ret)
> -               return ret;
> +               goto deinit_cmdq;
>
>         /* priq */
>         if (!(smmu->features & ARM_SMMU_FEAT_PRI))
>                 return 0;
>
> -       return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
> +       ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
>                                       ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS,
>                                       "priq");
> +       if (ret)
> +               goto deinit_evtq;
> +
> +       return 0;
> +
> +deinit_evtq:
> +       arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
> +deinit_cmdq:
> +       arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
> +       return ret;
> +}
> +
> +static void arm_smmu_deinit_queues(struct arm_smmu_device *smmu)
> +{
> +       arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
> +       arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
> +       if (smmu->features & ARM_SMMU_FEAT_PRI)
> +               arm_smmu_deinit_one_queue(smmu, &smmu->priq.q, PRIQ_ENT_DWORDS);
>  }
>
>  static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
> @@ -4121,6 +4153,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
>         arm_smmu_set_bus_ops(NULL);
>         iommu_device_unregister(&smmu->iommu);
>         iommu_device_sysfs_remove(&smmu->iommu);
> +       arm_smmu_deinit_queues(smmu);
>         arm_smmu_device_disable(smmu);
>
>         return 0;
> --
> 2.23.0
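PS: as a sanity check of the locality, a small helper like the one
below could be called after each successful arm_smmu_init_one_queue().
It is not part of the patch, only an illustration, and assumes the
usual headers (<linux/mm.h>, <linux/numa.h>) are available in
arm-smmu-v3.c:

static void arm_smmu_check_queue_node(struct arm_smmu_device *smmu,
                                      struct arm_smmu_queue *q)
{
        /* node holding the queue memory vs. node the SMMU sits on */
        int mem_nid = page_to_nid(virt_to_page(q->base));
        int dev_nid = dev_to_node(smmu->dev);

        if (dev_nid != NUMA_NO_NODE && mem_nid != dev_nid)
                dev_warn(smmu->dev,
                         "queue memory on node %d, device on node %d\n",
                         mem_nid, dev_nid);
}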