* [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
@ 2020-06-01 11:31 Barry Song
2020-07-03 16:21 ` Will Deacon
0 siblings, 1 reply; 3+ messages in thread
From: Barry Song @ 2020-06-01 11:31 UTC (permalink / raw)
To: hch, m.szyprowski, robin.murphy, will
Cc: Barry Song, iommu, linuxarm, linux-arm-kernel
dmam_alloc_coherent() will usually allocate memory from the default CMA. For
a common arm64 defconfig without reserved memory in device tree, there is only
one CMA close to address 0.
dma_alloc_contiguous() will allocate memory without any idea of NUMA and smmu
has no customized per-numa cma_area.
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
size_t count = size >> PAGE_SHIFT;
struct page *page = NULL;
struct cma *cma = NULL;
if (dev && dev->cma_area)
cma = dev->cma_area;
else if (count > 1)
cma = dma_contiguous_default_area;
...
return page;
}
if there are N numa nodes, N-1 nodes will put command/evt queues etc in a
remote node the default CMA belongs to, probably node 0. Tests show, after
sending CMD_SYNC in an empty command queue,
on Node2, without this patch, it takes 550ns to wait for the completion
of CMD_SYNC; with this patch, it takes 250ns to wait for the completion
of CMD_SYNC.
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
1 file changed, 48 insertions(+), 15 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 82508730feb7..58295423e1d7 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3157,21 +3157,23 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
size_t dwords, const char *name)
{
size_t qsz;
+ struct page *page;
- do {
- qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
- q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
- GFP_KERNEL);
- if (q->base || qsz < PAGE_SIZE)
- break;
-
- q->llq.max_n_shift--;
- } while (1);
+ qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
+ page = alloc_pages_node(dev_to_node(smmu->dev), GFP_KERNEL,
+ get_order(qsz));
+ if (!page) {
+ dev_err(smmu->dev,
+ "failed to allocate queue (0x%zx bytes) for %s\n",
+ qsz, name);
+ return -ENOMEM;
+ }
- if (!q->base) {
+ q->base = page_address(page);
+ q->base_dma = dma_map_single(smmu->dev, q->base, qsz, DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(smmu->dev, q->base_dma)) {
dev_err(smmu->dev,
- "failed to allocate queue (0x%zx bytes) for %s\n",
- qsz, name);
+ "failed to dma map for %s\n", name);
return -ENOMEM;
}
@@ -3192,6 +3194,18 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
return 0;
}
+static int arm_smmu_deinit_one_queue(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue *q,
+ size_t dwords)
+{
+ size_t qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
+
+ dma_unmap_single(smmu->dev, q->base_dma, qsz, DMA_BIDIRECTIONAL);
+ free_page((unsigned long)q->base);
+
+ return 0;
+}
+
static void arm_smmu_cmdq_free_bitmap(void *data)
{
unsigned long *bitmap = data;
@@ -3233,22 +3247,40 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
ret = arm_smmu_cmdq_init(smmu);
if (ret)
- return ret;
+ goto deinit_cmdq;
/* evtq */
ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
"evtq");
if (ret)
- return ret;
+ goto deinit_cmdq;
/* priq */
if (!(smmu->features & ARM_SMMU_FEAT_PRI))
return 0;
- return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+ ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS,
"priq");
+ if (ret)
+ goto deinit_evtq;
+
+ return 0;
+
+deinit_evtq:
+ arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
+deinit_cmdq:
+ arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
+ return ret;
+}
+
+static void arm_smmu_deinit_queues(struct arm_smmu_device *smmu)
+{
+ arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
+ arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
+ if (smmu->features & ARM_SMMU_FEAT_PRI)
+ arm_smmu_deinit_one_queue(smmu, &smmu->priq.q, PRIQ_ENT_DWORDS);
}
static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
@@ -4121,6 +4153,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
arm_smmu_set_bus_ops(NULL);
iommu_device_unregister(&smmu->iommu);
iommu_device_sysfs_remove(&smmu->iommu);
+ arm_smmu_deinit_queues(smmu);
arm_smmu_device_disable(smmu);
return 0;
--
2.23.0
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
2020-06-01 11:31 [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node Barry Song
@ 2020-07-03 16:21 ` Will Deacon
2020-07-05 10:09 ` Song Bao Hua (Barry Song)
0 siblings, 1 reply; 3+ messages in thread
From: Will Deacon @ 2020-07-03 16:21 UTC (permalink / raw)
To: Barry Song
Cc: linuxarm, iommu, robin.murphy, hch, linux-arm-kernel, m.szyprowski
On Mon, Jun 01, 2020 at 11:31:41PM +1200, Barry Song wrote:
> dmam_alloc_coherent() will usually allocate memory from the default CMA. For
> a common arm64 defconfig without reserved memory in device tree, there is only
> one CMA close to address 0.
> dma_alloc_contiguous() will allocate memory without any idea of NUMA and smmu
> has no customized per-numa cma_area.
> struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> {
> size_t count = size >> PAGE_SHIFT;
> struct page *page = NULL;
> struct cma *cma = NULL;
>
> if (dev && dev->cma_area)
> cma = dev->cma_area;
> else if (count > 1)
> cma = dma_contiguous_default_area;
>
> ...
> return page;
> }
>
> if there are N numa nodes, N-1 nodes will put command/evt queues etc in a
> remote node the default CMA belongs to, probably node 0. Tests show, after
> sending CMD_SYNC in an empty command queue,
> on Node2, without this patch, it takes 550ns to wait for the completion
> of CMD_SYNC; with this patch, it takes 250ns to wait for the completion
> of CMD_SYNC.
>
> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
> drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
> 1 file changed, 48 insertions(+), 15 deletions(-)
I would prefer that the coherent DMA allocator learned about NUMA, rather
than we bodge drivers to use the streaming API where it doesn't really
make sense.
I see that you've posted other patches to do that (thanks!), so I'll
disregard this series.
Cheers,
Will
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 3+ messages in thread
* RE: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
2020-07-03 16:21 ` Will Deacon
@ 2020-07-05 10:09 ` Song Bao Hua (Barry Song)
0 siblings, 0 replies; 3+ messages in thread
From: Song Bao Hua (Barry Song) @ 2020-07-05 10:09 UTC (permalink / raw)
To: Will Deacon
Cc: Linuxarm, iommu, robin.murphy, hch, linux-arm-kernel, m.szyprowski
> -----Original Message-----
> From: Will Deacon [mailto:will@kernel.org]
> Sent: Saturday, July 4, 2020 4:22 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: hch@lst.de; m.szyprowski@samsung.com; robin.murphy@arm.com;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org;
> Linuxarm <linuxarm@huawei.com>
> Subject: Re: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in
> local numa node
>
> On Mon, Jun 01, 2020 at 11:31:41PM +1200, Barry Song wrote:
> > dmam_alloc_coherent() will usually allocate memory from the default CMA.
> For
> > a common arm64 defconfig without reserved memory in device tree, there is
> only
> > one CMA close to address 0.
> > dma_alloc_contiguous() will allocate memory without any idea of NUMA
> and smmu
> > has no customized per-numa cma_area.
> > struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> > {
> > size_t count = size >> PAGE_SHIFT;
> > struct page *page = NULL;
> > struct cma *cma = NULL;
> >
> > if (dev && dev->cma_area)
> > cma = dev->cma_area;
> > else if (count > 1)
> > cma = dma_contiguous_default_area;
> >
> > ...
> > return page;
> > }
> >
> > if there are N numa nodes, N-1 nodes will put command/evt queues etc in a
> > remote node the default CMA belongs to, probably node 0. Tests show, after
> > sending CMD_SYNC in an empty command queue,
> > on Node2, without this patch, it takes 550ns to wait for the completion
> > of CMD_SYNC; with this patch, it takes 250ns to wait for the completion
> > of CMD_SYNC.
> >
> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> > ---
> > drivers/iommu/arm-smmu-v3.c | 63
> ++++++++++++++++++++++++++++---------
> > 1 file changed, 48 insertions(+), 15 deletions(-)
>
> I would prefer that the coherent DMA allocator learned about NUMA, rather
> than we bodge drivers to use the streaming API where it doesn't really
> make sense.
>
> I see that you've posted other patches to do that (thanks!), so I'll
> disregard this series.
Thanks for taking a look, Will. For sure I am using the per-numa cma patchset to
replace this patch. So it is ok to ignore this one.
>
> Cheers,
>
> Will
Thanks
Barry
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-07-05 10:11 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-01 11:31 [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node Barry Song
2020-07-03 16:21 ` Will Deacon
2020-07-05 10:09 ` Song Bao Hua (Barry Song)
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).