BEGIN:VCALENDAR METHOD:REQUEST PRODID:Microsoft Exchange Server 2010 VERSION:2.0 BEGIN:VTIMEZONE TZID:New Zealand Standard Time BEGIN:STANDARD DTSTART:16010101T030000 TZOFFSETFROM:+1300 TZOFFSETTO:+1200 RRULE:FREQ=YEARLY;INTERVAL=1;BYDAY=1SU;BYMONTH=4 END:STANDARD BEGIN:DAYLIGHT DTSTART:16010101T020000 TZOFFSETFROM:+1200 TZOFFSETTO:+1300 RRULE:FREQ=YEARLY;INTERVAL=1;BYDAY=-1SU;BYMONTH=9 END:DAYLIGHT END:VTIMEZONE BEGIN:VEVENT ORGANIZER;CN=Song Bao Hua (Barry Song):MAILTO:song.bao.hua@hisilicon.com ATTENDEE;ROLE=REQ-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=hch@lst.de :MAILTO:hch@lst.de ATTENDEE;ROLE=REQ-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=m.szyprows ki@samsung.com:MAILTO:m.szyprowski@samsung.com ATTENDEE;ROLE=REQ-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=robin.murp hy@arm.com:MAILTO:robin.murphy@arm.com ATTENDEE;ROLE=REQ-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=will@kerne l.org:MAILTO:will@kernel.org ATTENDEE;ROLE=OPT-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=linux-arm- kernel@lists.infradead.org:MAILTO:linux-arm-kernel@lists.infradead.org ATTENDEE;ROLE=OPT-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=iommu@list s.linux-foundation.org:MAILTO:iommu@lists.linux-foundation.org ATTENDEE;ROLE=OPT-PARTICIPANT;PARTSTAT=NEEDS-ACTION;RSVP=TRUE;CN=Linuxarm:M AILTO:linuxarm@huawei.com DESCRIPTION;LANGUAGE=en-US:> From: Song Bao Hua (Barry Song)\n> Sent: Monda y\, June 1\, 2020 11:32 PM\n> To: hch@lst.de\; m.szyprowski@samsung.com\; robin.murphy@arm.com\; \n> will@kernel.org\n> Cc: linux-arm-kernel@lists.i nfradead.org\; \n> iommu@lists.linux-foundation.org\; Linuxarm \; Song \n> Bao Hua (Barry Song) \n> Subject: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in \n> local numa node\n> \n> \n> dmam_alloc_coherent() will usually allocate mem ory from the default CMA.\n> For\n> a common arm64 defconfig without reser ved memory in device tree\, there \n> is only one CMA close to address 0.\ n> dma_alloc_contiguous() will allocate memory without any idea of NUMA \ n> and smmu has no customized per-numa cma_area.\n> struct page *dma_alloc _contiguous(struct device *dev\, size_t size\, \n> gfp_t gfp) {\n> size_t count = size >> PAGE_SHIFT\;\n> struct page *page = NULL\; \n> struct cma *cma = NULL\;\n> \n> if (dev && dev->cma_ar ea)\n> cma = dev->cma_area\;\n> else if (count > 1 )\n> cma = dma_contiguous_default_area\;\n> \n> ...\n> return page\;\n> }\n> \n> if there are N numa nodes\, N-1 nodes will put command/evt queues etc \n> in a remote node the default CMA belongs t o\,probably node 0. Tests \n> show\, after sending CMD_SYNC in an empty co mmand queue\, on Node2\, \n> without this patch\, it takes 550ns to wait f or the completion of \n> CMD_SYNC\; with this patch\, it takes 250ns to wa it for the completion \n> of CMD_SYNC.\n> \n\nSorry for missing the RFC in the subject.\nFor the tested platform\, hardware will help the sync betwe en cpu and smmu. So there is no sync operations. So please consider this p atch as a RFC. This is only for the concept.\n\nThe other option to fix th is is creating a per-numa CMA area for smmu and assigning the per-numa cma _area to SMMU.\nMaybe cma_declare_contiguous_nid() used by mm/hugetlb.c ca n be used by AARCH64/SMMU.\n\nOr we can completely change CMA to create de fault per-numa CMA like this:\n\n struct page *dma_alloc_contiguous(struct device *dev\, size_t size\, gfp_t gfp) {\n size_t count = size > > PAGE_SHIFT\;\n struct page *page = NULL\;\n struct cma * cma = NULL\;\n + int nid = dev_to_node(dev)\;\n \n if (dev && dev->cma_area)\n cma = dev->cma_area\;\n else i f (count > 1)\n - cma = dma_contiguous_default_area\;\n + cma = dma_contiguous_default_areas[nid]\;\n \n ...\n return page\;\n}\n\nThen there is no necessity to assign per-numa cma_are a to smmu->dev.cma_area.\n\n-barry\n\n> Signed-off-by: Barry Song \n> ---\n> drivers/iommu/arm-smmu-v3.c | 63 \n> +++++ +++++++++++++++++++++++---------\n> 1 file changed\, 48 insertions(+)\, 1 5 deletions(-)\n> \n> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/i ommu/arm-smmu-v3.c \n> index 82508730feb7..58295423e1d7 100644\n> --- a/dr ivers/iommu/arm-smmu-v3.c\n> +++ b/drivers/iommu/arm-smmu-v3.c\n> @@ -3157 \,21 +3157\,23 @@ static int arm_smmu_init_one_queue(struct \n> arm_smmu_d evice *smmu\,\n> size_t dwords\, const char *name) {\n> size_t qsz\;\n> + struct page *page\;\n> \n> - do {\n> - qsz = ((1 << q->llq.max _n_shift) * dwords) << 3\;\n> - q->base = dmam_alloc_coherent(smmu->dev\, qsz\, &q->base_dma\,\n> - GFP_KERNEL)\;\n> - if (q->base || qs z < PAGE_SIZE)\n> - break\;\n> -\n> - q->llq.max_n_shift--\;\n> - } whi le (1)\;\n> + qsz = ((1 << q->llq.max_n_shift) * dwords) << 3\;\n> + page = alloc_pages_node(dev_to_node(smmu->dev)\, GFP_KERNEL\,\n> + get_order (qsz))\;\n> + if (!page) {\n> + dev_err(smmu->dev\,\n> + "failed to al locate queue (0x%zx bytes) for %s\\n"\,\n> + qsz\, name)\;\n> + return -ENOMEM\;\n> + }\n> \n> - if (!q->base) {\n> + q->base = page_address(pag e)\;\n> + q->base_dma = dma_map_single(smmu->dev\, q->base\, qsz\,\n> DMA_ BIDIRECTIONAL)\;\n> + if (dma_mapping_error(smmu->dev\, q->base_dma)) {\n> dev_err(smmu->dev\,\n> - "failed to allocate queue (0x%zx bytes) for %s\\n"\,\n> - qsz\, name)\;\n> + "failed to dma map for %s\\n"\, nam e)\;\n> return -ENOMEM\;\n> }\n> \n> @@ -3192\,6 +3194\,18 @@ static int arm_smmu_init_one_queue(struct \n> arm_smmu_device *smmu\,\n> return 0\;\n> }\n> \n> +static int arm_smmu_deinit_one_queue(struct arm_smmu_de vice *smmu\,\n> + struct arm_smmu_queue *q\,\n> + size_t dword s)\n> +{\n> + size_t qsz = ((1 << q->llq.max_n_shift) * dwords) << 3\;\n> +\n> + dma_unmap_single(smmu->dev\, q->base_dma\, qsz\,\n> DMA_BIDIRECTION AL)\;\n> + free_page((unsigned long)q->base)\;\n> +\n> + return 0\;\n> +}\ n> +\n> static void arm_smmu_cmdq_free_bitmap(void *data) {\n> unsigne d long *bitmap = data\;\n> @@ -3233\,22 +3247\,40 @@ static int arm_smmu_i nit_queues(struct \n> arm_smmu_device *smmu)\n> \n> ret = arm_smmu_cmdq_ init(smmu)\;\n> if (ret)\n> - return ret\;\n> + goto deinit_cmdq\;\n> \n> /* evtq */\n> ret = arm_smmu_init_one_queue(smmu\, &smmu->evtq.q\, \n> ARM_SMMU_EVTQ_PROD\,\n> ARM_SMMU_EVTQ_CONS\, EVTQ_ENT_DWOR DS\,\n> "evtq")\;\n> if (ret)\n> - return ret\;\n> + goto d einit_cmdq\;\n> \n> /* priq */\n> if (!(smmu->features & ARM_SMMU_FEAT _PRI))\n> return 0\;\n> \n> - return arm_smmu_init_one_queue(smmu\, &sm mu->priq.q\,\n> ARM_SMMU_PRIQ_PROD\,\n> + ret = arm_smmu_init_one_queue(sm mu\, &smmu->priq.q\,\n> ARM_SMMU_PRIQ_PROD\,\n> ARM_SMMU_PRIQ_ CONS\, PRIQ_ENT_DWORDS\,\n> "priq")\;\n> + if (ret)\n> + goto deinit_evtq\;\n> +\n> + return 0\;\n> +\n> +deinit_evtq:\n> + arm_smmu_de init_one_queue(smmu\, &smmu->evtq.q\,\n> EVTQ_ENT_DWORDS)\;\n> +deinit_cmd q:\n> + arm_smmu_deinit_one_queue(smmu\, &smmu->cmdq.q\,\n> CMDQ_ENT_DWORD S)\;\n> + return ret\;\n> +}\n> +\n> +static void arm_smmu_deinit_queues(s truct arm_smmu_device *smmu) {\n> + arm_smmu_deinit_one_queue(smmu\, &smmu ->cmdq.q\,\n> CMDQ_ENT_DWORDS)\;\n> + arm_smmu_deinit_one_queue(smmu\, &sm mu->evtq.q\,\n> EVTQ_ENT_DWORDS)\;\n> + if (smmu->features & ARM_SMMU_FEAT _PRI)\n> + arm_smmu_deinit_one_queue(smmu\, &smmu->priq.q\,\n> PRIQ_ENT_D WORDS)\;\n> }\n> \n> static int arm_smmu_init_l1_strtab(struct arm_smmu_ device *smmu) @@ \n> -4121\,6 +4153\,7 @@ static int arm_smmu_device_remov e(struct \n> platform_device *pdev)\n> arm_smmu_set_bus_ops(NULL)\;\n> iommu_device_unregister(&smmu->iommu)\;\n> iommu_device_sysfs_remove(&s mmu->iommu)\;\n> + arm_smmu_deinit_queues(smmu)\;\n> arm_smmu_device_dis able(smmu)\;\n> \n> return 0\;\n> --\n> 2.23.0\n> \n\n SUMMARY;LANGUAGE=en-US:[PATCH] iommu/arm-smmu-v3: allocate the memory of qu eues in local numa node DTSTART;TZID=New Zealand Standard Time:20200602T000000 DTEND;TZID=New Zealand Standard Time:20200602T003000 UID:040000008200E00074C5B7101A82E0080000000020F379756F38D601000000000000000 010000000A76210A2130E604D80BA64E7C92F495B CLASS:PUBLIC PRIORITY:5 DTSTAMP:20200601T115041Z TRANSP:OPAQUE STATUS:CONFIRMED SEQUENCE:0 LOCATION;LANGUAGE=en-US: X-MICROSOFT-CDO-APPT-SEQUENCE:0 X-MICROSOFT-CDO-OWNERAPPTID:46229476 X-MICROSOFT-CDO-BUSYSTATUS:TENTATIVE X-MICROSOFT-CDO-INTENDEDSTATUS:BUSY X-MICROSOFT-CDO-ALLDAYEVENT:FALSE X-MICROSOFT-CDO-IMPORTANCE:1 X-MICROSOFT-CDO-INSTTYPE:0 X-MICROSOFT-DISALLOW-COUNTER:FALSE BEGIN:VALARM ACTION:DISPLAY DESCRIPTION:REMINDER TRIGGER;RELATED=START:-PT15M END:VALARM END:VEVENT END:VCALENDAR