iommu.lists.linux-foundation.org archive mirror
* [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
@ 2020-06-01 11:51 Song Bao Hua (Barry Song)
  0 siblings, 0 replies; 4+ messages in thread
From: Song Bao Hua (Barry Song) @ 2020-06-01 11:51 UTC (permalink / raw)
  To: hch, m.szyprowski, robin.murphy, will; +Cc: iommu, Linuxarm, linux-arm-kernel



> From: Song Bao Hua (Barry Song)
> Sent: Monday, June 1, 2020 11:32 PM
> To: hch@lst.de; m.szyprowski@samsung.com; robin.murphy@arm.com; will@kernel.org
> Cc: linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org; Linuxarm <linuxarm@huawei.com>; Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Subject: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
> 
> 
> dmam_alloc_coherent() will usually allocate memory from the default CMA.
> For a common arm64 defconfig without reserved memory in the device tree,
> there is only one CMA, close to address 0.
> dma_alloc_contiguous() allocates memory with no awareness of NUMA, and
> the SMMU has no customized per-numa cma_area:
> struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> {
>         size_t count = size >> PAGE_SHIFT;
>         struct page *page = NULL;
>         struct cma *cma = NULL;
> 
>         if (dev && dev->cma_area)
>                 cma = dev->cma_area;
>         else if (count > 1)
>                 cma = dma_contiguous_default_area;
> 
> 	...
>         return page;
> }
> 
> If there are N NUMA nodes, devices on N-1 of them will have their
> command/event queues placed in the remote node that the default CMA
> belongs to, probably node 0. Tests show that, after sending CMD_SYNC on
> an empty command queue on node 2, waiting for its completion takes 550ns
> without this patch and 250ns with it.
> 

Sorry for missing the RFC tag in the subject.
On the tested platform, the hardware maintains coherency between the CPU and the SMMU, so no explicit sync operations are needed. Please consider this patch an RFC; it is only meant to demonstrate the concept.

The other option for fixing this is to create a per-NUMA CMA area for the SMMU and assign that per-NUMA cma_area to the SMMU device.
Maybe cma_declare_contiguous_nid(), which mm/hugetlb.c uses, could also be used by arm64/SMMU, as in the sketch below.
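A rough sketch of that option (untested; the 16MB size, the "smmu-cma" name, the smmu_cma[] array and the two function names are only illustrative, and the reservation has to run early enough that memblock is still available):

#include <linux/cma.h>
#include <linux/device.h>
#include <linux/init.h>
#include <linux/nodemask.h>
#include <linux/numa.h>
#include <linux/printk.h>
#include <linux/sizes.h>

static struct cma *smmu_cma[MAX_NUMNODES];

/* called from early arch setup, while memblock allocations still work */
void __init smmu_pernuma_cma_reserve(void)
{
	int nid;

	for_each_online_node(nid) {
		/* 16MB per node is an arbitrary example size */
		if (cma_declare_contiguous_nid(0, SZ_16M, 0, 0, 0, false,
					       "smmu-cma", &smmu_cma[nid],
					       nid))
			pr_warn("smmu: no per-numa CMA on node %d\n", nid);
	}
}

/* at probe time, steer the SMMU device to its node-local area */
void smmu_assign_local_cma(struct device *dev)
{
	int nid = dev_to_node(dev);

	if (nid != NUMA_NO_NODE && smmu_cma[nid])
		dev->cma_area = smmu_cma[nid];	/* needs CONFIG_DMA_CMA */
}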

Or we could change CMA itself to create default per-NUMA areas, like this:

 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 {
         size_t count = size >> PAGE_SHIFT;
         struct page *page = NULL;
         struct cma *cma = NULL;
+        int nid = dev_to_node(dev);
 
         if (dev && dev->cma_area)
                 cma = dev->cma_area;
         else if (count > 1)
-                cma = dma_contiguous_default_area;
+                cma = dma_contiguous_default_areas[nid];
 
 	...
         return page;
 }

Then there would be no need to assign a per-NUMA cma_area to smmu->dev->cma_area at all.
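Note that dev_to_node() returns NUMA_NO_NODE (-1) for a device without firmware-assigned affinity, so the array index above would need a guard. A possible helper, with dma_contiguous_default_areas[] being the hypothetical per-node array from the sketch:

/*
 * Hypothetical helper for the sketch above;
 * dma_contiguous_default_areas[] does not exist in today's kernel.
 */
static struct cma *dma_contiguous_default_area_for(struct device *dev)
{
	int nid = dev ? dev_to_node(dev) : NUMA_NO_NODE;

	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();	/* fall back to the current node */

	return dma_contiguous_default_areas[nid];
}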

-barry

> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
>  1 file changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 82508730feb7..58295423e1d7 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -3157,21 +3157,23 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
>  				   size_t dwords, const char *name)
>  {
>  	size_t qsz;
> +	struct page *page;
> 
> -	do {
> -		qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> -		q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
> -					      GFP_KERNEL);
> -		if (q->base || qsz < PAGE_SIZE)
> -			break;
> -
> -		q->llq.max_n_shift--;
> -	} while (1);
> +	qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> +	page = alloc_pages_node(dev_to_node(smmu->dev), GFP_KERNEL,
> +				get_order(qsz));
> +	if (!page) {
> +		dev_err(smmu->dev,
> +				"failed to allocate queue (0x%zx bytes) for %s\n",
> +				qsz, name);
> +		return -ENOMEM;
> +	}
> 
> -	if (!q->base) {
> +	q->base = page_address(page);
> +	q->base_dma = dma_map_single(smmu->dev, q->base, qsz, DMA_BIDIRECTIONAL);
> +	if (dma_mapping_error(smmu->dev, q->base_dma)) {
>  		dev_err(smmu->dev,
> -			"failed to allocate queue (0x%zx bytes) for %s\n",
> -			qsz, name);
> +				"failed to dma map for %s\n", name);
>  		return -ENOMEM;
>  	}
> 
> @@ -3192,6 +3194,18 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
>  	return 0;
>  }
> 
> +static int arm_smmu_deinit_one_queue(struct arm_smmu_device *smmu,
> +				   struct arm_smmu_queue *q,
> +				   size_t dwords)
> +{
> +	size_t qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
> +
> +	dma_unmap_single(smmu->dev, q->base_dma, qsz, DMA_BIDIRECTIONAL);
> +	free_pages((unsigned long)q->base, get_order(qsz));
> +
> +	return 0;
> +}
> +
>  static void arm_smmu_cmdq_free_bitmap(void *data)
>  {
>  	unsigned long *bitmap = data;
> @@ -3233,22 +3247,40 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
> 
>  	ret = arm_smmu_cmdq_init(smmu);
>  	if (ret)
> -		return ret;
> +		goto deinit_cmdq;
> 
>  	/* evtq */
>  	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
>  				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
>  				      "evtq");
>  	if (ret)
> -		return ret;
> +		goto deinit_cmdq;
> 
>  	/* priq */
>  	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
>  		return 0;
> 
> -	return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
> +	ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
>  				       ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS,
>  				       "priq");
> +	if (ret)
> +		goto deinit_evtq;
> +
> +	return 0;
> +
> +deinit_evtq:
> +	arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
> +deinit_cmdq:
> +	arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
> +	return ret;
> +}
> +
> +static void arm_smmu_deinit_queues(struct arm_smmu_device *smmu)
> +{
> +	arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
> +	arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
> +	if (smmu->features & ARM_SMMU_FEAT_PRI)
> +		arm_smmu_deinit_one_queue(smmu, &smmu->priq.q, PRIQ_ENT_DWORDS);
>  }
> 
>  static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
> @@ -4121,6 +4153,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
>  	arm_smmu_set_bus_ops(NULL);
>  	iommu_device_unregister(&smmu->iommu);
>  	iommu_device_sysfs_remove(&smmu->iommu);
> +	arm_smmu_deinit_queues(smmu);
>  	arm_smmu_device_disable(smmu);
> 
>  	return 0;
> --
> 2.23.0
> 



* RE: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
  2020-07-03 16:21 ` Will Deacon
@ 2020-07-05 10:09   ` Song Bao Hua (Barry Song)
  0 siblings, 0 replies; 4+ messages in thread
From: Song Bao Hua (Barry Song) @ 2020-07-05 10:09 UTC (permalink / raw)
  To: Will Deacon; +Cc: Linuxarm, iommu, robin.murphy, hch, linux-arm-kernel



> -----Original Message-----
> From: Will Deacon [mailto:will@kernel.org]
> Sent: Saturday, July 4, 2020 4:22 AM
> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>
> Cc: hch@lst.de; m.szyprowski@samsung.com; robin.murphy@arm.com;
> linux-arm-kernel@lists.infradead.org; iommu@lists.linux-foundation.org;
> Linuxarm <linuxarm@huawei.com>
> Subject: Re: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in
> local numa node
> 
> On Mon, Jun 01, 2020 at 11:31:41PM +1200, Barry Song wrote:
> > dmam_alloc_coherent() will usually allocate memory from the default CMA.
> > For a common arm64 defconfig without reserved memory in the device tree,
> > there is only one CMA, close to address 0.
> > dma_alloc_contiguous() allocates memory with no awareness of NUMA, and
> > the SMMU has no customized per-numa cma_area:
> > struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> > {
> >         size_t count = size >> PAGE_SHIFT;
> >         struct page *page = NULL;
> >         struct cma *cma = NULL;
> >
> >         if (dev && dev->cma_area)
> >                 cma = dev->cma_area;
> >         else if (count > 1)
> >                 cma = dma_contiguous_default_area;
> >
> > 	...
> >         return page;
> > }
> >
> > If there are N NUMA nodes, devices on N-1 of them will have their
> > command/event queues placed in the remote node that the default CMA
> > belongs to, probably node 0. Tests show that, after sending CMD_SYNC on
> > an empty command queue on node 2, waiting for its completion takes 550ns
> > without this patch and 250ns with it.
> >
> > Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> > ---
> >  drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
> >  1 file changed, 48 insertions(+), 15 deletions(-)
> 
> I would prefer that the coherent DMA allocator learned about NUMA, rather
> than we bodge drivers to use the streaming API where it doesn't really
> make sense.
> 
> I see that you've posted other patches to do that (thanks!), so I'll
> disregard this series.

Thanks for taking a look, Will. I am indeed moving to the per-numa CMA patchset to replace this patch, so it is fine to ignore this one.


> 
> Cheers,
> 
> Will

Thanks
Barry


* Re: [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
  2020-06-01 11:31 Barry Song
@ 2020-07-03 16:21 ` Will Deacon
  2020-07-05 10:09   ` Song Bao Hua (Barry Song)
  0 siblings, 1 reply; 4+ messages in thread
From: Will Deacon @ 2020-07-03 16:21 UTC (permalink / raw)
  To: Barry Song; +Cc: linuxarm, iommu, robin.murphy, hch, linux-arm-kernel

On Mon, Jun 01, 2020 at 11:31:41PM +1200, Barry Song wrote:
> dmam_alloc_coherent() will usually allocate memory from the default CMA.
> For a common arm64 defconfig without reserved memory in the device tree,
> there is only one CMA, close to address 0.
> dma_alloc_contiguous() allocates memory with no awareness of NUMA, and
> the SMMU has no customized per-numa cma_area:
> struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> {
>         size_t count = size >> PAGE_SHIFT;
>         struct page *page = NULL;
>         struct cma *cma = NULL;
> 
>         if (dev && dev->cma_area)
>                 cma = dev->cma_area;
>         else if (count > 1)
>                 cma = dma_contiguous_default_area;
> 
> 	...
>         return page;
> }
> 
> If there are N NUMA nodes, devices on N-1 of them will have their
> command/event queues placed in the remote node that the default CMA
> belongs to, probably node 0. Tests show that, after sending CMD_SYNC on
> an empty command queue on node 2, waiting for its completion takes 550ns
> without this patch and 250ns with it.
> 
> Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
>  1 file changed, 48 insertions(+), 15 deletions(-)

I would prefer that the coherent DMA allocator learned about NUMA, rather
than we bodge drivers to use the streaming API where it doesn't really
make sense.

I see that you've posted other patches to do that (thanks!), so I'll
disregard this series.

Cheers,

Will

* [PATCH] iommu/arm-smmu-v3: allocate the memory of queues in local numa node
@ 2020-06-01 11:31 Barry Song
  2020-07-03 16:21 ` Will Deacon
  0 siblings, 1 reply; 4+ messages in thread
From: Barry Song @ 2020-06-01 11:31 UTC (permalink / raw)
  To: hch, m.szyprowski, robin.murphy, will; +Cc: iommu, linuxarm, linux-arm-kernel

dmam_alloc_coherent() will usually allocate memory from the default CMA.
For a common arm64 defconfig without reserved memory in the device tree,
there is only one CMA, close to address 0.
dma_alloc_contiguous() allocates memory with no awareness of NUMA, and
the SMMU has no customized per-numa cma_area:
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
        size_t count = size >> PAGE_SHIFT;
        struct page *page = NULL;
        struct cma *cma = NULL;

        if (dev && dev->cma_area)
                cma = dev->cma_area;
        else if (count > 1)
                cma = dma_contiguous_default_area;

	...
        return page;
}

If there are N NUMA nodes, devices on N-1 of them will have their
command/event queues placed in the remote node that the default CMA
belongs to, probably node 0. Tests show that, after sending CMD_SYNC on
an empty command queue on node 2, waiting for its completion takes 550ns
without this patch and 250ns with it.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 drivers/iommu/arm-smmu-v3.c | 63 ++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 82508730feb7..58295423e1d7 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3157,21 +3157,23 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 				   size_t dwords, const char *name)
 {
 	size_t qsz;
+	struct page *page;
 
-	do {
-		qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
-		q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma,
-					      GFP_KERNEL);
-		if (q->base || qsz < PAGE_SIZE)
-			break;
-
-		q->llq.max_n_shift--;
-	} while (1);
+	qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
+	page = alloc_pages_node(dev_to_node(smmu->dev), GFP_KERNEL,
+				get_order(qsz));
+	if (!page) {
+		dev_err(smmu->dev,
+				"failed to allocate queue (0x%zx bytes) for %s\n",
+				qsz, name);
+		return -ENOMEM;
+	}
 
-	if (!q->base) {
+	q->base = page_address(page);
+	q->base_dma = dma_map_single(smmu->dev, q->base, qsz, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(smmu->dev, q->base_dma)) {
 		dev_err(smmu->dev,
-			"failed to allocate queue (0x%zx bytes) for %s\n",
-			qsz, name);
+				"failed to dma map for %s\n", name);
 		return -ENOMEM;
 	}
 
@@ -3192,6 +3194,18 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 	return 0;
 }
 
+static int arm_smmu_deinit_one_queue(struct arm_smmu_device *smmu,
+				   struct arm_smmu_queue *q,
+				   size_t dwords)
+{
+	size_t qsz = ((1 << q->llq.max_n_shift) * dwords) << 3;
+
+	dma_unmap_single(smmu->dev, q->base_dma, qsz, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)q->base, get_order(qsz));
+
+	return 0;
+}
+
 static void arm_smmu_cmdq_free_bitmap(void *data)
 {
 	unsigned long *bitmap = data;
@@ -3233,22 +3247,40 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 
 	ret = arm_smmu_cmdq_init(smmu);
 	if (ret)
-		return ret;
+		goto deinit_cmdq;
 
 	/* evtq */
 	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
 				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
 				      "evtq");
 	if (ret)
-		return ret;
+		goto deinit_cmdq;
 
 	/* priq */
 	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
 		return 0;
 
-	return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+	ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
 				       ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS,
 				       "priq");
+	if (ret)
+		goto deinit_evtq;
+
+	return 0;
+
+deinit_evtq:
+	arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
+deinit_cmdq:
+	arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
+	return ret;
+}
+
+static void arm_smmu_deinit_queues(struct arm_smmu_device *smmu)
+{
+	arm_smmu_deinit_one_queue(smmu, &smmu->cmdq.q, CMDQ_ENT_DWORDS);
+	arm_smmu_deinit_one_queue(smmu, &smmu->evtq.q, EVTQ_ENT_DWORDS);
+	if (smmu->features & ARM_SMMU_FEAT_PRI)
+		arm_smmu_deinit_one_queue(smmu, &smmu->priq.q, PRIQ_ENT_DWORDS);
 }
 
 static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
@@ -4121,6 +4153,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev)
 	arm_smmu_set_bus_ops(NULL);
 	iommu_device_unregister(&smmu->iommu);
 	iommu_device_sysfs_remove(&smmu->iommu);
+	arm_smmu_deinit_queues(smmu);
 	arm_smmu_device_disable(smmu);
 
 	return 0;
-- 
2.23.0

