All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/3] iommu: Attach device group to old domain in error path
@ 2023-02-03 17:36 Vasant Hegde
  2023-02-03 17:36 ` [PATCH v2 2/3] iommu/amd: Skip attach device domain is same as new domain Vasant Hegde
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Vasant Hegde @ 2023-02-03 17:36 UTC (permalink / raw)
  To: iommu, joro; +Cc: suravee.suthikulpanit, robin.murphy, will, Vasant Hegde

iommu_attach_group() attaches all devices in a group to domain and then
sets group domain (group->domain). Current code (__iommu_attach_group())
does not handle error path. This creates problem as devices to domain
attachment is in inconsistent state.

To recover from this situation, we need force attaching all devices back
to the old domain. This patch introduces `force` parameter to
__iommu_group_set_domain() so that we can attach devices back to old
domain.

Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
---
 drivers/iommu/iommu.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de91dd88705b..e58683cd3bf7 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
 static int __iommu_attach_group(struct iommu_domain *domain,
 				struct iommu_group *group);
 static int __iommu_group_set_domain(struct iommu_group *group,
-				    struct iommu_domain *new_domain);
+				    struct iommu_domain *new_domain,
+				    bool force);
 static int iommu_create_device_direct_mappings(struct iommu_group *group,
 					       struct device *dev);
 static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
@@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
 	else
 		new_domain = group->default_domain;
 
-	ret = __iommu_group_set_domain(group, new_domain);
+	ret = __iommu_group_set_domain(group, new_domain, false);
 	WARN(ret, "iommu driver failed to attach the default/blocking domain");
 }
 
@@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain *domain,
 
 	ret = __iommu_group_for_each_dev(group, domain,
 					 iommu_group_do_attach_device);
-	if (ret == 0)
+	if (ret == 0) {
 		group->domain = domain;
+	} else {
+		/*
+		 * To recover from the case when certain device within the
+		 * group fails to attach to the new domain, we need force
+		 * attaching all devices back to the old domain.
+		 */
+		__iommu_group_set_domain(group, group->domain, true);
+	}
 
 	return ret;
 }
@@ -2164,11 +2173,12 @@ static int iommu_group_do_detach_device(struct device *dev, void *data)
 }
 
 static int __iommu_group_set_domain(struct iommu_group *group,
-				    struct iommu_domain *new_domain)
+				    struct iommu_domain *new_domain,
+				    bool force)
 {
 	int ret;
 
-	if (group->domain == new_domain)
+	if (!force && group->domain == new_domain)
 		return 0;
 
 	/*
@@ -3135,7 +3145,7 @@ static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner)
 	ret = __iommu_group_alloc_blocking_domain(group);
 	if (ret)
 		return ret;
-	ret = __iommu_group_set_domain(group, group->blocking_domain);
+	ret = __iommu_group_set_domain(group, group->blocking_domain, false);
 	if (ret)
 		return ret;
 
@@ -3222,7 +3232,7 @@ static void __iommu_release_dma_ownership(struct iommu_group *group)
 
 	group->owner_cnt = 0;
 	group->owner = NULL;
-	ret = __iommu_group_set_domain(group, group->default_domain);
+	ret = __iommu_group_set_domain(group, group->default_domain, false);
 	WARN(ret, "iommu driver failed to attach the default domain");
 }
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 2/3] iommu/amd: Skip attach device domain is same as new domain
  2023-02-03 17:36 [PATCH v2 1/3] iommu: Attach device group to old domain in error path Vasant Hegde
@ 2023-02-03 17:36 ` Vasant Hegde
  2023-02-03 17:36 ` [PATCH v2 3/3] iommu/amd: Improve page fault error reporting Vasant Hegde
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Vasant Hegde @ 2023-02-03 17:36 UTC (permalink / raw)
  To: iommu, joro; +Cc: suravee.suthikulpanit, robin.murphy, will, Vasant Hegde

If device->domain is same as new domain then we can skip the
device attach process.

Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
---
 drivers/iommu/amd/iommu.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2f8c018e1ffd..26c96b821fa2 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2161,6 +2161,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	struct amd_iommu *iommu = rlookup_amd_iommu(dev);
 	int ret;
 
+	/*
+	 * Skip attach device to domain if new domain is same as
+	 * devices current domain
+	 */
+	if (dev_data->domain == domain)
+		return 0;
+
 	dev_data->defer_attach = false;
 
 	if (dev_data->domain)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 3/3] iommu/amd: Improve page fault error reporting
  2023-02-03 17:36 [PATCH v2 1/3] iommu: Attach device group to old domain in error path Vasant Hegde
  2023-02-03 17:36 ` [PATCH v2 2/3] iommu/amd: Skip attach device domain is same as new domain Vasant Hegde
@ 2023-02-03 17:36 ` Vasant Hegde
  2023-02-04  3:37 ` [PATCH v2 1/3] iommu: Attach device group to old domain in error path Baolu Lu
  2023-02-06 13:29 ` Robin Murphy
  3 siblings, 0 replies; 9+ messages in thread
From: Vasant Hegde @ 2023-02-03 17:36 UTC (permalink / raw)
  To: iommu, joro
  Cc: suravee.suthikulpanit, robin.murphy, will, Vasant Hegde, Matt Fagnani

If IOMMU domain for device group is not setup properly then we may hit
IOMMU page fault. Current page fault handler assumes that domain is
always setup and it will hit NULL pointer derefence (see below sample log).

Lets check whether domain is setup or not and log appropriate message.

Sample log:
----------
Jan 06 02:07:52 kernel: amdgpu 0000:00:01.0: amdgpu: SE 1, SH per SE 1, CU per SH 8, active_cu_number 6
Jan 06 02:07:52 kernel: BUG: kernel NULL pointer dereference, address: 0000000000000058
Jan 06 02:07:52 kernel: #PF: supervisor read access in kernel mode
Jan 06 02:07:53 kernel: #PF: error_code(0x0000) - not-present page
Jan 06 02:07:53 kernel: PGD 0 P4D 0
Jan 06 02:07:53 kernel: Oops: 0000 [#1] PREEMPT SMP NOPTI
Jan 06 02:07:53 kernel: CPU: 2 PID: 56 Comm: irq/24-AMD-Vi Not tainted 6.2.0-rc2+ #89
Jan 06 02:07:53 kernel: Hardware name: xxx
Jan 06 02:07:53 kernel: RIP: 0010:report_iommu_fault+0x11/0x90
Jan 06 02:07:53 kernel: Code: 0f 0b eb cd 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 55 41 54 41 89 cc 55 48 89 d5 53 <48> 8b 47 48 48 89 f3 48 85 c0 74 64 4c 8b 47 50 e8 ea e8 4a 00 41
Jan 06 02:07:53 kernel: RSP: 0018:ffffb9b7803ebe08 EFLAGS: 00010246
Jan 06 02:07:53 kernel: RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
Jan 06 02:07:53 kernel: RDX: 00000001522e2500 RSI: ffff940cc09e30d0 RDI: 0000000000000010
Jan 06 02:07:53 kernel: RBP: 00000001522e2500 R08: ffff940cc12c2b00 R09: 0000000000000050
Jan 06 02:07:53 kernel: R10: ffff940cc021e000 R11: 0000000000000000 R12: 0000000000000000
Jan 06 02:07:53 kernel: R13: ffff940cc005b000 R14: 0000000000000008 R15: 0000000000000000
Jan 06 02:07:53 kernel: FS:  0000000000000000(0000) GS:ffff940db7500000(0000) knlGS:0000000000000000
Jan 06 02:07:53 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 06 02:07:53 kernel: CR2: 0000000000000058 CR3: 000000010230a000 CR4: 00000000001506e0
Jan 06 02:07:53 kernel: Call Trace:
Jan 06 02:07:53 kernel:  <TASK>
Jan 06 02:07:53 kernel:  amd_iommu_int_thread+0x60c/0x760
Jan 06 02:07:53 kernel:  ? __pfx_irq_thread_fn+0x10/0x10
Jan 06 02:07:53 kernel:  irq_thread_fn+0x1f/0x60
Jan 06 02:07:53 kernel:  irq_thread+0xea/0x1a0
Jan 06 02:07:53 kernel:  ? preempt_count_add+0x6a/0xa0
Jan 06 02:07:53 kernel:  ? __pfx_irq_thread_dtor+0x10/0x10
Jan 06 02:07:53 kernel:  ? __pfx_irq_thread+0x10/0x10
Jan 06 02:07:53 kernel:  kthread+0xe9/0x110
Jan 06 02:07:53 kernel:  ? __pfx_kthread+0x10/0x10
Jan 06 02:07:53 kernel:  ret_from_fork+0x2c/0x50
Jan 06 02:07:53 kernel:  </TASK>

Reported-by: Matt Fagnani <matt.fagnani@bell.net>
Suggested-by: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
---
Changes in v2:
  - Removed BUG_ON and added log message.

-Vasant

 drivers/iommu/amd/iommu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 26c96b821fa2..9dc41c48c29d 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -558,6 +558,15 @@ static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
 		 * prevent logging it.
 		 */
 		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
+			/* Device not attached to domain properly */
+			if (dev_data->domain == NULL) {
+				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
+				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
+						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+						   PCI_FUNC(devid), domain_id);
+				goto out;
+			}
+
 			if (!report_iommu_fault(&dev_data->domain->domain,
 						&pdev->dev, address,
 						IS_WRITE_REQUEST(flags) ?
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-03 17:36 [PATCH v2 1/3] iommu: Attach device group to old domain in error path Vasant Hegde
  2023-02-03 17:36 ` [PATCH v2 2/3] iommu/amd: Skip attach device domain is same as new domain Vasant Hegde
  2023-02-03 17:36 ` [PATCH v2 3/3] iommu/amd: Improve page fault error reporting Vasant Hegde
@ 2023-02-04  3:37 ` Baolu Lu
  2023-02-06 13:05   ` Vasant Hegde
  2023-02-06 13:29 ` Robin Murphy
  3 siblings, 1 reply; 9+ messages in thread
From: Baolu Lu @ 2023-02-04  3:37 UTC (permalink / raw)
  To: Vasant Hegde, iommu, joro
  Cc: baolu.lu, suravee.suthikulpanit, robin.murphy, will

On 2023/2/4 1:36, Vasant Hegde wrote:
> iommu_attach_group() attaches all devices in a group to domain and then
> sets group domain (group->domain). Current code (__iommu_attach_group())
> does not handle error path. This creates problem as devices to domain
> attachment is in inconsistent state.
> 
> To recover from this situation, we need force attaching all devices back
> to the old domain. This patch introduces `force` parameter to
> __iommu_group_set_domain() so that we can attach devices back to old
> domain.
> 
> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
> ---
>   drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>   1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index de91dd88705b..e58683cd3bf7 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>   static int __iommu_attach_group(struct iommu_domain *domain,
>   				struct iommu_group *group);
>   static int __iommu_group_set_domain(struct iommu_group *group,
> -				    struct iommu_domain *new_domain);
> +				    struct iommu_domain *new_domain,
> +				    bool force);
>   static int iommu_create_device_direct_mappings(struct iommu_group *group,
>   					       struct device *dev);
>   static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
>   	else
>   		new_domain = group->default_domain;
>   
> -	ret = __iommu_group_set_domain(group, new_domain);
> +	ret = __iommu_group_set_domain(group, new_domain, false);
>   	WARN(ret, "iommu driver failed to attach the default/blocking domain");
>   }
>   
> @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain *domain,
>   
>   	ret = __iommu_group_for_each_dev(group, domain,
>   					 iommu_group_do_attach_device);
> -	if (ret == 0)
> +	if (ret == 0) {
>   		group->domain = domain;
> +	} else {
> +		/*
> +		 * To recover from the case when certain device within the
> +		 * group fails to attach to the new domain, we need force
> +		 * attaching all devices back to the old domain.
> +		 */
> +		__iommu_group_set_domain(group, group->domain, true);

Adding force parameter to __iommu_group_set_domain() looks strange. Does
it work if we make above line like below?

		/*
		 * To recover from the case when certain device within the
		 * group fails to attach to the new domain, we need force
		 * attaching all devices back to the old domain. The old
		 * domain is compatible for all devices in the group,
		 * hence the iommu driver should always return success.
		 */
		old_domain = group->domain;
		group->domain = NULL;
		WARN(__iommu_group_set_domain(group, old_domain),
		     "iommu driver failed to attach a compatible domain");

Best regards,
baolu

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-04  3:37 ` [PATCH v2 1/3] iommu: Attach device group to old domain in error path Baolu Lu
@ 2023-02-06 13:05   ` Vasant Hegde
  0 siblings, 0 replies; 9+ messages in thread
From: Vasant Hegde @ 2023-02-06 13:05 UTC (permalink / raw)
  To: Baolu Lu, iommu, joro; +Cc: suravee.suthikulpanit, robin.murphy, will

Hi Baolu,


On 2/4/2023 9:07 AM, Baolu Lu wrote:
> On 2023/2/4 1:36, Vasant Hegde wrote:
>> iommu_attach_group() attaches all devices in a group to domain and then
>> sets group domain (group->domain). Current code (__iommu_attach_group())
>> does not handle error path. This creates problem as devices to domain
>> attachment is in inconsistent state.
>>
>> To recover from this situation, we need force attaching all devices back
>> to the old domain. This patch introduces `force` parameter to
>> __iommu_group_set_domain() so that we can attach devices back to old
>> domain.
>>
>> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
>> ---
>>   drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>>   1 file changed, 17 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index de91dd88705b..e58683cd3bf7 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>>   static int __iommu_attach_group(struct iommu_domain *domain,
>>                   struct iommu_group *group);
>>   static int __iommu_group_set_domain(struct iommu_group *group,
>> -                    struct iommu_domain *new_domain);
>> +                    struct iommu_domain *new_domain,
>> +                    bool force);
>>   static int iommu_create_device_direct_mappings(struct iommu_group *group,
>>                              struct device *dev);
>>   static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct
>> iommu_group *group)
>>       else
>>           new_domain = group->default_domain;
>>   -    ret = __iommu_group_set_domain(group, new_domain);
>> +    ret = __iommu_group_set_domain(group, new_domain, false);
>>       WARN(ret, "iommu driver failed to attach the default/blocking domain");
>>   }
>>   @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain
>> *domain,
>>         ret = __iommu_group_for_each_dev(group, domain,
>>                        iommu_group_do_attach_device);
>> -    if (ret == 0)
>> +    if (ret == 0) {
>>           group->domain = domain;
>> +    } else {
>> +        /*
>> +         * To recover from the case when certain device within the
>> +         * group fails to attach to the new domain, we need force
>> +         * attaching all devices back to the old domain.
>> +         */
>> +        __iommu_group_set_domain(group, group->domain, true);
> 
> Adding force parameter to __iommu_group_set_domain() looks strange. Does
> it work if we make above line like below?

Yeah. It should work fine. Will fix it in v3.

-Vasant


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-03 17:36 [PATCH v2 1/3] iommu: Attach device group to old domain in error path Vasant Hegde
                   ` (2 preceding siblings ...)
  2023-02-04  3:37 ` [PATCH v2 1/3] iommu: Attach device group to old domain in error path Baolu Lu
@ 2023-02-06 13:29 ` Robin Murphy
  2023-02-06 13:55   ` Vasant Hegde
  3 siblings, 1 reply; 9+ messages in thread
From: Robin Murphy @ 2023-02-06 13:29 UTC (permalink / raw)
  To: Vasant Hegde, iommu, joro; +Cc: suravee.suthikulpanit, will

On 2023-02-03 17:36, Vasant Hegde wrote:
> iommu_attach_group() attaches all devices in a group to domain and then
> sets group domain (group->domain). Current code (__iommu_attach_group())
> does not handle error path. This creates problem as devices to domain
> attachment is in inconsistent state.

Can you clarify the circumstances for this happening? I think the 
implicit assumption here is that if first attach_dev succeeds, then the 
group is compatible with the domain in general, so then any subsequent 
attach_dev calls are mostly just updating data for additional RIDs, and 
as such would not be expected to fail.

Thanks,
Robin.

> To recover from this situation, we need force attaching all devices back
> to the old domain. This patch introduces `force` parameter to
> __iommu_group_set_domain() so that we can attach devices back to old
> domain.
> 
> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
> ---
>   drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>   1 file changed, 17 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index de91dd88705b..e58683cd3bf7 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>   static int __iommu_attach_group(struct iommu_domain *domain,
>   				struct iommu_group *group);
>   static int __iommu_group_set_domain(struct iommu_group *group,
> -				    struct iommu_domain *new_domain);
> +				    struct iommu_domain *new_domain,
> +				    bool force);
>   static int iommu_create_device_direct_mappings(struct iommu_group *group,
>   					       struct device *dev);
>   static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
>   	else
>   		new_domain = group->default_domain;
>   
> -	ret = __iommu_group_set_domain(group, new_domain);
> +	ret = __iommu_group_set_domain(group, new_domain, false);
>   	WARN(ret, "iommu driver failed to attach the default/blocking domain");
>   }
>   
> @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain *domain,
>   
>   	ret = __iommu_group_for_each_dev(group, domain,
>   					 iommu_group_do_attach_device);
> -	if (ret == 0)
> +	if (ret == 0) {
>   		group->domain = domain;
> +	} else {
> +		/*
> +		 * To recover from the case when certain device within the
> +		 * group fails to attach to the new domain, we need force
> +		 * attaching all devices back to the old domain.
> +		 */
> +		__iommu_group_set_domain(group, group->domain, true);
> +	}
>   
>   	return ret;
>   }
> @@ -2164,11 +2173,12 @@ static int iommu_group_do_detach_device(struct device *dev, void *data)
>   }
>   
>   static int __iommu_group_set_domain(struct iommu_group *group,
> -				    struct iommu_domain *new_domain)
> +				    struct iommu_domain *new_domain,
> +				    bool force)
>   {
>   	int ret;
>   
> -	if (group->domain == new_domain)
> +	if (!force && group->domain == new_domain)
>   		return 0;
>   
>   	/*
> @@ -3135,7 +3145,7 @@ static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner)
>   	ret = __iommu_group_alloc_blocking_domain(group);
>   	if (ret)
>   		return ret;
> -	ret = __iommu_group_set_domain(group, group->blocking_domain);
> +	ret = __iommu_group_set_domain(group, group->blocking_domain, false);
>   	if (ret)
>   		return ret;
>   
> @@ -3222,7 +3232,7 @@ static void __iommu_release_dma_ownership(struct iommu_group *group)
>   
>   	group->owner_cnt = 0;
>   	group->owner = NULL;
> -	ret = __iommu_group_set_domain(group, group->default_domain);
> +	ret = __iommu_group_set_domain(group, group->default_domain, false);
>   	WARN(ret, "iommu driver failed to attach the default domain");
>   }
>   

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-06 13:29 ` Robin Murphy
@ 2023-02-06 13:55   ` Vasant Hegde
  2023-02-07 10:09     ` Robin Murphy
  0 siblings, 1 reply; 9+ messages in thread
From: Vasant Hegde @ 2023-02-06 13:55 UTC (permalink / raw)
  To: Robin Murphy, iommu, joro; +Cc: suravee.suthikulpanit, will

Hi Robin,


On 2/6/2023 6:59 PM, Robin Murphy wrote:
> On 2023-02-03 17:36, Vasant Hegde wrote:
>> iommu_attach_group() attaches all devices in a group to domain and then
>> sets group domain (group->domain). Current code (__iommu_attach_group())
>> does not handle error path. This creates problem as devices to domain
>> attachment is in inconsistent state.
> 
> Can you clarify the circumstances for this happening? I think the implicit
> assumption here is that if first attach_dev succeeds, then the group is
> compatible with the domain in general, so then any subsequent attach_dev calls
> are mostly just updating data for additional RIDs, and as such would not be
> expected to fail.

It can happen with even single device in the device group.

Flow :
  - Try to attach device to new domain (say IOMMU_DOMAIN_UMANAGED)
  - In iommu_attach_group() path we detach device from current domain. Then it
tries to attach device to new group.
  - If it fails then device to domain link is broken.
  - iommu_attach_group() returns error.
  - At this stage caller assumes attaching device to new domain failed.
  - But it should work fine with old domain.
    This is what I'm trying to fix it in this patch.


Recently we hit this issue with AMD IOMMU v2 driver [1], but it can happen with
any other code path (like VFIO path).

[1]
https://lore.kernel.org/linux-iommu/15d0f9ff-2a56-b3e9-5b45-e6b23300ae3b@leemhuis.info/

-Vasant



> 
> Thanks,
> Robin.
> 
>> To recover from this situation, we need force attaching all devices back
>> to the old domain. This patch introduces `force` parameter to
>> __iommu_group_set_domain() so that we can attach devices back to old
>> domain.
>>
>> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
>> ---
>>   drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>>   1 file changed, 17 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index de91dd88705b..e58683cd3bf7 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>>   static int __iommu_attach_group(struct iommu_domain *domain,
>>                   struct iommu_group *group);
>>   static int __iommu_group_set_domain(struct iommu_group *group,
>> -                    struct iommu_domain *new_domain);
>> +                    struct iommu_domain *new_domain,
>> +                    bool force);
>>   static int iommu_create_device_direct_mappings(struct iommu_group *group,
>>                              struct device *dev);
>>   static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct
>> iommu_group *group)
>>       else
>>           new_domain = group->default_domain;
>>   -    ret = __iommu_group_set_domain(group, new_domain);
>> +    ret = __iommu_group_set_domain(group, new_domain, false);
>>       WARN(ret, "iommu driver failed to attach the default/blocking domain");
>>   }
>>   @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain
>> *domain,
>>         ret = __iommu_group_for_each_dev(group, domain,
>>                        iommu_group_do_attach_device);
>> -    if (ret == 0)
>> +    if (ret == 0) {
>>           group->domain = domain;
>> +    } else {
>> +        /*
>> +         * To recover from the case when certain device within the
>> +         * group fails to attach to the new domain, we need force
>> +         * attaching all devices back to the old domain.
>> +         */
>> +        __iommu_group_set_domain(group, group->domain, true);
>> +    }
>>         return ret;
>>   }
>> @@ -2164,11 +2173,12 @@ static int iommu_group_do_detach_device(struct device
>> *dev, void *data)
>>   }
>>     static int __iommu_group_set_domain(struct iommu_group *group,
>> -                    struct iommu_domain *new_domain)
>> +                    struct iommu_domain *new_domain,
>> +                    bool force)
>>   {
>>       int ret;
>>   -    if (group->domain == new_domain)
>> +    if (!force && group->domain == new_domain)
>>           return 0;
>>         /*
>> @@ -3135,7 +3145,7 @@ static int __iommu_take_dma_ownership(struct iommu_group
>> *group, void *owner)
>>       ret = __iommu_group_alloc_blocking_domain(group);
>>       if (ret)
>>           return ret;
>> -    ret = __iommu_group_set_domain(group, group->blocking_domain);
>> +    ret = __iommu_group_set_domain(group, group->blocking_domain, false);
>>       if (ret)
>>           return ret;
>>   @@ -3222,7 +3232,7 @@ static void __iommu_release_dma_ownership(struct
>> iommu_group *group)
>>         group->owner_cnt = 0;
>>       group->owner = NULL;
>> -    ret = __iommu_group_set_domain(group, group->default_domain);
>> +    ret = __iommu_group_set_domain(group, group->default_domain, false);
>>       WARN(ret, "iommu driver failed to attach the default domain");
>>   }
>>   

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-06 13:55   ` Vasant Hegde
@ 2023-02-07 10:09     ` Robin Murphy
  2023-02-07 14:11       ` Vasant Hegde
  0 siblings, 1 reply; 9+ messages in thread
From: Robin Murphy @ 2023-02-07 10:09 UTC (permalink / raw)
  To: Vasant Hegde, iommu, joro; +Cc: suravee.suthikulpanit, will

On 2023-02-06 13:55, Vasant Hegde wrote:
> Hi Robin,
> 
> 
> On 2/6/2023 6:59 PM, Robin Murphy wrote:
>> On 2023-02-03 17:36, Vasant Hegde wrote:
>>> iommu_attach_group() attaches all devices in a group to domain and then
>>> sets group domain (group->domain). Current code (__iommu_attach_group())
>>> does not handle error path. This creates problem as devices to domain
>>> attachment is in inconsistent state.
>>
>> Can you clarify the circumstances for this happening? I think the implicit
>> assumption here is that if first attach_dev succeeds, then the group is
>> compatible with the domain in general, so then any subsequent attach_dev calls
>> are mostly just updating data for additional RIDs, and as such would not be
>> expected to fail.
> 
> It can happen with even single device in the device group.
> 
> Flow :
>    - Try to attach device to new domain (say IOMMU_DOMAIN_UMANAGED)
>    - In iommu_attach_group() path we detach device from current domain. Then it
> tries to attach device to new group.
>    - If it fails then device to domain link is broken.
>    - iommu_attach_group() returns error.
>    - At this stage caller assumes attaching device to new domain failed.
>    - But it should work fine with old domain.
>      This is what I'm trying to fix it in this patch.

Oh, I see. However, in that case the assumption in the core code is 
definitely that if the attach failed then the device remains attached to 
the original domain. If the AMD driver is doing something different, 
shouldn't that just be fixed in the driver (i.e. do all the preparation 
on the new domain *before* the detach_device()/do_attach() cycle)?

Thanks,
Robin.

> 
> 
> Recently we hit this issue with AMD IOMMU v2 driver [1], but it can happen with
> any other code path (like VFIO path).
> 
> [1]
> https://lore.kernel.org/linux-iommu/15d0f9ff-2a56-b3e9-5b45-e6b23300ae3b@leemhuis.info/
> 
> -Vasant
> 
> 
> 
>>
>> Thanks,
>> Robin.
>>
>>> To recover from this situation, we need force attaching all devices back
>>> to the old domain. This patch introduces `force` parameter to
>>> __iommu_group_set_domain() so that we can attach devices back to old
>>> domain.
>>>
>>> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
>>> ---
>>>    drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>>>    1 file changed, 17 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>>> index de91dd88705b..e58683cd3bf7 100644
>>> --- a/drivers/iommu/iommu.c
>>> +++ b/drivers/iommu/iommu.c
>>> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>>>    static int __iommu_attach_group(struct iommu_domain *domain,
>>>                    struct iommu_group *group);
>>>    static int __iommu_group_set_domain(struct iommu_group *group,
>>> -                    struct iommu_domain *new_domain);
>>> +                    struct iommu_domain *new_domain,
>>> +                    bool force);
>>>    static int iommu_create_device_direct_mappings(struct iommu_group *group,
>>>                               struct device *dev);
>>>    static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>>> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct
>>> iommu_group *group)
>>>        else
>>>            new_domain = group->default_domain;
>>>    -    ret = __iommu_group_set_domain(group, new_domain);
>>> +    ret = __iommu_group_set_domain(group, new_domain, false);
>>>        WARN(ret, "iommu driver failed to attach the default/blocking domain");
>>>    }
>>>    @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain
>>> *domain,
>>>          ret = __iommu_group_for_each_dev(group, domain,
>>>                         iommu_group_do_attach_device);
>>> -    if (ret == 0)
>>> +    if (ret == 0) {
>>>            group->domain = domain;
>>> +    } else {
>>> +        /*
>>> +         * To recover from the case when certain device within the
>>> +         * group fails to attach to the new domain, we need force
>>> +         * attaching all devices back to the old domain.
>>> +         */
>>> +        __iommu_group_set_domain(group, group->domain, true);
>>> +    }
>>>          return ret;
>>>    }
>>> @@ -2164,11 +2173,12 @@ static int iommu_group_do_detach_device(struct device
>>> *dev, void *data)
>>>    }
>>>      static int __iommu_group_set_domain(struct iommu_group *group,
>>> -                    struct iommu_domain *new_domain)
>>> +                    struct iommu_domain *new_domain,
>>> +                    bool force)
>>>    {
>>>        int ret;
>>>    -    if (group->domain == new_domain)
>>> +    if (!force && group->domain == new_domain)
>>>            return 0;
>>>          /*
>>> @@ -3135,7 +3145,7 @@ static int __iommu_take_dma_ownership(struct iommu_group
>>> *group, void *owner)
>>>        ret = __iommu_group_alloc_blocking_domain(group);
>>>        if (ret)
>>>            return ret;
>>> -    ret = __iommu_group_set_domain(group, group->blocking_domain);
>>> +    ret = __iommu_group_set_domain(group, group->blocking_domain, false);
>>>        if (ret)
>>>            return ret;
>>>    @@ -3222,7 +3232,7 @@ static void __iommu_release_dma_ownership(struct
>>> iommu_group *group)
>>>          group->owner_cnt = 0;
>>>        group->owner = NULL;
>>> -    ret = __iommu_group_set_domain(group, group->default_domain);
>>> +    ret = __iommu_group_set_domain(group, group->default_domain, false);
>>>        WARN(ret, "iommu driver failed to attach the default domain");
>>>    }
>>>    

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 1/3] iommu: Attach device group to old domain in error path
  2023-02-07 10:09     ` Robin Murphy
@ 2023-02-07 14:11       ` Vasant Hegde
  0 siblings, 0 replies; 9+ messages in thread
From: Vasant Hegde @ 2023-02-07 14:11 UTC (permalink / raw)
  To: Robin Murphy, iommu, joro; +Cc: suravee.suthikulpanit, will

Hi Robin,


On 2/7/2023 3:39 PM, Robin Murphy wrote:
> On 2023-02-06 13:55, Vasant Hegde wrote:
>> Hi Robin,
>>
>>
>> On 2/6/2023 6:59 PM, Robin Murphy wrote:
>>> On 2023-02-03 17:36, Vasant Hegde wrote:
>>>> iommu_attach_group() attaches all devices in a group to domain and then
>>>> sets group domain (group->domain). Current code (__iommu_attach_group())
>>>> does not handle error path. This creates problem as devices to domain
>>>> attachment is in inconsistent state.
>>>
>>> Can you clarify the circumstances for this happening? I think the implicit
>>> assumption here is that if first attach_dev succeeds, then the group is
>>> compatible with the domain in general, so then any subsequent attach_dev calls
>>> are mostly just updating data for additional RIDs, and as such would not be
>>> expected to fail.
>>
>> It can happen with even single device in the device group.
>>
>> Flow :
>>    - Try to attach device to new domain (say IOMMU_DOMAIN_UMANAGED)
>>    - In iommu_attach_group() path we detach device from current domain. Then it
>> tries to attach device to new group.
>>    - If it fails then device to domain link is broken.
>>    - iommu_attach_group() returns error.
>>    - At this stage caller assumes attaching device to new domain failed.
>>    - But it should work fine with old domain.
>>      This is what I'm trying to fix it in this patch.
> 
> Oh, I see. However, in that case the assumption in the core code is definitely
> that if the attach failed then the device remains attached to the original
> domain. If the AMD driver is doing something different, shouldn't that just be
> fixed in the driver (i.e. do all the preparation on the new domain *before* the
> detach_device()/do_attach() cycle)?

I tried to fix it in core code instead of AMD driver because:
  - If we have multiple devices in group, then recovering inside hardware driver
(like AMD driver) becomes difficult. Say we have two devices and second device
attachment failed then how do we re-attach first device back to old domain.
  - As I understand its not AMD specific driver issue. I did look into few other
driver (like s390-iommu.c, arm-smmu-v3.c). They also detach devices before
attaching devices to new domain.
  - Finally fixing it in core code makes it easy for all driver. The flow I was
thinking is something like:
  iommu_attach_group() -> __iommu_attach_group()
    -> iommu_domain_ops->attach_dev() : if failed return error
      -> __iommu_attach_group() recovers for the failure for entire group.
        -> device group to domain attachement is consistent and return error
           from iommu_attach_group()


Please let me know if we have better way to fix this issue.

-Vasant




> 
> Thanks,
> Robin.
> 
>>
>>
>> Recently we hit this issue with AMD IOMMU v2 driver [1], but it can happen with
>> any other code path (like VFIO path).
>>
>> [1]
>> https://lore.kernel.org/linux-iommu/15d0f9ff-2a56-b3e9-5b45-e6b23300ae3b@leemhuis.info/
>>
>>
>> -Vasant
>>
>>
>>
>>>
>>> Thanks,
>>> Robin.
>>>
>>>> To recover from this situation, we need force attaching all devices back
>>>> to the old domain. This patch introduces `force` parameter to
>>>> __iommu_group_set_domain() so that we can attach devices back to old
>>>> domain.
>>>>
>>>> Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
>>>> ---
>>>>    drivers/iommu/iommu.c | 24 +++++++++++++++++-------
>>>>    1 file changed, 17 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>>>> index de91dd88705b..e58683cd3bf7 100644
>>>> --- a/drivers/iommu/iommu.c
>>>> +++ b/drivers/iommu/iommu.c
>>>> @@ -95,7 +95,8 @@ static int __iommu_attach_device(struct iommu_domain *domain,
>>>>    static int __iommu_attach_group(struct iommu_domain *domain,
>>>>                    struct iommu_group *group);
>>>>    static int __iommu_group_set_domain(struct iommu_group *group,
>>>> -                    struct iommu_domain *new_domain);
>>>> +                    struct iommu_domain *new_domain,
>>>> +                    bool force);
>>>>    static int iommu_create_device_direct_mappings(struct iommu_group *group,
>>>>                               struct device *dev);
>>>>    static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>>>> @@ -1974,7 +1975,7 @@ static void __iommu_group_set_core_domain(struct
>>>> iommu_group *group)
>>>>        else
>>>>            new_domain = group->default_domain;
>>>>    -    ret = __iommu_group_set_domain(group, new_domain);
>>>> +    ret = __iommu_group_set_domain(group, new_domain, false);
>>>>        WARN(ret, "iommu driver failed to attach the default/blocking domain");
>>>>    }
>>>>    @@ -2124,8 +2125,16 @@ static int __iommu_attach_group(struct iommu_domain
>>>> *domain,
>>>>          ret = __iommu_group_for_each_dev(group, domain,
>>>>                         iommu_group_do_attach_device);
>>>> -    if (ret == 0)
>>>> +    if (ret == 0) {
>>>>            group->domain = domain;
>>>> +    } else {
>>>> +        /*
>>>> +         * To recover from the case when certain device within the
>>>> +         * group fails to attach to the new domain, we need force
>>>> +         * attaching all devices back to the old domain.
>>>> +         */
>>>> +        __iommu_group_set_domain(group, group->domain, true);
>>>> +    }
>>>>          return ret;
>>>>    }
>>>> @@ -2164,11 +2173,12 @@ static int iommu_group_do_detach_device(struct device
>>>> *dev, void *data)
>>>>    }
>>>>      static int __iommu_group_set_domain(struct iommu_group *group,
>>>> -                    struct iommu_domain *new_domain)
>>>> +                    struct iommu_domain *new_domain,
>>>> +                    bool force)
>>>>    {
>>>>        int ret;
>>>>    -    if (group->domain == new_domain)
>>>> +    if (!force && group->domain == new_domain)
>>>>            return 0;
>>>>          /*
>>>> @@ -3135,7 +3145,7 @@ static int __iommu_take_dma_ownership(struct iommu_group
>>>> *group, void *owner)
>>>>        ret = __iommu_group_alloc_blocking_domain(group);
>>>>        if (ret)
>>>>            return ret;
>>>> -    ret = __iommu_group_set_domain(group, group->blocking_domain);
>>>> +    ret = __iommu_group_set_domain(group, group->blocking_domain, false);
>>>>        if (ret)
>>>>            return ret;
>>>>    @@ -3222,7 +3232,7 @@ static void __iommu_release_dma_ownership(struct
>>>> iommu_group *group)
>>>>          group->owner_cnt = 0;
>>>>        group->owner = NULL;
>>>> -    ret = __iommu_group_set_domain(group, group->default_domain);
>>>> +    ret = __iommu_group_set_domain(group, group->default_domain, false);
>>>>        WARN(ret, "iommu driver failed to attach the default domain");
>>>>    }
>>>>    

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-02-07 14:12 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-03 17:36 [PATCH v2 1/3] iommu: Attach device group to old domain in error path Vasant Hegde
2023-02-03 17:36 ` [PATCH v2 2/3] iommu/amd: Skip attach device domain is same as new domain Vasant Hegde
2023-02-03 17:36 ` [PATCH v2 3/3] iommu/amd: Improve page fault error reporting Vasant Hegde
2023-02-04  3:37 ` [PATCH v2 1/3] iommu: Attach device group to old domain in error path Baolu Lu
2023-02-06 13:05   ` Vasant Hegde
2023-02-06 13:29 ` Robin Murphy
2023-02-06 13:55   ` Vasant Hegde
2023-02-07 10:09     ` Robin Murphy
2023-02-07 14:11       ` Vasant Hegde

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.