linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v5 0/2] Enable IOMMU support for pseries Secure VMs
@ 2019-12-07  1:12 Ram Pai
  2019-12-07  1:12 ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Ram Pai
  0 siblings, 1 reply; 21+ messages in thread
From: Ram Pai @ 2019-12-07  1:12 UTC (permalink / raw)
  To: mpe
  Cc: linuxppc-dev, benh, david, paulus, mdroth, hch, linuxram,
	andmike, sukadev, mst, ram.n.pai, aik, cai, tglx, bauerman,
	linux-kernel, leonardo

This patch series enables IOMMU support for pseries Secure VMs.

Tested using QEMU command line option:

	"-device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x4,
	iommu_platform=on,disable-modern=off,disable-legacy=on"

	and

	"-device virtio-blk-pci,scsi=off,bus=pci.0,
	addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,
	iommu_platform=on,disable-modern=off,disable-legacy=on"

changelog:
	v5: unshare the page used in H_PUT_TCE_INDIRECT hcall to
       	    communicate the TCE entries with the hypervisor.
	    Concern raised by Alexey. This minimizes the number
	    of pages shared with the hypervisor.

	v4: Corrected the Subject, to include the keyword 'PATCH'.
		No other changes.

	v3: Better description of 2/2 patch.
		Suggested by David Gibson.

	v2: added comments describing the changes.
		Suggested by Alexey and Michael Ellermen.


Ram Pai (2):
  powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.

 arch/powerpc/platforms/pseries/iommu.c | 43 ++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 13 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-07  1:12 [PATCH v5 0/2] Enable IOMMU support for pseries Secure VMs Ram Pai
@ 2019-12-07  1:12 ` Ram Pai
  2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
  2019-12-10  3:07   ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Alexey Kardashevskiy
  0 siblings, 2 replies; 21+ messages in thread
From: Ram Pai @ 2019-12-07  1:12 UTC (permalink / raw)
  To: mpe
  Cc: linuxppc-dev, benh, david, paulus, mdroth, hch, linuxram,
	andmike, sukadev, mst, ram.n.pai, aik, cai, tglx, bauerman,
	linux-kernel, leonardo

H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
its parameters.  On secure VMs, hypervisor cannot access the contents of
this page since it gets encrypted.  Hence share the page with the
hypervisor, and unshare when done.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 6ba081d..67b5009 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -37,6 +37,7 @@
 #include <asm/mmzone.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/svm.h>
+#include <asm/ultravisor.h>
 
 #include "pseries.h"
 
@@ -179,6 +180,18 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 
 static DEFINE_PER_CPU(__be64 *, tce_page);
 
+static void pre_process_tce_page(__be64 *tcep)
+{
+	if (tcep && is_secure_guest())
+		uv_share_page(PHYS_PFN(__pa(tcep)), 1);
+}
+
+static void post_process_tce_page(__be64 *tcep)
+{
+	if (tcep && is_secure_guest())
+		uv_unshare_page(PHYS_PFN(__pa(tcep)), 1);
+}
+
 static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
 				     enum dma_data_direction direction,
@@ -187,7 +200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	u64 rc = 0;
 	u64 proto_tce;
 	__be64 *tcep;
-	u64 rpn;
+	u64 rpn, tcep0;
 	long l, limit;
 	long tcenum_start = tcenum, npages_start = npages;
 	int ret = 0;
@@ -216,6 +229,8 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		__this_cpu_write(tce_page, tcep);
 	}
 
+	pre_process_tce_page(tcep);
+
 	rpn = __pa(uaddr) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
 	if (direction != DMA_TO_DEVICE)
@@ -243,6 +258,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	/*
+	 * if "tcep" is shared, post_process_tce_page() will unshare the page,
+	 * which will zero the page. Grab any interesting content, before it
+	 * disappears.
+	 */
+	tcep0 = tcep[0];
+	post_process_tce_page(tcep);
+
 	local_irq_restore(flags);
 
 	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
@@ -256,7 +279,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
 		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
 		printk("\tnpages  = 0x%llx\n", (u64)npages);
-		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
+		printk("\ttce[0] val = 0x%llx\n", tcep0);
 		dump_stack();
 	}
 	return ret;
@@ -280,7 +303,6 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages
 	}
 }
 
-
 static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
 {
 	u64 rc;
@@ -413,6 +435,8 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 		__this_cpu_write(tce_page, tcep);
 	}
 
+	pre_process_tce_page(tcep);
+
 	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
 
 	liobn = (u64)be32_to_cpu(maprange->liobn);
@@ -451,6 +475,8 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
 		num_tce -= limit;
 	} while (num_tce > 0 && !rc);
 
+	post_process_tce_page(tcep);
+
 	/* error cleanup: caller will clear whole range */
 
 	local_irq_enable();
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-07  1:12 ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Ram Pai
@ 2019-12-07  1:12   ` Ram Pai
  2019-12-10  3:08     ` Alexey Kardashevskiy
                       ` (2 more replies)
  2019-12-10  3:07   ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Alexey Kardashevskiy
  1 sibling, 3 replies; 21+ messages in thread
From: Ram Pai @ 2019-12-07  1:12 UTC (permalink / raw)
  To: mpe
  Cc: linuxppc-dev, benh, david, paulus, mdroth, hch, linuxram,
	andmike, sukadev, mst, ram.n.pai, aik, cai, tglx, bauerman,
	linux-kernel, leonardo

Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
		secure guests")
disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
path for secure VMs, helped enable dma_direct path.  This enabled
support for bounce-buffering through SWIOTLB.  However it fails to
operate when IOMMU is enabled, since I/O pages are not TCE mapped.

Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
cases including, TCE mapping I/O pages, in the presence of a
IOMMU.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 67b5009..4e27d66 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,7 +36,6 @@
 #include <asm/udbg.h>
 #include <asm/mmzone.h>
 #include <asm/plpar_wrappers.h>
-#include <asm/svm.h>
 #include <asm/ultravisor.h>
 
 #include "pseries.h"
@@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
 	of_reconfig_notifier_register(&iommu_reconfig_nb);
 	register_memory_notifier(&iommu_mem_nb);
 
-	/*
-	 * Secure guest memory is inacessible to devices so regular DMA isn't
-	 * possible.
-	 *
-	 * In that case keep devices' dma_map_ops as NULL so that the generic
-	 * DMA code path will use SWIOTLB to bounce buffers for DMA.
-	 */
-	if (!is_secure_guest())
-		set_pci_dma_ops(&dma_iommu_ops);
+	set_pci_dma_ops(&dma_iommu_ops);
 }
 
 static int __init disable_multitce(char *str)
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-07  1:12 ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Ram Pai
  2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
@ 2019-12-10  3:07   ` Alexey Kardashevskiy
  2019-12-10  5:12     ` Ram Pai
  1 sibling, 1 reply; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-10  3:07 UTC (permalink / raw)
  To: Ram Pai, mpe
  Cc: linuxppc-dev, benh, david, paulus, mdroth, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo



On 07/12/2019 12:12, Ram Pai wrote:
> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
> its parameters.  On secure VMs, hypervisor cannot access the contents of
> this page since it gets encrypted.  Hence share the page with the
> hypervisor, and unshare when done.


I thought the idea was to use H_PUT_TCE and avoid sharing any extra
pages. There is small problem that when DDW is enabled,
FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
about the performance on slack but this is caused by initial cleanup of
the default TCE window (which we do not use anyway) and to battle this
we can simply reduce its size by adding

-global
spapr-pci-host-bridge.dma_win_size=0x4000000

to the qemu command line. And the huge DMA window will use 16MB or 1GB
TCEs so even mapping 32GB guests is barely noticeable. What do I miss?


> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 32 +++++++++++++++++++++++++++++---
>  1 file changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 6ba081d..67b5009 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -37,6 +37,7 @@
>  #include <asm/mmzone.h>
>  #include <asm/plpar_wrappers.h>
>  #include <asm/svm.h>
> +#include <asm/ultravisor.h>
>  
>  #include "pseries.h"
>  
> @@ -179,6 +180,18 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  
>  static DEFINE_PER_CPU(__be64 *, tce_page);
>  
> +static void pre_process_tce_page(__be64 *tcep)
> +{
> +	if (tcep && is_secure_guest())
> +		uv_share_page(PHYS_PFN(__pa(tcep)), 1);
> +}
> +
> +static void post_process_tce_page(__be64 *tcep)
> +{
> +	if (tcep && is_secure_guest())
> +		uv_unshare_page(PHYS_PFN(__pa(tcep)), 1);
> +}
> +
>  static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  				     long npages, unsigned long uaddr,
>  				     enum dma_data_direction direction,
> @@ -187,7 +200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  	u64 rc = 0;
>  	u64 proto_tce;
>  	__be64 *tcep;
> -	u64 rpn;
> +	u64 rpn, tcep0;
>  	long l, limit;
>  	long tcenum_start = tcenum, npages_start = npages;
>  	int ret = 0;
> @@ -216,6 +229,8 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  		__this_cpu_write(tce_page, tcep);
>  	}
>  
> +	pre_process_tce_page(tcep);
> +
>  	rpn = __pa(uaddr) >> TCE_SHIFT;
>  	proto_tce = TCE_PCI_READ;
>  	if (direction != DMA_TO_DEVICE)
> @@ -243,6 +258,14 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  		tcenum += limit;
>  	} while (npages > 0 && !rc);
>  
> +	/*
> +	 * if "tcep" is shared, post_process_tce_page() will unshare the page,
> +	 * which will zero the page. Grab any interesting content, before it
> +	 * disappears.
> +	 */
> +	tcep0 = tcep[0];
> +	post_process_tce_page(tcep);
> +
>  	local_irq_restore(flags);
>  
>  	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
> @@ -256,7 +279,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>  		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
>  		printk("\tindex   = 0x%llx\n", (u64)tbl->it_index);
>  		printk("\tnpages  = 0x%llx\n", (u64)npages);
> -		printk("\ttce[0] val = 0x%llx\n", tcep[0]);
> +		printk("\ttce[0] val = 0x%llx\n", tcep0);
>  		dump_stack();
>  	}
>  	return ret;
> @@ -280,7 +303,6 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages
>  	}
>  }
>  
> -
>  static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
>  {
>  	u64 rc;
> @@ -413,6 +435,8 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
>  		__this_cpu_write(tce_page, tcep);
>  	}
>  
> +	pre_process_tce_page(tcep);
> +
>  	proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
>  
>  	liobn = (u64)be32_to_cpu(maprange->liobn);
> @@ -451,6 +475,8 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
>  		num_tce -= limit;
>  	} while (num_tce > 0 && !rc);
>  
> +	post_process_tce_page(tcep);
> +
>  	/* error cleanup: caller will clear whole range */
>  
>  	local_irq_enable();
> 

-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
@ 2019-12-10  3:08     ` Alexey Kardashevskiy
  2019-12-10 22:09     ` Thiago Jung Bauermann
  2019-12-11  1:43     ` Michael Roth
  2 siblings, 0 replies; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-10  3:08 UTC (permalink / raw)
  To: Ram Pai, mpe
  Cc: linuxppc-dev, benh, david, paulus, mdroth, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo



On 07/12/2019 12:12, Ram Pai wrote:
> Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
> 		secure guests")
> disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> path for secure VMs, helped enable dma_direct path.  This enabled
> support for bounce-buffering through SWIOTLB.  However it fails to
> operate when IOMMU is enabled, since I/O pages are not TCE mapped.
> 
> Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> cases including, TCE mapping I/O pages, in the presence of a
> IOMMU.
> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Although I still do not totally understand the mechanics of this
(swiotlb on top of huge DDW at 0x800.0000.0000.0000), this change looks
reasonable anyway.



> ---
>  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
>  1 file changed, 1 insertion(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 67b5009..4e27d66 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -36,7 +36,6 @@
>  #include <asm/udbg.h>
>  #include <asm/mmzone.h>
>  #include <asm/plpar_wrappers.h>
> -#include <asm/svm.h>
>  #include <asm/ultravisor.h>
>  
>  #include "pseries.h"
> @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
>  	of_reconfig_notifier_register(&iommu_reconfig_nb);
>  	register_memory_notifier(&iommu_mem_nb);
>  
> -	/*
> -	 * Secure guest memory is inacessible to devices so regular DMA isn't
> -	 * possible.
> -	 *
> -	 * In that case keep devices' dma_map_ops as NULL so that the generic
> -	 * DMA code path will use SWIOTLB to bounce buffers for DMA.
> -	 */
> -	if (!is_secure_guest())
> -		set_pci_dma_ops(&dma_iommu_ops);
> +	set_pci_dma_ops(&dma_iommu_ops);
>  }
>  
>  static int __init disable_multitce(char *str)
> 

-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-10  3:07   ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Alexey Kardashevskiy
@ 2019-12-10  5:12     ` Ram Pai
  2019-12-10  5:32       ` Alexey Kardashevskiy
  0 siblings, 1 reply; 21+ messages in thread
From: Ram Pai @ 2019-12-10  5:12 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, cai, tglx, bauerman, linux-kernel,
	leonardo

On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 07/12/2019 12:12, Ram Pai wrote:
> > H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
> > its parameters.  On secure VMs, hypervisor cannot access the contents of
> > this page since it gets encrypted.  Hence share the page with the
> > hypervisor, and unshare when done.
> 
> 
> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
> pages. There is small problem that when DDW is enabled,
> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
> about the performance on slack but this is caused by initial cleanup of
> the default TCE window (which we do not use anyway) and to battle this
> we can simply reduce its size by adding

something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
13secs per device for H_PUT_TCE approach, during boot. This is with a
30GB guest. With larger guest, the time will further detoriate.

> 
> -global
> spapr-pci-host-bridge.dma_win_size=0x4000000

This option, speeds it up tremendously.  But than should this option be
enabled in qemu by default?  only for secure VMs? for both VMs?


RP


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-10  5:12     ` Ram Pai
@ 2019-12-10  5:32       ` Alexey Kardashevskiy
  2019-12-10 15:35         ` Ram Pai
  0 siblings, 1 reply; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-10  5:32 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, cai, tglx, bauerman, linux-kernel,
	leonardo



On 10/12/2019 16:12, Ram Pai wrote:
> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 07/12/2019 12:12, Ram Pai wrote:
>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
>>> this page since it gets encrypted.  Hence share the page with the
>>> hypervisor, and unshare when done.
>>
>>
>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
>> pages. There is small problem that when DDW is enabled,
>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
>> about the performance on slack but this is caused by initial cleanup of
>> the default TCE window (which we do not use anyway) and to battle this
>> we can simply reduce its size by adding
> 
> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
> 13secs per device for H_PUT_TCE approach, during boot. This is with a
> 30GB guest. With larger guest, the time will further detoriate.


No it will not, I checked. The time is the same for 2GB and 32GB guests-
the delay is caused by clearing the small DMA window which is small by
the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
(depends on the system) so the number of TCEs is much smaller.


> 
>>
>> -global
>> spapr-pci-host-bridge.dma_win_size=0x4000000
> 
> This option, speeds it up tremendously.  But than should this option be
> enabled in qemu by default?  only for secure VMs? for both VMs?


As discussed in slack, by default we do not need to clear the entire TCE
table and we only have to map swiotlb buffer using the small window. It
is a guest kernel change only. Thanks,



-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-10  5:32       ` Alexey Kardashevskiy
@ 2019-12-10 15:35         ` Ram Pai
  2019-12-11  8:15           ` Alexey Kardashevskiy
  0 siblings, 1 reply; 21+ messages in thread
From: Ram Pai @ 2019-12-10 15:35 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, cai, tglx, bauerman, linux-kernel,
	leonardo

On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 10/12/2019 16:12, Ram Pai wrote:
> > On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 07/12/2019 12:12, Ram Pai wrote:
> >>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
> >>> its parameters.  On secure VMs, hypervisor cannot access the contents of
> >>> this page since it gets encrypted.  Hence share the page with the
> >>> hypervisor, and unshare when done.
> >>
> >>
> >> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
> >> pages. There is small problem that when DDW is enabled,
> >> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
> >> about the performance on slack but this is caused by initial cleanup of
> >> the default TCE window (which we do not use anyway) and to battle this
> >> we can simply reduce its size by adding
> > 
> > something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
> > 13secs per device for H_PUT_TCE approach, during boot. This is with a
> > 30GB guest. With larger guest, the time will further detoriate.
> 
> 
> No it will not, I checked. The time is the same for 2GB and 32GB guests-
> the delay is caused by clearing the small DMA window which is small by
> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
> (depends on the system) so the number of TCEs is much smaller.

I cant get your results.  What changes did you make to get it?

> 
> 
> > 
> >>
> >> -global
> >> spapr-pci-host-bridge.dma_win_size=0x4000000
> > 
> > This option, speeds it up tremendously.  But than should this option be
> > enabled in qemu by default?  only for secure VMs? for both VMs?
> 
> 
> As discussed in slack, by default we do not need to clear the entire TCE
> table and we only have to map swiotlb buffer using the small window. It
> is a guest kernel change only. Thanks,

Can you tell me what code you are talking about here.  Where is the TCE
table getting cleared? What code needs to be changed to not clear it?

Is the code in tce_buildmulti_pSeriesLP(), the one that does the clear
aswell?

>
>

But before I close, you have not told me clearly, what is the problem
with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.


Remember this is the same page that is earmarked for doing
H_PUT_INDIRECT_TCE, not by my patch, but its already earmarked by the
existing code. So it not some random buffer that is picked. Second 
this page is temporarily shared and unshared, it does not stay shared
for life.  It does not slow the boot. it does not need any
special command line options on the qemu.

Shared pages technology was put in place, exactly for the purpose of
sharing data with the hypervisor.  We are using this technology exactly
for that purpose.  And finally I agreed with your concern of having
shared pages staying around.  Hence i addressed that concern, by
unsharing the page.  At this point, I fail to understand your concern.


RP


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
  2019-12-10  3:08     ` Alexey Kardashevskiy
@ 2019-12-10 22:09     ` Thiago Jung Bauermann
  2019-12-11  1:43     ` Michael Roth
  2 siblings, 0 replies; 21+ messages in thread
From: Thiago Jung Bauermann @ 2019-12-10 22:09 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, aik, cai, tglx, linux-kernel, leonardo


Hello Ram,

Ram Pai <linuxram@us.ibm.com> writes:

> Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
> 		secure guests")
> disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> path for secure VMs, helped enable dma_direct path.  This enabled
> support for bounce-buffering through SWIOTLB.  However it fails to
> operate when IOMMU is enabled, since I/O pages are not TCE mapped.
>
> Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> cases including, TCE mapping I/O pages, in the presence of a
> IOMMU.
>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
>  1 file changed, 1 insertion(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 67b5009..4e27d66 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -36,7 +36,6 @@
>  #include <asm/udbg.h>
>  #include <asm/mmzone.h>
>  #include <asm/plpar_wrappers.h>
> -#include <asm/svm.h>
>  #include <asm/ultravisor.h>
>
>  #include "pseries.h"

You still need to keep <asm/svm.h>, otherwise there won't be a
definition of is_secure_guest() when CONFIG_PPC_SVM=n.

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
  2019-12-10  3:08     ` Alexey Kardashevskiy
  2019-12-10 22:09     ` Thiago Jung Bauermann
@ 2019-12-11  1:43     ` Michael Roth
  2019-12-11  8:36       ` Alexey Kardashevskiy
  2019-12-12  6:45       ` Ram Pai
  2 siblings, 2 replies; 21+ messages in thread
From: Michael Roth @ 2019-12-11  1:43 UTC (permalink / raw)
  To: Ram Pai, mpe
  Cc: linuxppc-dev, benh, david, paulus, hch, linuxram, andmike,
	sukadev, mst, ram.n.pai, aik, cai, tglx, bauerman, linux-kernel,
	leonardo

Quoting Ram Pai (2019-12-06 19:12:39)
> Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
>                 secure guests")
> disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> path for secure VMs, helped enable dma_direct path.  This enabled
> support for bounce-buffering through SWIOTLB.  However it fails to
> operate when IOMMU is enabled, since I/O pages are not TCE mapped.
> 
> Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> cases including, TCE mapping I/O pages, in the presence of a
> IOMMU.

Wasn't clear to me at first, but I guess the main gist of this series is
that we want to continue to use SWIOTLB, but also need to create mappings
of it's bounce buffers in the IOMMU, so we revert to using dma_iommu_ops
and rely on the various dma_iommu_{map,alloc}_bypass() hooks throughout
to call into dma_direct_* ops rather than relying on the dma_is_direct(ops)
checks in DMA API functions to do the same.

That makes sense, but one issue I see with that is that
dma_iommu_map_bypass() only tests true if all the following are true:

1) the device requests a 64-bit DMA mask via
   dma_set_mask/dma_set_coherent_mask
2) DDW is enabled (i.e. we don't pass disable_ddw on command-line)

dma_is_direct() checks don't have this limitation, so I think for
anything cases, such as devices that use a smaller DMA mask, we'll
end up falling back to the non-bypass functions in dma_iommu_ops, which
will likely break for things like dma_alloc_coherent/dma_map_single
since they won't use SWIOTLB pages and won't do the necessary calls to
set_memory_unencrypted() to share those non-SWIOTLB buffers with
hypervisor.

Maybe that's ok, but I think we should be clearer about how to
fail/handle these cases.

Though I also agree with some concerns Alexey stated earlier: it seems
wasteful to map the entire DDW window just so these bounce buffers can be
mapped.  Especially if you consider the lack of a mapping to be an additional
safe-guard against things like buggy device implementations on the QEMU
side. E.g. if we leaked pages to the hypervisor on accident, those pages
wouldn't be immediately accessible to a device, and would still require
additional work get past the IOMMU.

What would it look like if we try to make all this work with disable_ddw passed
to kernel command-line (or forced for is_secure_guest())?

  1) dma_iommu_{alloc,map}_bypass() would no longer get us to dma_direct_* ops,
     but an additional case or hook that considers is_secure_guest() might do
     it.
     
  2) We'd also need to set up an IOMMU mapping for the bounce buffers via
     io_tlb_start/io_tlb_end. We could do it once, on-demand via
     dma_iommu_bypass_supported() like we do for the 64-bit DDW window, or
     maybe in some init function.

That also has the benefit of not requiring devices to support 64-bit DMA.

Alternatively, we could continue to rely on the 64-bit DDW window, but
modify call to enable_ddw() to only map the io_tlb_start/end range in
the case of is_secure_guest(). This is a little cleaner implementation-wise
since we can rely on the existing dma_iommu_{alloc,map}_bypass() hooks, but
devices that don't support 64-bit will fail back to not using dma_direct_* ops
and fail miserably. We'd probably want to handle that more gracefully.

Or we handle both cases gracefully. To me it makes more sense to enable
non-DDW case, then consider adding DDW case later if there's some reason
why 64-bit DMA is needed. But would be good to hear if there are other
opinions.

> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
>  1 file changed, 1 insertion(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 67b5009..4e27d66 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -36,7 +36,6 @@
>  #include <asm/udbg.h>
>  #include <asm/mmzone.h>
>  #include <asm/plpar_wrappers.h>
> -#include <asm/svm.h>
>  #include <asm/ultravisor.h>
> 
>  #include "pseries.h"
> @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
>         of_reconfig_notifier_register(&iommu_reconfig_nb);
>         register_memory_notifier(&iommu_mem_nb);
> 
> -       /*
> -        * Secure guest memory is inacessible to devices so regular DMA isn't
> -        * possible.
> -        *
> -        * In that case keep devices' dma_map_ops as NULL so that the generic
> -        * DMA code path will use SWIOTLB to bounce buffers for DMA.
> -        */
> -       if (!is_secure_guest())
> -               set_pci_dma_ops(&dma_iommu_ops);
> +       set_pci_dma_ops(&dma_iommu_ops);
>  }
> 
>  static int __init disable_multitce(char *str)
> -- 
> 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-10 15:35         ` Ram Pai
@ 2019-12-11  8:15           ` Alexey Kardashevskiy
  2019-12-11 20:31             ` Michael Roth
  2019-12-12  4:11             ` Ram Pai
  0 siblings, 2 replies; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-11  8:15 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, cai, tglx, bauerman, linux-kernel,
	leonardo



On 11/12/2019 02:35, Ram Pai wrote:
> On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 10/12/2019 16:12, Ram Pai wrote:
>>> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>> On 07/12/2019 12:12, Ram Pai wrote:
>>>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
>>>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
>>>>> this page since it gets encrypted.  Hence share the page with the
>>>>> hypervisor, and unshare when done.
>>>>
>>>>
>>>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
>>>> pages. There is small problem that when DDW is enabled,
>>>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
>>>> about the performance on slack but this is caused by initial cleanup of
>>>> the default TCE window (which we do not use anyway) and to battle this
>>>> we can simply reduce its size by adding
>>>
>>> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
>>> 13secs per device for H_PUT_TCE approach, during boot. This is with a
>>> 30GB guest. With larger guest, the time will further detoriate.
>>
>>
>> No it will not, I checked. The time is the same for 2GB and 32GB guests-
>> the delay is caused by clearing the small DMA window which is small by
>> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
>> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
>> (depends on the system) so the number of TCEs is much smaller.
> 
> I cant get your results.  What changes did you make to get it?


Get what? I passed "-m 2G" and "-m 32G", got the same time - 13s spent
in clearing the default window and the huge window took a fraction of a
second to create and map.


>>>>
>>>> -global
>>>> spapr-pci-host-bridge.dma_win_size=0x4000000
>>>
>>> This option, speeds it up tremendously.  But than should this option be
>>> enabled in qemu by default?  only for secure VMs? for both VMs?
>>
>>
>> As discussed in slack, by default we do not need to clear the entire TCE
>> table and we only have to map swiotlb buffer using the small window. It
>> is a guest kernel change only. Thanks,
> 
> Can you tell me what code you are talking about here.  Where is the TCE
> table getting cleared? What code needs to be changed to not clear it?


pci_dma_bus_setup_pSeriesLP()
	iommu_init_table()
		iommu_table_clear()
			for () tbl->it_ops->get()

We do not really need to clear it there, we only need it for VFIO with
IOMMU SPAPR TCE v1 which reuses these tables but there are
iommu_take_ownership/iommu_release_ownership to clear these tables. I'll
send a patch for this.


> Is the code in tce_buildmulti_pSeriesLP(), the one that does the clear
> aswell?


This one does not need to clear TCEs as this creates a window of known
size and maps it all.

Well, actually, it only maps actual guest RAM, if there are gaps in RAM,
then TCEs for the gaps will have what hypervisor had there (which is
zeroes, qemu/kvm clears it anyway).


> But before I close, you have not told me clearly, what is the problem
> with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.

Between share and unshare you have a (tiny) window of opportunity to
attack the guest. No, I do not know how exactly.

For example, the hypervisor does a lot of PHB+PCI hotplug-unplug with
64bit devices - each time this will create a huge window which will
share/unshare the same page.  No, I do not know how exactly how this can
be exploited either, we cannot rely of what you or myself know today. My
point is that we should not be sharing pages at all unless we really
really have to, and this does not seem to be the case.

But since this seems to an acceptable compromise anyway,

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>





> Remember this is the same page that is earmarked for doing
> H_PUT_INDIRECT_TCE, not by my patch, but its already earmarked by the
> existing code. So it not some random buffer that is picked. Second 
> this page is temporarily shared and unshared, it does not stay shared
> for life.  It does not slow the boot. it does not need any
> special command line options on the qemu.
>> Shared pages technology was put in place, exactly for the purpose of
> sharing data with the hypervisor.  We are using this technology exactly
> for that purpose.  And finally I agreed with your concern of having
> shared pages staying around.  Hence i addressed that concern, by
> unsharing the page.  At this point, I fail to understand your concern.




-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-11  1:43     ` Michael Roth
@ 2019-12-11  8:36       ` Alexey Kardashevskiy
  2019-12-11 18:07         ` Michael Roth
  2019-12-12  6:45       ` Ram Pai
  1 sibling, 1 reply; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-11  8:36 UTC (permalink / raw)
  To: Michael Roth, Ram Pai, mpe
  Cc: linuxppc-dev, benh, david, paulus, hch, andmike, sukadev, mst,
	ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo



On 11/12/2019 12:43, Michael Roth wrote:
> Quoting Ram Pai (2019-12-06 19:12:39)
>> Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
>>                 secure guests")
>> disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
>> path for secure VMs, helped enable dma_direct path.  This enabled
>> support for bounce-buffering through SWIOTLB.  However it fails to
>> operate when IOMMU is enabled, since I/O pages are not TCE mapped.
>>
>> Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
>> cases including, TCE mapping I/O pages, in the presence of a
>> IOMMU.
> 
> Wasn't clear to me at first, but I guess the main gist of this series is
> that we want to continue to use SWIOTLB, but also need to create mappings
> of it's bounce buffers in the IOMMU, so we revert to using dma_iommu_ops
> and rely on the various dma_iommu_{map,alloc}_bypass() hooks throughout
> to call into dma_direct_* ops rather than relying on the dma_is_direct(ops)
> checks in DMA API functions to do the same.


Correct. Took me a bit of time to realize what we got here :) We only
rely on  dma_iommu_ops::.dma_supported to write the DMA offset to a
device (when creating a huge window), and after that we know it is
mapped directly and swiotlb gets this 1<<59 offset via __phys_to_dma().


> That makes sense, but one issue I see with that is that
> dma_iommu_map_bypass() only tests true if all the following are true:
> 
> 1) the device requests a 64-bit DMA mask via
>    dma_set_mask/dma_set_coherent_mask
> 2) DDW is enabled (i.e. we don't pass disable_ddw on command-line)
> 
> dma_is_direct() checks don't have this limitation, so I think for
> anything cases, such as devices that use a smaller DMA mask, we'll
> end up falling back to the non-bypass functions in dma_iommu_ops, which
> will likely break for things like dma_alloc_coherent/dma_map_single
> since they won't use SWIOTLB pages and won't do the necessary calls to
> set_memory_unencrypted() to share those non-SWIOTLB buffers with
> hypervisor.
> 
> Maybe that's ok, but I think we should be clearer about how to
> fail/handle these cases.
> 
> Though I also agree with some concerns Alexey stated earlier: it seems
> wasteful to map the entire DDW window just so these bounce buffers can be
> mapped.  Especially if you consider the lack of a mapping to be an additional
> safe-guard against things like buggy device implementations on the QEMU
> side. E.g. if we leaked pages to the hypervisor on accident, those pages
> wouldn't be immediately accessible to a device, and would still require
> additional work get past the IOMMU.
> 
> What would it look like if we try to make all this work with disable_ddw passed
> to kernel command-line (or forced for is_secure_guest())?
> 
>   1) dma_iommu_{alloc,map}_bypass() would no longer get us to dma_direct_* ops,
>      but an additional case or hook that considers is_secure_guest() might do
>      it.
>      
>   2) We'd also need to set up an IOMMU mapping for the bounce buffers via
>      io_tlb_start/io_tlb_end. We could do it once, on-demand via
>      dma_iommu_bypass_supported() like we do for the 64-bit DDW window, or
>      maybe in some init function.


io_tlb_start/io_tlb_end are only guaranteed to stay within 4GB and our
default DMA window is 1GB (KVM) or 2GB (PowerVM), ok, we can define
ARCH_LOW_ADDRESS_LIMIT as 1GB.

But it has also been mentioned that we are likely to be having swiotlb
buffers outside of the first 4GB as they are not just for crippled
devices any more. So we are likely to have 64bit window, I'd just ditch
the default window then, I have patches for this but every time I
thought I have a use case, turned out that I did not.


> That also has the benefit of not requiring devices to support 64-bit DMA.
> 
> Alternatively, we could continue to rely on the 64-bit DDW window, but
> modify call to enable_ddw() to only map the io_tlb_start/end range in
> the case of is_secure_guest(). This is a little cleaner implementation-wise
> since we can rely on the existing dma_iommu_{alloc,map}_bypass() hooks, but
> devices that don't support 64-bit will fail back to not using dma_direct_* ops
> and fail miserably. We'd probably want to handle that more gracefully.
> 
> Or we handle both cases gracefully. To me it makes more sense to enable
> non-DDW case, then consider adding DDW case later if there's some reason
> why 64-bit DMA is needed. But would be good to hear if there are other
> opinions.


For now we need to do something with the H_PUT_TCE_INDIRECT's page -
either disable multitce (but boot time increases) or share the page. The
patch does the latter. Thanks,


> 
>>
>> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
>>  1 file changed, 1 insertion(+), 10 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 67b5009..4e27d66 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -36,7 +36,6 @@
>>  #include <asm/udbg.h>
>>  #include <asm/mmzone.h>
>>  #include <asm/plpar_wrappers.h>
>> -#include <asm/svm.h>
>>  #include <asm/ultravisor.h>
>>
>>  #include "pseries.h"
>> @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
>>         of_reconfig_notifier_register(&iommu_reconfig_nb);
>>         register_memory_notifier(&iommu_mem_nb);
>>
>> -       /*
>> -        * Secure guest memory is inacessible to devices so regular DMA isn't
>> -        * possible.
>> -        *
>> -        * In that case keep devices' dma_map_ops as NULL so that the generic
>> -        * DMA code path will use SWIOTLB to bounce buffers for DMA.
>> -        */
>> -       if (!is_secure_guest())
>> -               set_pci_dma_ops(&dma_iommu_ops);
>> +       set_pci_dma_ops(&dma_iommu_ops);
>>  }
>>
>>  static int __init disable_multitce(char *str)
>> -- 
>> 1.8.3.1
>>

-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-11  8:36       ` Alexey Kardashevskiy
@ 2019-12-11 18:07         ` Michael Roth
  2019-12-11 18:20           ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Roth @ 2019-12-11 18:07 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Ram Pai, mpe
  Cc: linuxppc-dev, benh, david, paulus, hch, andmike, sukadev, mst,
	ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo

Quoting Alexey Kardashevskiy (2019-12-11 02:36:29)
> 
> 
> On 11/12/2019 12:43, Michael Roth wrote:
> > Quoting Ram Pai (2019-12-06 19:12:39)
> >> Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
> >>                 secure guests")
> >> disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> >> path for secure VMs, helped enable dma_direct path.  This enabled
> >> support for bounce-buffering through SWIOTLB.  However it fails to
> >> operate when IOMMU is enabled, since I/O pages are not TCE mapped.
> >>
> >> Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> >> cases including, TCE mapping I/O pages, in the presence of a
> >> IOMMU.
> > 
> > Wasn't clear to me at first, but I guess the main gist of this series is
> > that we want to continue to use SWIOTLB, but also need to create mappings
> > of it's bounce buffers in the IOMMU, so we revert to using dma_iommu_ops
> > and rely on the various dma_iommu_{map,alloc}_bypass() hooks throughout
> > to call into dma_direct_* ops rather than relying on the dma_is_direct(ops)
> > checks in DMA API functions to do the same.
> 
> 
> Correct. Took me a bit of time to realize what we got here :) We only
> rely on  dma_iommu_ops::.dma_supported to write the DMA offset to a
> device (when creating a huge window), and after that we know it is
> mapped directly and swiotlb gets this 1<<59 offset via __phys_to_dma().
> 
> 
> > That makes sense, but one issue I see with that is that
> > dma_iommu_map_bypass() only tests true if all the following are true:
> > 
> > 1) the device requests a 64-bit DMA mask via
> >    dma_set_mask/dma_set_coherent_mask
> > 2) DDW is enabled (i.e. we don't pass disable_ddw on command-line)
> > 
> > dma_is_direct() checks don't have this limitation, so I think for
> > anything cases, such as devices that use a smaller DMA mask, we'll
> > end up falling back to the non-bypass functions in dma_iommu_ops, which
> > will likely break for things like dma_alloc_coherent/dma_map_single
> > since they won't use SWIOTLB pages and won't do the necessary calls to
> > set_memory_unencrypted() to share those non-SWIOTLB buffers with
> > hypervisor.
> > 
> > Maybe that's ok, but I think we should be clearer about how to
> > fail/handle these cases.
> > 
> > Though I also agree with some concerns Alexey stated earlier: it seems
> > wasteful to map the entire DDW window just so these bounce buffers can be
> > mapped.  Especially if you consider the lack of a mapping to be an additional
> > safe-guard against things like buggy device implementations on the QEMU
> > side. E.g. if we leaked pages to the hypervisor on accident, those pages
> > wouldn't be immediately accessible to a device, and would still require
> > additional work get past the IOMMU.
> > 
> > What would it look like if we try to make all this work with disable_ddw passed
> > to kernel command-line (or forced for is_secure_guest())?
> > 
> >   1) dma_iommu_{alloc,map}_bypass() would no longer get us to dma_direct_* ops,
> >      but an additional case or hook that considers is_secure_guest() might do
> >      it.
> >      
> >   2) We'd also need to set up an IOMMU mapping for the bounce buffers via
> >      io_tlb_start/io_tlb_end. We could do it once, on-demand via
> >      dma_iommu_bypass_supported() like we do for the 64-bit DDW window, or
> >      maybe in some init function.
> 
> 
> io_tlb_start/io_tlb_end are only guaranteed to stay within 4GB and our
> default DMA window is 1GB (KVM) or 2GB (PowerVM), ok, we can define
> ARCH_LOW_ADDRESS_LIMIT as 1GB.

True, and limiting allocations to under 1GB might be brittle (also saw a
patching floating around that increased IO_TLB_DEFAULT_SIZE size to 1GB,
which obviously wouldn't work out with this approach, but not sure if
that's still needed or not: "powerpc/svm: Increase SWIOTLB buffer size")

However that's only an issue if we insist on using an identity mapping
in the IOMMU, which would be nice because non-IOMMU virtio would
magically work, but since that's not a goal of this series I think we do
have the option of mapping io_tlb_start at DMA address 0 (or
thereabouts).

We'd probably need to modify __phys_to_dma to treat archdata.dma_offset
as a negative offset in this case, but it seems like it would work about
the same as with DDW offset.

But yah, it does make things a bit less appealing than what I was initially
thinking with that approach...

> 
> But it has also been mentioned that we are likely to be having swiotlb
> buffers outside of the first 4GB as they are not just for crippled
> devices any more. So we are likely to have 64bit window, I'd just ditch
> the default window then, I have patches for this but every time I
> thought I have a use case, turned out that I did not.

Not sure I've seen this discussion, maybe it was on slack? By crippled
devices do you mean virtio with IOMMU off? Isn't swiotlb buffer limited
to under ARCH_LOW_ADDRESS_LIMIT in any case? Just trying to understand
what changes we're anticipating there.

> 
> 
> > That also has the benefit of not requiring devices to support 64-bit DMA.
> > 
> > Alternatively, we could continue to rely on the 64-bit DDW window, but
> > modify call to enable_ddw() to only map the io_tlb_start/end range in
> > the case of is_secure_guest(). This is a little cleaner implementation-wise
> > since we can rely on the existing dma_iommu_{alloc,map}_bypass() hooks, but
> > devices that don't support 64-bit will fail back to not using dma_direct_* ops
> > and fail miserably. We'd probably want to handle that more gracefully.
> > 
> > Or we handle both cases gracefully. To me it makes more sense to enable
> > non-DDW case, then consider adding DDW case later if there's some reason
> > why 64-bit DMA is needed. But would be good to hear if there are other
> > opinions.
> 
> 
> For now we need to do something with the H_PUT_TCE_INDIRECT's page -
> either disable multitce (but boot time increases) or share the page. The
> patch does the latter. Thanks,

I was sort of hoping the option of only mapping the bounce buffer (or
avoiding DDW completely) would help here, but looks like the issue has
more to do with clearing the default TCE table. Fair enough.

Reverting to dma_iommu_ops does re-introduce some new failure paths for
non 64-bit devices though, so I think it would be good to address that
as part of this series. I think it would be sufficient to have
dma_set_mask/dma_set_coherent_mask/dma_set_mask_and_coherent fail for
non 64-bit masks. I think something like the following in
dma_iommu_dma_supported() might do it:

  /*
   * Secure guests currently rely on 64-bit DMA window, which is only
   * utilized for devices that support 64-bit DMA masks
   */
  if (is_secure_guest() && mask < DMA_BIT_MASK(64)) {
    dev_err(dev, "Warning: 64-bit DMA required when PEF enabled: mask 0x%08llx", mask);
    return 0;
  }
    
That should let most drivers fail gracefully and make it clear what devices
are broken (rather than silently misbehaving). It also makes things a bit
clearer WRT what we expect to work with this applied. 

> 
> 
> > 
> >>
> >> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> >> ---
> >>  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
> >>  1 file changed, 1 insertion(+), 10 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> >> index 67b5009..4e27d66 100644
> >> --- a/arch/powerpc/platforms/pseries/iommu.c
> >> +++ b/arch/powerpc/platforms/pseries/iommu.c
> >> @@ -36,7 +36,6 @@
> >>  #include <asm/udbg.h>
> >>  #include <asm/mmzone.h>
> >>  #include <asm/plpar_wrappers.h>
> >> -#include <asm/svm.h>
> >>  #include <asm/ultravisor.h>
> >>
> >>  #include "pseries.h"
> >> @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
> >>         of_reconfig_notifier_register(&iommu_reconfig_nb);
> >>         register_memory_notifier(&iommu_mem_nb);
> >>
> >> -       /*
> >> -        * Secure guest memory is inacessible to devices so regular DMA isn't
> >> -        * possible.
> >> -        *
> >> -        * In that case keep devices' dma_map_ops as NULL so that the generic
> >> -        * DMA code path will use SWIOTLB to bounce buffers for DMA.
> >> -        */
> >> -       if (!is_secure_guest())
> >> -               set_pci_dma_ops(&dma_iommu_ops);
> >> +       set_pci_dma_ops(&dma_iommu_ops);
> >>  }
> >>
> >>  static int __init disable_multitce(char *str)
> >> -- 
> >> 1.8.3.1
> >>
> 
> -- 
> Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-11 18:07         ` Michael Roth
@ 2019-12-11 18:20           ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2019-12-11 18:20 UTC (permalink / raw)
  To: Michael Roth
  Cc: Alexey Kardashevskiy, Ram Pai, mpe, linuxppc-dev, benh, david,
	paulus, hch, andmike, sukadev, mst, ram.n.pai, cai, tglx,
	bauerman, linux-kernel, leonardo

On Wed, Dec 11, 2019 at 12:07:17PM -0600, Michael Roth wrote:
> > io_tlb_start/io_tlb_end are only guaranteed to stay within 4GB and our
> > default DMA window is 1GB (KVM) or 2GB (PowerVM), ok, we can define
> > ARCH_LOW_ADDRESS_LIMIT as 1GB.
> 
> True, and limiting allocations to under 1GB might be brittle (also saw a
> patching floating around that increased IO_TLB_DEFAULT_SIZE size to 1GB,
> which obviously wouldn't work out with this approach, but not sure if
> that's still needed or not: "powerpc/svm: Increase SWIOTLB buffer size")

FYI, there is a patch out there that allocates the powerpc swiotlb
from the boottom of the memblock area instead of the top to fix a 85xx
regression.

Also the AMD folks have been asking about non-GFP_DMA32 swiotlb pools
as they have the same bounce buffer issue with SEV.  I think it is
entirely doable to have multiple swiotlb pool, I just need a volunteer
to implement that.

> 
> However that's only an issue if we insist on using an identity mapping
> in the IOMMU, which would be nice because non-IOMMU virtio would
> magically work, but since that's not a goal of this series I think we do
> have the option of mapping io_tlb_start at DMA address 0 (or
> thereabouts).
> 
> We'd probably need to modify __phys_to_dma to treat archdata.dma_offset
> as a negative offset in this case, but it seems like it would work about
> the same as with DDW offset.

Or switch to the generic version of __phys_to_dma that has a negative
offset.  We'd just need to look into a signed value for dma_pfn_offset
to allow for the existing platforms that need the current positive
offset.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-11  8:15           ` Alexey Kardashevskiy
@ 2019-12-11 20:31             ` Michael Roth
  2019-12-11 22:47               ` Alexey Kardashevskiy
  2019-12-12  4:11             ` Ram Pai
  1 sibling, 1 reply; 21+ messages in thread
From: Michael Roth @ 2019-12-11 20:31 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo

Quoting Alexey Kardashevskiy (2019-12-11 02:15:44)
> 
> 
> On 11/12/2019 02:35, Ram Pai wrote:
> > On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 10/12/2019 16:12, Ram Pai wrote:
> >>> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
> >>>>
> >>>>
> >>>> On 07/12/2019 12:12, Ram Pai wrote:
> >>>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
> >>>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
> >>>>> this page since it gets encrypted.  Hence share the page with the
> >>>>> hypervisor, and unshare when done.
> >>>>
> >>>>
> >>>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
> >>>> pages. There is small problem that when DDW is enabled,
> >>>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
> >>>> about the performance on slack but this is caused by initial cleanup of
> >>>> the default TCE window (which we do not use anyway) and to battle this
> >>>> we can simply reduce its size by adding
> >>>
> >>> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
> >>> 13secs per device for H_PUT_TCE approach, during boot. This is with a
> >>> 30GB guest. With larger guest, the time will further detoriate.
> >>
> >>
> >> No it will not, I checked. The time is the same for 2GB and 32GB guests-
> >> the delay is caused by clearing the small DMA window which is small by
> >> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
> >> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
> >> (depends on the system) so the number of TCEs is much smaller.
> > 
> > I cant get your results.  What changes did you make to get it?
> 
> 
> Get what? I passed "-m 2G" and "-m 32G", got the same time - 13s spent
> in clearing the default window and the huge window took a fraction of a
> second to create and map.

Is this if we disable FW_FEATURE_MULTITCE in the guest and force the use
of H_PUT_TCE everywhere?

In theory couldn't we leave FW_FEATURE_MULTITCE in place so that
iommu_table_clear() can still use H_STUFF_TCE (which I guess is basically
instant), and then force H_PUT_TCE for new mappings via something like:

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 6ba081dd61c9..85d092baf17d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -194,6 +194,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
        unsigned long flags;
 
        if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
+       if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE) || is_secure_guest()) {
                return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
                                           direction, attrs);
        }

That seems like it would avoid the extra 13s.

If we take the additional step of only mapping SWIOTLB range in
enable_ddw() for is_secure_guest() that might further improve things
(though the bigger motivation with that is the extra isolation it would
grant us for stuff behind the IOMMU, since it apparently doesn't affect
boot-time all that much)

> 
> 
> >>>>
> >>>> -global
> >>>> spapr-pci-host-bridge.dma_win_size=0x4000000
> >>>
> >>> This option, speeds it up tremendously.  But than should this option be
> >>> enabled in qemu by default?  only for secure VMs? for both VMs?
> >>
> >>
> >> As discussed in slack, by default we do not need to clear the entire TCE
> >> table and we only have to map swiotlb buffer using the small window. It
> >> is a guest kernel change only. Thanks,
> > 
> > Can you tell me what code you are talking about here.  Where is the TCE
> > table getting cleared? What code needs to be changed to not clear it?
> 
> 
> pci_dma_bus_setup_pSeriesLP()
>         iommu_init_table()
>                 iommu_table_clear()
>                         for () tbl->it_ops->get()
> 
> We do not really need to clear it there, we only need it for VFIO with
> IOMMU SPAPR TCE v1 which reuses these tables but there are
> iommu_take_ownership/iommu_release_ownership to clear these tables. I'll
> send a patch for this.


> 
> 
> > Is the code in tce_buildmulti_pSeriesLP(), the one that does the clear
> > aswell?
> 
> 
> This one does not need to clear TCEs as this creates a window of known
> size and maps it all.
> 
> Well, actually, it only maps actual guest RAM, if there are gaps in RAM,
> then TCEs for the gaps will have what hypervisor had there (which is
> zeroes, qemu/kvm clears it anyway).
> 
> 
> > But before I close, you have not told me clearly, what is the problem
> > with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.
> 
> Between share and unshare you have a (tiny) window of opportunity to
> attack the guest. No, I do not know how exactly.
> 
> For example, the hypervisor does a lot of PHB+PCI hotplug-unplug with
> 64bit devices - each time this will create a huge window which will
> share/unshare the same page.  No, I do not know how exactly how this can
> be exploited either, we cannot rely of what you or myself know today. My
> point is that we should not be sharing pages at all unless we really
> really have to, and this does not seem to be the case.
> 
> But since this seems to an acceptable compromise anyway,
> 
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> 
> 
> 
> 
> > Remember this is the same page that is earmarked for doing
> > H_PUT_INDIRECT_TCE, not by my patch, but its already earmarked by the
> > existing code. So it not some random buffer that is picked. Second 
> > this page is temporarily shared and unshared, it does not stay shared
> > for life.  It does not slow the boot. it does not need any
> > special command line options on the qemu.
> >> Shared pages technology was put in place, exactly for the purpose of
> > sharing data with the hypervisor.  We are using this technology exactly
> > for that purpose.  And finally I agreed with your concern of having
> > shared pages staying around.  Hence i addressed that concern, by
> > unsharing the page.  At this point, I fail to understand your concern.
> 
> 
> 
> 
> -- 
> Alexey

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-11 20:31             ` Michael Roth
@ 2019-12-11 22:47               ` Alexey Kardashevskiy
  2019-12-12  2:39                 ` Alexey Kardashevskiy
  2019-12-13  0:22                 ` Michael Roth
  0 siblings, 2 replies; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-11 22:47 UTC (permalink / raw)
  To: Michael Roth, Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo



On 12/12/2019 07:31, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2019-12-11 02:15:44)
>>
>>
>> On 11/12/2019 02:35, Ram Pai wrote:
>>> On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>> On 10/12/2019 16:12, Ram Pai wrote:
>>>>> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
>>>>>>
>>>>>>
>>>>>> On 07/12/2019 12:12, Ram Pai wrote:
>>>>>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
>>>>>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
>>>>>>> this page since it gets encrypted.  Hence share the page with the
>>>>>>> hypervisor, and unshare when done.
>>>>>>
>>>>>>
>>>>>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
>>>>>> pages. There is small problem that when DDW is enabled,
>>>>>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
>>>>>> about the performance on slack but this is caused by initial cleanup of
>>>>>> the default TCE window (which we do not use anyway) and to battle this
>>>>>> we can simply reduce its size by adding
>>>>>
>>>>> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
>>>>> 13secs per device for H_PUT_TCE approach, during boot. This is with a
>>>>> 30GB guest. With larger guest, the time will further detoriate.
>>>>
>>>>
>>>> No it will not, I checked. The time is the same for 2GB and 32GB guests-
>>>> the delay is caused by clearing the small DMA window which is small by
>>>> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
>>>> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
>>>> (depends on the system) so the number of TCEs is much smaller.
>>>
>>> I cant get your results.  What changes did you make to get it?
>>
>>
>> Get what? I passed "-m 2G" and "-m 32G", got the same time - 13s spent
>> in clearing the default window and the huge window took a fraction of a
>> second to create and map.
> 
> Is this if we disable FW_FEATURE_MULTITCE in the guest and force the use
> of H_PUT_TCE everywhere?


Yes. Well, for the DDW case FW_FEATURE_MULTITCE is ignored but even when
fixed (I have it in my local branch), this does not make a difference.


> 
> In theory couldn't we leave FW_FEATURE_MULTITCE in place so that
> iommu_table_clear() can still use H_STUFF_TCE (which I guess is basically
> instant),

PAPR/LoPAPR "conveniently" do not describe what hcall-multi-tce does
exactly. But I am pretty sure the idea is that either both H_STUFF_TCE
and H_PUT_TCE_INDIRECT are present or neither.


> and then force H_PUT_TCE for new mappings via something like:
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 6ba081dd61c9..85d092baf17d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -194,6 +194,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>         unsigned long flags;
>  
>         if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
> +       if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE) || is_secure_guest()) {


Nobody (including myself) seems to like the idea of having
is_secure_guest() all over the place.

And with KVM acceleration enabled, it is pretty fast anyway. Just now we
do not have H_PUT_TCE in KVM/UV for secure guests but we will have to
fix this for secure PCI passhtrough anyway.


>                 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
>                                            direction, attrs);
>         }
> 
> That seems like it would avoid the extra 13s.

Or move around iommu_table_clear() which imho is just the right thing to do.


> If we take the additional step of only mapping SWIOTLB range in
> enable_ddw() for is_secure_guest() that might further improve things
> (though the bigger motivation with that is the extra isolation it would
> grant us for stuff behind the IOMMU, since it apparently doesn't affect
> boot-time all that much)


Sure, we just need to confirm how many of these swiotlb banks we are
going to have (just one or many and at what location). Thanks,



> 
>>
>>
>>>>>>
>>>>>> -global
>>>>>> spapr-pci-host-bridge.dma_win_size=0x4000000
>>>>>
>>>>> This option, speeds it up tremendously.  But than should this option be
>>>>> enabled in qemu by default?  only for secure VMs? for both VMs?
>>>>
>>>>
>>>> As discussed in slack, by default we do not need to clear the entire TCE
>>>> table and we only have to map swiotlb buffer using the small window. It
>>>> is a guest kernel change only. Thanks,
>>>
>>> Can you tell me what code you are talking about here.  Where is the TCE
>>> table getting cleared? What code needs to be changed to not clear it?
>>
>>
>> pci_dma_bus_setup_pSeriesLP()
>>         iommu_init_table()
>>                 iommu_table_clear()
>>                         for () tbl->it_ops->get()
>>
>> We do not really need to clear it there, we only need it for VFIO with
>> IOMMU SPAPR TCE v1 which reuses these tables but there are
>> iommu_take_ownership/iommu_release_ownership to clear these tables. I'll
>> send a patch for this.
> 
> 
>>
>>
>>> Is the code in tce_buildmulti_pSeriesLP(), the one that does the clear
>>> aswell?
>>
>>
>> This one does not need to clear TCEs as this creates a window of known
>> size and maps it all.
>>
>> Well, actually, it only maps actual guest RAM, if there are gaps in RAM,
>> then TCEs for the gaps will have what hypervisor had there (which is
>> zeroes, qemu/kvm clears it anyway).
>>
>>
>>> But before I close, you have not told me clearly, what is the problem
>>> with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.
>>
>> Between share and unshare you have a (tiny) window of opportunity to
>> attack the guest. No, I do not know how exactly.
>>
>> For example, the hypervisor does a lot of PHB+PCI hotplug-unplug with
>> 64bit devices - each time this will create a huge window which will
>> share/unshare the same page.  No, I do not know how exactly how this can
>> be exploited either, we cannot rely of what you or myself know today. My
>> point is that we should not be sharing pages at all unless we really
>> really have to, and this does not seem to be the case.
>>
>> But since this seems to an acceptable compromise anyway,
>>
>> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>
>>
>>
>>
>>
>>> Remember this is the same page that is earmarked for doing
>>> H_PUT_INDIRECT_TCE, not by my patch, but its already earmarked by the
>>> existing code. So it not some random buffer that is picked. Second 
>>> this page is temporarily shared and unshared, it does not stay shared
>>> for life.  It does not slow the boot. it does not need any
>>> special command line options on the qemu.
>>>> Shared pages technology was put in place, exactly for the purpose of
>>> sharing data with the hypervisor.  We are using this technology exactly
>>> for that purpose.  And finally I agreed with your concern of having
>>> shared pages staying around.  Hence i addressed that concern, by
>>> unsharing the page.  At this point, I fail to understand your concern.
>>
>>
>>
>>
>> -- 
>> Alexey

-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-11 22:47               ` Alexey Kardashevskiy
@ 2019-12-12  2:39                 ` Alexey Kardashevskiy
  2019-12-13  0:22                 ` Michael Roth
  1 sibling, 0 replies; 21+ messages in thread
From: Alexey Kardashevskiy @ 2019-12-12  2:39 UTC (permalink / raw)
  To: Michael Roth, Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo



On 12/12/2019 09:47, Alexey Kardashevskiy wrote:
> 
> 
> On 12/12/2019 07:31, Michael Roth wrote:
>> Quoting Alexey Kardashevskiy (2019-12-11 02:15:44)
>>>
>>>
>>> On 11/12/2019 02:35, Ram Pai wrote:
>>>> On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
>>>>>
>>>>>
>>>>> On 10/12/2019 16:12, Ram Pai wrote:
>>>>>> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 07/12/2019 12:12, Ram Pai wrote:
>>>>>>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
>>>>>>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
>>>>>>>> this page since it gets encrypted.  Hence share the page with the
>>>>>>>> hypervisor, and unshare when done.
>>>>>>>
>>>>>>>
>>>>>>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
>>>>>>> pages. There is small problem that when DDW is enabled,
>>>>>>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
>>>>>>> about the performance on slack but this is caused by initial cleanup of
>>>>>>> the default TCE window (which we do not use anyway) and to battle this
>>>>>>> we can simply reduce its size by adding
>>>>>>
>>>>>> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
>>>>>> 13secs per device for H_PUT_TCE approach, during boot. This is with a
>>>>>> 30GB guest. With larger guest, the time will further detoriate.
>>>>>
>>>>>
>>>>> No it will not, I checked. The time is the same for 2GB and 32GB guests-
>>>>> the delay is caused by clearing the small DMA window which is small by
>>>>> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
>>>>> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
>>>>> (depends on the system) so the number of TCEs is much smaller.
>>>>
>>>> I cant get your results.  What changes did you make to get it?
>>>
>>>
>>> Get what? I passed "-m 2G" and "-m 32G", got the same time - 13s spent
>>> in clearing the default window and the huge window took a fraction of a
>>> second to create and map.
>>
>> Is this if we disable FW_FEATURE_MULTITCE in the guest and force the use
>> of H_PUT_TCE everywhere?
> 
> 
> Yes. Well, for the DDW case FW_FEATURE_MULTITCE is ignored but even when
> fixed (I have it in my local branch), this does not make a difference.
> 
> 
>>
>> In theory couldn't we leave FW_FEATURE_MULTITCE in place so that
>> iommu_table_clear() can still use H_STUFF_TCE (which I guess is basically
>> instant),
> 
> PAPR/LoPAPR "conveniently" do not describe what hcall-multi-tce does
> exactly. But I am pretty sure the idea is that either both H_STUFF_TCE
> and H_PUT_TCE_INDIRECT are present or neither.
> 
> 
>> and then force H_PUT_TCE for new mappings via something like:
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 6ba081dd61c9..85d092baf17d 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -194,6 +194,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
>>         unsigned long flags;
>>  
>>         if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
>> +       if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE) || is_secure_guest()) {
> 
> 
> Nobody (including myself) seems to like the idea of having
> is_secure_guest() all over the place.
> 
> And with KVM acceleration enabled, it is pretty fast anyway. Just now we
> do not have H_PUT_TCE in KVM/UV for secure guests but we will have to
> fix this for secure PCI passhtrough anyway.
> 
> 
>>                 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
>>                                            direction, attrs);
>>         }
>>
>> That seems like it would avoid the extra 13s.
> 
> Or move around iommu_table_clear() which imho is just the right thing to do.


Huh. It is not the right thing as the firmware could have left mappings
there so we need cleanup. Even if I fixed SLOF, there is POWERVM which I
do not know what it does about TCEs. Thanks,



-- 
Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-11  8:15           ` Alexey Kardashevskiy
  2019-12-11 20:31             ` Michael Roth
@ 2019-12-12  4:11             ` Ram Pai
  1 sibling, 0 replies; 21+ messages in thread
From: Ram Pai @ 2019-12-12  4:11 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: mpe, linuxppc-dev, benh, david, paulus, mdroth, hch, andmike,
	sukadev, mst, ram.n.pai, cai, tglx, bauerman, linux-kernel,
	leonardo

On Wed, Dec 11, 2019 at 07:15:44PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 11/12/2019 02:35, Ram Pai wrote:
> > On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
> >>
..snip..
> >> As discussed in slack, by default we do not need to clear the entire TCE
> >> table and we only have to map swiotlb buffer using the small window. It
> >> is a guest kernel change only. Thanks,
> > 
> > Can you tell me what code you are talking about here.  Where is the TCE
> > table getting cleared? What code needs to be changed to not clear it?
> 
> 
> pci_dma_bus_setup_pSeriesLP()
> 	iommu_init_table()
> 		iommu_table_clear()
> 			for () tbl->it_ops->get()
> 
> We do not really need to clear it there, we only need it for VFIO with
> IOMMU SPAPR TCE v1 which reuses these tables but there are
> iommu_take_ownership/iommu_release_ownership to clear these tables. I'll
> send a patch for this.

Did some experiments. It spent the first 9s in tce_free_pSeriesLP()
clearing the tce entries.  And the second 13s in 
tce_setrange_multi_pSeriesLP_walk().  BTW: the code in
tce_setrange_multi_pSeriesLP_walk() is modified to use DIRECT_TCE.

So it looks like the amount of time spent in
tce_setrange_multi_pSeriesLP_walk() is a function of the size of the
memory that is mapped in the ddw.


> 
..snip..
> 
> > But before I close, you have not told me clearly, what is the problem
> > with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.
> 
> Between share and unshare you have a (tiny) window of opportunity to
> attack the guest. No, I do not know how exactly.
> 
> For example, the hypervisor does a lot of PHB+PCI hotplug-unplug with
> 64bit devices - each time this will create a huge window which will
> share/unshare the same page.  No, I do not know how exactly how this can
> be exploited either, we cannot rely of what you or myself know today. My
> point is that we should not be sharing pages at all unless we really
> really have to, and this does not seem to be the case.
> 
> But since this seems to an acceptable compromise anyway,
> 
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 

Thanks!
RP


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-11  1:43     ` Michael Roth
  2019-12-11  8:36       ` Alexey Kardashevskiy
@ 2019-12-12  6:45       ` Ram Pai
  2019-12-13  0:19         ` Michael Roth
  1 sibling, 1 reply; 21+ messages in thread
From: Ram Pai @ 2019-12-12  6:45 UTC (permalink / raw)
  To: Michael Roth
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, aik, cai, tglx, bauerman, linux-kernel, leonardo

On Tue, Dec 10, 2019 at 07:43:24PM -0600, Michael Roth wrote:
> Quoting Ram Pai (2019-12-06 19:12:39)
> > Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
> >                 secure guests")
> > disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> > path for secure VMs, helped enable dma_direct path.  This enabled
> > support for bounce-buffering through SWIOTLB.  However it fails to
> > operate when IOMMU is enabled, since I/O pages are not TCE mapped.
> > 
> > Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> > cases including, TCE mapping I/O pages, in the presence of a
> > IOMMU.
> 
> Wasn't clear to me at first, but I guess the main gist of this series is
> that we want to continue to use SWIOTLB, but also need to create mappings
> of it's bounce buffers in the IOMMU, so we revert to using dma_iommu_ops
> and rely on the various dma_iommu_{map,alloc}_bypass() hooks throughout
> to call into dma_direct_* ops rather than relying on the dma_is_direct(ops)
> checks in DMA API functions to do the same.
> 
> That makes sense, but one issue I see with that is that
> dma_iommu_map_bypass() only tests true if all the following are true:
> 
> 1) the device requests a 64-bit DMA mask via
>    dma_set_mask/dma_set_coherent_mask
> 2) DDW is enabled (i.e. we don't pass disable_ddw on command-line)
> 
> dma_is_direct() checks don't have this limitation, so I think for
> anything cases, such as devices that use a smaller DMA mask, we'll
> end up falling back to the non-bypass functions in dma_iommu_ops, which
> will likely break for things like dma_alloc_coherent/dma_map_single
> since they won't use SWIOTLB pages and won't do the necessary calls to
> set_memory_unencrypted() to share those non-SWIOTLB buffers with
> hypervisor.
> 
> Maybe that's ok, but I think we should be clearer about how to
> fail/handle these cases.

Yes. makes sense. Device that cannot handle 64bit dma mask will not work.

> 
> Though I also agree with some concerns Alexey stated earlier: it seems
> wasteful to map the entire DDW window just so these bounce buffers can be
> mapped.  Especially if you consider the lack of a mapping to be an additional
> safe-guard against things like buggy device implementations on the QEMU
> side. E.g. if we leaked pages to the hypervisor on accident, those pages
> wouldn't be immediately accessible to a device, and would still require
> additional work get past the IOMMU.

Well, an accidental unintented page leak to the hypervisor, is a very
bad thing, regardless of any DMA mapping. The device may not be able to
access it, but the hypervisor still can access it.

> 
> What would it look like if we try to make all this work with disable_ddw passed
> to kernel command-line (or forced for is_secure_guest())?
> 
>   1) dma_iommu_{alloc,map}_bypass() would no longer get us to dma_direct_* ops,
>      but an additional case or hook that considers is_secure_guest() might do
>      it.
>      
>   2) We'd also need to set up an IOMMU mapping for the bounce buffers via
>      io_tlb_start/io_tlb_end. We could do it once, on-demand via
>      dma_iommu_bypass_supported() like we do for the 64-bit DDW window, or
>      maybe in some init function.

Hmm... i not sure how to accomplish (2).   we need use some DDW window
to setup the mappings. right?  If disable_ddw is set, there wont be any
ddw.  What am i missing?

> 
> That also has the benefit of not requiring devices to support 64-bit DMA.
> 
> Alternatively, we could continue to rely on the 64-bit DDW window, but
> modify call to enable_ddw() to only map the io_tlb_start/end range in
> the case of is_secure_guest(). This is a little cleaner implementation-wise
> since we can rely on the existing dma_iommu_{alloc,map}_bypass() hooks.

I have been experimenting with this.  Trying to map only the memory
range from io_tlb_start/io_tlb_end though the 64-bit ddw window.  But
due to some reason, it wants the io_tlb_start to be aligned to some
boundary. It looks like a 2^28 boundary. Not sure what dictates that
boundary.
   

> , but
> devices that don't support 64-bit will fail back to not using dma_direct_* ops
> and fail miserably. We'd probably want to handle that more gracefully.

Yes i will put a warning message to indicate the failure.

> 
> Or we handle both cases gracefully. To me it makes more sense to enable
> non-DDW case, then consider adding DDW case later if there's some reason
> why 64-bit DMA is needed. But would be good to hear if there are other
> opinions.

educate me a bit here. What is a non-DDW case?  is it possible for a
device to acccess memory, in the presence of a IOMMU, without a window-mapping?

> 
> > 
> > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
> >  1 file changed, 1 insertion(+), 10 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 67b5009..4e27d66 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -36,7 +36,6 @@
> >  #include <asm/udbg.h>
> >  #include <asm/mmzone.h>
> >  #include <asm/plpar_wrappers.h>
> > -#include <asm/svm.h>
> >  #include <asm/ultravisor.h>
> > 
> >  #include "pseries.h"
> > @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
> >         of_reconfig_notifier_register(&iommu_reconfig_nb);
> >         register_memory_notifier(&iommu_mem_nb);
> > 
> > -       /*
> > -        * Secure guest memory is inacessible to devices so regular DMA isn't
> > -        * possible.
> > -        *
> > -        * In that case keep devices' dma_map_ops as NULL so that the generic
> > -        * DMA code path will use SWIOTLB to bounce buffers for DMA.
> > -        */
> > -       if (!is_secure_guest())
> > -               set_pci_dma_ops(&dma_iommu_ops);
> > +       set_pci_dma_ops(&dma_iommu_ops);
> >  }
> > 
> >  static int __init disable_multitce(char *str)
> > -- 
> > 1.8.3.1
> > 

-- 
Ram Pai


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM.
  2019-12-12  6:45       ` Ram Pai
@ 2019-12-13  0:19         ` Michael Roth
  0 siblings, 0 replies; 21+ messages in thread
From: Michael Roth @ 2019-12-13  0:19 UTC (permalink / raw)
  To: Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, aik, cai, tglx, bauerman, linux-kernel, leonardo

Quoting Ram Pai (2019-12-12 00:45:02)
> On Tue, Dec 10, 2019 at 07:43:24PM -0600, Michael Roth wrote:
> > Quoting Ram Pai (2019-12-06 19:12:39)
> > > Commit edea902c1c1e ("powerpc/pseries/iommu: Don't use dma_iommu_ops on
> > >                 secure guests")
> > > disabled dma_iommu_ops path, for secure VMs. Disabling dma_iommu_ops
> > > path for secure VMs, helped enable dma_direct path.  This enabled
> > > support for bounce-buffering through SWIOTLB.  However it fails to
> > > operate when IOMMU is enabled, since I/O pages are not TCE mapped.
> > > 
> > > Renable dma_iommu_ops path for pseries Secure VMs.  It handles all
> > > cases including, TCE mapping I/O pages, in the presence of a
> > > IOMMU.
> > 
> > Wasn't clear to me at first, but I guess the main gist of this series is
> > that we want to continue to use SWIOTLB, but also need to create mappings
> > of it's bounce buffers in the IOMMU, so we revert to using dma_iommu_ops
> > and rely on the various dma_iommu_{map,alloc}_bypass() hooks throughout
> > to call into dma_direct_* ops rather than relying on the dma_is_direct(ops)
> > checks in DMA API functions to do the same.
> > 
> > That makes sense, but one issue I see with that is that
> > dma_iommu_map_bypass() only tests true if all the following are true:
> > 
> > 1) the device requests a 64-bit DMA mask via
> >    dma_set_mask/dma_set_coherent_mask
> > 2) DDW is enabled (i.e. we don't pass disable_ddw on command-line)
> > 
> > dma_is_direct() checks don't have this limitation, so I think for
> > anything cases, such as devices that use a smaller DMA mask, we'll
> > end up falling back to the non-bypass functions in dma_iommu_ops, which
> > will likely break for things like dma_alloc_coherent/dma_map_single
> > since they won't use SWIOTLB pages and won't do the necessary calls to
> > set_memory_unencrypted() to share those non-SWIOTLB buffers with
> > hypervisor.
> > 
> > Maybe that's ok, but I think we should be clearer about how to
> > fail/handle these cases.
> 
> Yes. makes sense. Device that cannot handle 64bit dma mask will not work.
> 
> > 
> > Though I also agree with some concerns Alexey stated earlier: it seems
> > wasteful to map the entire DDW window just so these bounce buffers can be
> > mapped.  Especially if you consider the lack of a mapping to be an additional
> > safe-guard against things like buggy device implementations on the QEMU
> > side. E.g. if we leaked pages to the hypervisor on accident, those pages
> > wouldn't be immediately accessible to a device, and would still require
> > additional work get past the IOMMU.
> 
> Well, an accidental unintented page leak to the hypervisor, is a very
> bad thing, regardless of any DMA mapping. The device may not be able to
> access it, but the hypervisor still can access it.

Agreed, but if IOMMU can provide additional isolation we should make use
of it when reasonable.

> 
> > 
> > What would it look like if we try to make all this work with disable_ddw passed
> > to kernel command-line (or forced for is_secure_guest())?
> > 
> >   1) dma_iommu_{alloc,map}_bypass() would no longer get us to dma_direct_* ops,
> >      but an additional case or hook that considers is_secure_guest() might do
> >      it.
> >      
> >   2) We'd also need to set up an IOMMU mapping for the bounce buffers via
> >      io_tlb_start/io_tlb_end. We could do it once, on-demand via
> >      dma_iommu_bypass_supported() like we do for the 64-bit DDW window, or
> >      maybe in some init function.
> 
> Hmm... i not sure how to accomplish (2).   we need use some DDW window
> to setup the mappings. right?  If disable_ddw is set, there wont be any
> ddw.  What am i missing?

We have 2 windows, the default 32-bit window that normally covers DMA addresses
0..1GB, and an additional DDW window that's created on demand for 64-bit
devices (since they can address a full linear mapping of all guest
memory at DMA address 1<<59). Your current patch uses the latter, but we
could potentially use the 32-bit window since we only need to map the
SWIOTLB pages.

Not saying that's necessarily better, but one upside is it only requires
devices to support 32-bit DMA addressing, which is a larger class of
devices than those that support 64-bit (since 64-bit device drivers
generally have a 32-bit fallback). Required changes are a bit more
pervasive though.

It might make sense to get both scenarios working at some point, but
maybe it's okay to handle 64-bit first. 64-bit does give us more legroom
if we anticipate changes in where the SWIOTLB memory is allocated from,
or expect it to become larger than 1GB.

> 
> > 
> > That also has the benefit of not requiring devices to support 64-bit DMA.
> > 
> > Alternatively, we could continue to rely on the 64-bit DDW window, but
> > modify call to enable_ddw() to only map the io_tlb_start/end range in
> > the case of is_secure_guest(). This is a little cleaner implementation-wise
> > since we can rely on the existing dma_iommu_{alloc,map}_bypass() hooks.
> 
> I have been experimenting with this.  Trying to map only the memory
> range from io_tlb_start/io_tlb_end though the 64-bit ddw window.  But
> due to some reason, it wants the io_tlb_start to be aligned to some
> boundary. It looks like a 2^28 boundary. Not sure what dictates that
> boundary.

Not sure, but that might be related to 256MB LMB size. Could also be the page
size of the DDW window, but seems large for that. In any case I think it would
be okay if we needed to truncate io_tlb_start to a lower 256MB-boundary and the
subsequent range. We have a few more mappings than strictly necessary, but it's
still better than all guest memory.

>    
> 
> > , but
> > devices that don't support 64-bit will fail back to not using dma_direct_* ops
> > and fail miserably. We'd probably want to handle that more gracefully.
> 
> Yes i will put a warning message to indicate the failure.
> 
> > 
> > Or we handle both cases gracefully. To me it makes more sense to enable
> > non-DDW case, then consider adding DDW case later if there's some reason
> > why 64-bit DMA is needed. But would be good to hear if there are other
> > opinions.
> 
> educate me a bit here. What is a non-DDW case?  is it possible for a
> device to acccess memory, in the presence of a IOMMU, without a window-mapping?

It's not indeed, but we have the default 32-bit window for the non-DDW
case. See above.

> 
> > 
> > > 
> > > Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> > > ---
> > >  arch/powerpc/platforms/pseries/iommu.c | 11 +----------
> > >  1 file changed, 1 insertion(+), 10 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > index 67b5009..4e27d66 100644
> > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > @@ -36,7 +36,6 @@
> > >  #include <asm/udbg.h>
> > >  #include <asm/mmzone.h>
> > >  #include <asm/plpar_wrappers.h>
> > > -#include <asm/svm.h>
> > >  #include <asm/ultravisor.h>
> > > 
> > >  #include "pseries.h"
> > > @@ -1346,15 +1345,7 @@ void iommu_init_early_pSeries(void)
> > >         of_reconfig_notifier_register(&iommu_reconfig_nb);
> > >         register_memory_notifier(&iommu_mem_nb);
> > > 
> > > -       /*
> > > -        * Secure guest memory is inacessible to devices so regular DMA isn't
> > > -        * possible.
> > > -        *
> > > -        * In that case keep devices' dma_map_ops as NULL so that the generic
> > > -        * DMA code path will use SWIOTLB to bounce buffers for DMA.
> > > -        */
> > > -       if (!is_secure_guest())
> > > -               set_pci_dma_ops(&dma_iommu_ops);
> > > +       set_pci_dma_ops(&dma_iommu_ops);
> > >  }
> > > 
> > >  static int __init disable_multitce(char *str)
> > > -- 
> > > 1.8.3.1
> > > 
> 
> -- 
> Ram Pai

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor.
  2019-12-11 22:47               ` Alexey Kardashevskiy
  2019-12-12  2:39                 ` Alexey Kardashevskiy
@ 2019-12-13  0:22                 ` Michael Roth
  1 sibling, 0 replies; 21+ messages in thread
From: Michael Roth @ 2019-12-13  0:22 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Ram Pai
  Cc: mpe, linuxppc-dev, benh, david, paulus, hch, andmike, sukadev,
	mst, ram.n.pai, cai, tglx, bauerman, linux-kernel, leonardo

Quoting Alexey Kardashevskiy (2019-12-11 16:47:30)
> 
> 
> On 12/12/2019 07:31, Michael Roth wrote:
> > Quoting Alexey Kardashevskiy (2019-12-11 02:15:44)
> >>
> >>
> >> On 11/12/2019 02:35, Ram Pai wrote:
> >>> On Tue, Dec 10, 2019 at 04:32:10PM +1100, Alexey Kardashevskiy wrote:
> >>>>
> >>>>
> >>>> On 10/12/2019 16:12, Ram Pai wrote:
> >>>>> On Tue, Dec 10, 2019 at 02:07:36PM +1100, Alexey Kardashevskiy wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 07/12/2019 12:12, Ram Pai wrote:
> >>>>>>> H_PUT_TCE_INDIRECT hcall uses a page filled with TCE entries, as one of
> >>>>>>> its parameters.  On secure VMs, hypervisor cannot access the contents of
> >>>>>>> this page since it gets encrypted.  Hence share the page with the
> >>>>>>> hypervisor, and unshare when done.
> >>>>>>
> >>>>>>
> >>>>>> I thought the idea was to use H_PUT_TCE and avoid sharing any extra
> >>>>>> pages. There is small problem that when DDW is enabled,
> >>>>>> FW_FEATURE_MULTITCE is ignored (easy to fix); I also noticed complains
> >>>>>> about the performance on slack but this is caused by initial cleanup of
> >>>>>> the default TCE window (which we do not use anyway) and to battle this
> >>>>>> we can simply reduce its size by adding
> >>>>>
> >>>>> something that takes hardly any time with H_PUT_TCE_INDIRECT,  takes
> >>>>> 13secs per device for H_PUT_TCE approach, during boot. This is with a
> >>>>> 30GB guest. With larger guest, the time will further detoriate.
> >>>>
> >>>>
> >>>> No it will not, I checked. The time is the same for 2GB and 32GB guests-
> >>>> the delay is caused by clearing the small DMA window which is small by
> >>>> the space mapped (1GB) but quite huge in TCEs as it uses 4K pages; and
> >>>> for DDW window + emulated devices the IOMMU page size will be 2M/16M/1G
> >>>> (depends on the system) so the number of TCEs is much smaller.
> >>>
> >>> I cant get your results.  What changes did you make to get it?
> >>
> >>
> >> Get what? I passed "-m 2G" and "-m 32G", got the same time - 13s spent
> >> in clearing the default window and the huge window took a fraction of a
> >> second to create and map.
> > 
> > Is this if we disable FW_FEATURE_MULTITCE in the guest and force the use
> > of H_PUT_TCE everywhere?
> 
> 
> Yes. Well, for the DDW case FW_FEATURE_MULTITCE is ignored but even when
> fixed (I have it in my local branch), this does not make a difference.
> 
> 
> > 
> > In theory couldn't we leave FW_FEATURE_MULTITCE in place so that
> > iommu_table_clear() can still use H_STUFF_TCE (which I guess is basically
> > instant),
> 
> PAPR/LoPAPR "conveniently" do not describe what hcall-multi-tce does
> exactly. But I am pretty sure the idea is that either both H_STUFF_TCE
> and H_PUT_TCE_INDIRECT are present or neither.

That was my interpretation (or maybe I just went by what your
implementation did :), but just because they are available doesn't mean
the guest has to use them. I agree it's ugly to condition it on
is_secure_guest(), but to me that seems better than sharing memory
uncessarily, or potentially leaving stale mappings into default IOMMU.

Not sure if that are other alternatives though.

> 
> 
> > and then force H_PUT_TCE for new mappings via something like:
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 6ba081dd61c9..85d092baf17d 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -194,6 +194,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
> >         unsigned long flags;
> >  
> >         if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
> > +       if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE) || is_secure_guest()) {
> 
> 
> Nobody (including myself) seems to like the idea of having
> is_secure_guest() all over the place.
> 
> And with KVM acceleration enabled, it is pretty fast anyway. Just now we
> do not have H_PUT_TCE in KVM/UV for secure guests but we will have to
> fix this for secure PCI passhtrough anyway.
> 
> 
> >                 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
> >                                            direction, attrs);
> >         }
> > 
> > That seems like it would avoid the extra 13s.
> 
> Or move around iommu_table_clear() which imho is just the right thing to do.
> 
> 
> > If we take the additional step of only mapping SWIOTLB range in
> > enable_ddw() for is_secure_guest() that might further improve things
> > (though the bigger motivation with that is the extra isolation it would
> > grant us for stuff behind the IOMMU, since it apparently doesn't affect
> > boot-time all that much)
> 
> 
> Sure, we just need to confirm how many of these swiotlb banks we are
> going to have (just one or many and at what location). Thanks,
> 
> 
> 
> > 
> >>
> >>
> >>>>>>
> >>>>>> -global
> >>>>>> spapr-pci-host-bridge.dma_win_size=0x4000000
> >>>>>
> >>>>> This option, speeds it up tremendously.  But than should this option be
> >>>>> enabled in qemu by default?  only for secure VMs? for both VMs?
> >>>>
> >>>>
> >>>> As discussed in slack, by default we do not need to clear the entire TCE
> >>>> table and we only have to map swiotlb buffer using the small window. It
> >>>> is a guest kernel change only. Thanks,
> >>>
> >>> Can you tell me what code you are talking about here.  Where is the TCE
> >>> table getting cleared? What code needs to be changed to not clear it?
> >>
> >>
> >> pci_dma_bus_setup_pSeriesLP()
> >>         iommu_init_table()
> >>                 iommu_table_clear()
> >>                         for () tbl->it_ops->get()
> >>
> >> We do not really need to clear it there, we only need it for VFIO with
> >> IOMMU SPAPR TCE v1 which reuses these tables but there are
> >> iommu_take_ownership/iommu_release_ownership to clear these tables. I'll
> >> send a patch for this.
> > 
> > 
> >>
> >>
> >>> Is the code in tce_buildmulti_pSeriesLP(), the one that does the clear
> >>> aswell?
> >>
> >>
> >> This one does not need to clear TCEs as this creates a window of known
> >> size and maps it all.
> >>
> >> Well, actually, it only maps actual guest RAM, if there are gaps in RAM,
> >> then TCEs for the gaps will have what hypervisor had there (which is
> >> zeroes, qemu/kvm clears it anyway).
> >>
> >>
> >>> But before I close, you have not told me clearly, what is the problem
> >>> with;  'share the page, make the H_PUT_INDIRECT_TCE hcall, unshare the page'.
> >>
> >> Between share and unshare you have a (tiny) window of opportunity to
> >> attack the guest. No, I do not know how exactly.
> >>
> >> For example, the hypervisor does a lot of PHB+PCI hotplug-unplug with
> >> 64bit devices - each time this will create a huge window which will
> >> share/unshare the same page.  No, I do not know how exactly how this can
> >> be exploited either, we cannot rely of what you or myself know today. My
> >> point is that we should not be sharing pages at all unless we really
> >> really have to, and this does not seem to be the case.
> >>
> >> But since this seems to an acceptable compromise anyway,
> >>
> >> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>
> >>
> >>
> >>
> >>
> >>> Remember this is the same page that is earmarked for doing
> >>> H_PUT_INDIRECT_TCE, not by my patch, but its already earmarked by the
> >>> existing code. So it not some random buffer that is picked. Second 
> >>> this page is temporarily shared and unshared, it does not stay shared
> >>> for life.  It does not slow the boot. it does not need any
> >>> special command line options on the qemu.
> >>>> Shared pages technology was put in place, exactly for the purpose of
> >>> sharing data with the hypervisor.  We are using this technology exactly
> >>> for that purpose.  And finally I agreed with your concern of having
> >>> shared pages staying around.  Hence i addressed that concern, by
> >>> unsharing the page.  At this point, I fail to understand your concern.
> >>
> >>
> >>
> >>
> >> -- 
> >> Alexey
> 
> -- 
> Alexey

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2019-12-13  0:23 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-07  1:12 [PATCH v5 0/2] Enable IOMMU support for pseries Secure VMs Ram Pai
2019-12-07  1:12 ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Ram Pai
2019-12-07  1:12   ` [PATCH v5 2/2] powerpc/pseries/iommu: Use dma_iommu_ops for Secure VM Ram Pai
2019-12-10  3:08     ` Alexey Kardashevskiy
2019-12-10 22:09     ` Thiago Jung Bauermann
2019-12-11  1:43     ` Michael Roth
2019-12-11  8:36       ` Alexey Kardashevskiy
2019-12-11 18:07         ` Michael Roth
2019-12-11 18:20           ` Christoph Hellwig
2019-12-12  6:45       ` Ram Pai
2019-12-13  0:19         ` Michael Roth
2019-12-10  3:07   ` [PATCH v5 1/2] powerpc/pseries/iommu: Share the per-cpu TCE page with the hypervisor Alexey Kardashevskiy
2019-12-10  5:12     ` Ram Pai
2019-12-10  5:32       ` Alexey Kardashevskiy
2019-12-10 15:35         ` Ram Pai
2019-12-11  8:15           ` Alexey Kardashevskiy
2019-12-11 20:31             ` Michael Roth
2019-12-11 22:47               ` Alexey Kardashevskiy
2019-12-12  2:39                 ` Alexey Kardashevskiy
2019-12-13  0:22                 ` Michael Roth
2019-12-12  4:11             ` Ram Pai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).