stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page
@ 2022-03-03 22:38 Jarkko Sakkinen
  2022-03-08 11:36 ` Jarkko Sakkinen
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Jarkko Sakkinen @ 2022-03-03 22:38 UTC (permalink / raw)
  To: linux-sgx
  Cc: Jarkko Sakkinen, Dave Hansen, stable, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	H. Peter Anvin, Jethro Beekman, Sean Christopherson,
	open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)

There is a limited amount of SGX memory (EPC) on each system.  When that
memory is used up, SGX has its own swapping mechanism which is similar
in concept but totally separate from the core mm/* code.  Instead of
swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
comes from a shared memory pseudo-file and can itself be swapped by the
core mm code.  There is a hierarchy like this:

	EPC <-> shmem <-> disk

After data is swapped back in from shmem to EPC, the shmem backing
storage needs to be freed.  Currently, the backing shmem is not freed.
This effectively wastes the shmem while the enclave is running.  The
memory is recovered when the enclave is destroyed and the backing
storage freed.

Sort this out by freeing memory with shmem_truncate_range(), as soon as
a page is faulted back to the EPC.  In addition, free the memory for
PCMD pages as soon as all PCMD's in a page have been marked as unused
by zeroing its contents.

Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
v6:
* Re-applied on top of tip/x86/sgx and fixed the merge conflict, i.e.
  sgx_encl_get_backing() instead of sgx_encl_lookup_backing().
v5:
* Encapsulated file offset calculation for PCMD struct.
* Replaced "magic number" PAGE_SIZE with sizeof(struct sgx_secs) to make
  the offset calculation more self-documentative.
v4:
* Sanitized the offset calculations.
v3:
* Resend.
v2:
* Rewrite commit message as proposed by Dave.
* Truncate PCMD pages (Dave).
---
 arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 001808e3901c..6fa3d0a14b93 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,30 @@
 #include "encls.h"
 #include "sgx.h"
 
+/*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+							    unsigned long page_index)
+{
+	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+	struct inode *inode = file_inode(encl->backing);
+
+	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
 /*
  * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
  * Pages" in the SDM.
@@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 {
 	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
 	struct sgx_encl *encl = encl_page->encl;
+	pgoff_t page_index, page_pcmd_off;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
-	pgoff_t page_index;
+	bool pcmd_page_empty;
+	u8 *pcmd_page;
 	int ret;
 
 	if (secs_page)
@@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	else
 		page_index = PFN_DOWN(encl->size);
 
+	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
 	ret = sgx_encl_get_backing(encl, page_index, &b);
 	if (ret)
 		return ret;
 
 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
-	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
-			  b.pcmd_offset;
+	pcmd_page = kmap_atomic(b.pcmd);
+	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
 
 	if (secs_page)
 		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 		ret = -EFAULT;
 	}
 
-	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+
+	/*
+	 * The area for the PCMD in the page was zeroed above.  Check if the
+	 * whole page is now empty meaning that all PCMD's have been zeroed:
+	 */
+	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);
 
 	sgx_encl_put_backing(&b, false);
 
+	sgx_encl_truncate_backing_page(encl, page_index);
+
+	if (pcmd_page_empty)
+		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+
 	return ret;
 }
 
@@ -577,7 +618,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
-	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 	struct page *contents;
 	struct page *pcmd;
 
@@ -585,7 +626,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	if (IS_ERR(contents))
 		return PTR_ERR(contents);
 
-	pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
 	if (IS_ERR(pcmd)) {
 		put_page(contents);
 		return PTR_ERR(pcmd);
@@ -594,9 +635,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	backing->page_index = page_index;
 	backing->contents = contents;
 	backing->pcmd = pcmd;
-	backing->pcmd_offset =
-		(page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
-		sizeof(struct sgx_pcmd);
+	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
 
 	return 0;
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-03 22:38 [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page Jarkko Sakkinen
@ 2022-03-08 11:36 ` Jarkko Sakkinen
  2022-03-08 19:16 ` Reinette Chatre
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Jarkko Sakkinen @ 2022-03-08 11:36 UTC (permalink / raw)
  To: linux-sgx
  Cc: Dave Hansen, stable, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	H. Peter Anvin, Jethro Beekman, Sean Christopherson,
	open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Fri, Mar 04, 2022 at 12:38:58AM +0200, Jarkko Sakkinen wrote:
> There is a limited amount of SGX memory (EPC) on each system.  When that
> memory is used up, SGX has its own swapping mechanism which is similar
> in concept but totally separate from the core mm/* code.  Instead of
> swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> comes from a shared memory pseudo-file and can itself be swapped by the
> core mm code.  There is a hierarchy like this:
> 
> 	EPC <-> shmem <-> disk
> 
> After data is swapped back in from shmem to EPC, the shmem backing
> storage needs to be freed.  Currently, the backing shmem is not freed.
> This effectively wastes the shmem while the enclave is running.  The
> memory is recovered when the enclave is destroyed and the backing
> storage freed.
> 
> Sort this out by freeing memory with shmem_truncate_range(), as soon as
> a page is faulted back to the EPC.  In addition, free the memory for
> PCMD pages as soon as all PCMD's in a page have been marked as unused
> by zeroing its contents.
> 
> Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: stable@vger.kernel.org
> Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> ---
> v6:
> * Re-applied on top of tip/x86/sgx and fixed the merge conflict, i.e.
>   sgx_encl_get_backing() instead of sgx_encl_lookup_backing().
> v5:
> * Encapsulated file offset calculation for PCMD struct.
> * Replaced "magic number" PAGE_SIZE with sizeof(struct sgx_secs) to make
>   the offset calculation more self-documentative.
> v4:
> * Sanitized the offset calculations.
> v3:
> * Resend.
> v2:
> * Rewrite commit message as proposed by Dave.
> * Truncate PCMD pages (Dave).
> ---
>  arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++++++++------
>  1 file changed, 48 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 001808e3901c..6fa3d0a14b93 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -12,6 +12,30 @@
>  #include "encls.h"
>  #include "sgx.h"
>  
> +/*
> + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
> + * follow right after the EPC data in the backing storage. In addition to the
> + * visible enclave pages, there's one extra page slot for SECS, before PCMD
> + * structs.
> + */
> +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
> +							    unsigned long page_index)
> +{
> +	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
> +
> +	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
> +}
> +
> +/*
> + * Free a page from the backing storage in the given page index.
> + */
> +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
> +{
> +	struct inode *inode = file_inode(encl->backing);
> +
> +	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
> +}
> +
>  /*
>   * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
>   * Pages" in the SDM.
> @@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  {
>  	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
>  	struct sgx_encl *encl = encl_page->encl;
> +	pgoff_t page_index, page_pcmd_off;
>  	struct sgx_pageinfo pginfo;
>  	struct sgx_backing b;
> -	pgoff_t page_index;
> +	bool pcmd_page_empty;
> +	u8 *pcmd_page;
>  	int ret;
>  
>  	if (secs_page)
> @@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  	else
>  		page_index = PFN_DOWN(encl->size);
>  
> +	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
> +
>  	ret = sgx_encl_get_backing(encl, page_index, &b);
>  	if (ret)
>  		return ret;
>  
>  	pginfo.addr = encl_page->desc & PAGE_MASK;
>  	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
> -	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
> -			  b.pcmd_offset;
> +	pcmd_page = kmap_atomic(b.pcmd);
> +	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
>  
>  	if (secs_page)
>  		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
> @@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  		ret = -EFAULT;
>  	}
>  
> -	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
> +	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
> +
> +	/*
> +	 * The area for the PCMD in the page was zeroed above.  Check if the
> +	 * whole page is now empty meaning that all PCMD's have been zeroed:
> +	 */
> +	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
> +
> +	kunmap_atomic(pcmd_page);
>  	kunmap_atomic((void *)(unsigned long)pginfo.contents);
>  
>  	sgx_encl_put_backing(&b, false);
>  
> +	sgx_encl_truncate_backing_page(encl, page_index);
> +
> +	if (pcmd_page_empty)
> +		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
> +
>  	return ret;
>  }
>  
> @@ -577,7 +618,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
>  int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
>  			 struct sgx_backing *backing)
>  {
> -	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
> +	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
>  	struct page *contents;
>  	struct page *pcmd;
>  
> @@ -585,7 +626,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
>  	if (IS_ERR(contents))
>  		return PTR_ERR(contents);
>  
> -	pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
> +	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
>  	if (IS_ERR(pcmd)) {
>  		put_page(contents);
>  		return PTR_ERR(pcmd);
> @@ -594,9 +635,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
>  	backing->page_index = page_index;
>  	backing->contents = contents;
>  	backing->pcmd = pcmd;
> -	backing->pcmd_offset =
> -		(page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
> -		sizeof(struct sgx_pcmd);
> +	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
>  
>  	return 0;
>  }
> -- 
> 2.35.1
> 

Is there still something to be done?

BR, Jarkko

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-03 22:38 [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page Jarkko Sakkinen
  2022-03-08 11:36 ` Jarkko Sakkinen
@ 2022-03-08 19:16 ` Reinette Chatre
  2022-03-09  8:01   ` Jarkko Sakkinen
  2022-03-11  0:15 ` [tip: x86/sgx] " tip-bot2 for Jarkko Sakkinen
  2022-03-11 19:09 ` [tip: x86/urgent] " tip-bot2 for Jarkko Sakkinen
  3 siblings, 1 reply; 7+ messages in thread
From: Reinette Chatre @ 2022-03-08 19:16 UTC (permalink / raw)
  To: Jarkko Sakkinen, linux-sgx
  Cc: Dave Hansen, stable, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	H. Peter Anvin, Jethro Beekman, Sean Christopherson,
	open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)

Hi,

On 3/3/2022 2:38 PM, Jarkko Sakkinen wrote:
> There is a limited amount of SGX memory (EPC) on each system.  When that
> memory is used up, SGX has its own swapping mechanism which is similar
> in concept but totally separate from the core mm/* code.  Instead of
> swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> comes from a shared memory pseudo-file and can itself be swapped by the
> core mm code.  There is a hierarchy like this:
> 
> 	EPC <-> shmem <-> disk
> 
> After data is swapped back in from shmem to EPC, the shmem backing
> storage needs to be freed.  Currently, the backing shmem is not freed.
> This effectively wastes the shmem while the enclave is running.  The
> memory is recovered when the enclave is destroyed and the backing
> storage freed.
> 
> Sort this out by freeing memory with shmem_truncate_range(), as soon as
> a page is faulted back to the EPC.  In addition, free the memory for
> PCMD pages as soon as all PCMD's in a page have been marked as unused
> by zeroing its contents.
> 
> Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: stable@vger.kernel.org
> Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>

I can reliably reproduce the issue this patch aims to solve by creating
a virtual machine that has a significant portion of its memory consumed
by EPC:

qemu-system-x86_64 -smp 4 -m 4G\
 -enable-kvm \
 -cpu host,+sgx-provisionkey \
 -object memory-backend-ram,size=2G,host-nodes=0,policy=bind,id=node0 \
 -object memory-backend-epc,id=mem0,size=1536M,prealloc=on,host-nodes=0,policy=bind \
 -numa node,nodeid=0,cpus=0-1,memdev=node0 \
 -object memory-backend-ram,size=2G,host-nodes=1,policy=bind,id=node1 \
 -object memory-backend-epc,id=mem1,size=1536M,prealloc=on,host-nodes=1,policy=bind \
 -numa node,nodeid=1,cpus=2-3,memdev=node1 \
 -M sgx-epc.0.memdev=mem0,sgx-epc.0.node=0,sgx-epc.1.memdev=mem1,sgx-epc.1.node=1 \
 ...

Before this patch, running the very stressful SGX2 over subscription test case
(unclobbered_vdso_oversubscribed_remove) in this environment always triggers 
the oom-killer but no amount of tasks killed can save the system
with it always ending deadlocked on memory:

[   58.642719] Tasks state (memory values in pages):
[   58.644324] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
[   58.647237] [    195]     0   195     3153      197    45056        0         -1000 systemd-udevd
[   58.650238] [    281]     0   281  1836367        0 10817536        0             0 test_sgx
[   58.653088] Out of memory and no killable processes...
[   58.654832] Kernel panic - not syncing: System is deadlocked on memory

After applying this patch I was able to run SGX2 selftest
unclobbered_vdso_oversubscribed_remove ten times successfully.

Tested-by: Reinette Chatre <reinette.chatre@intel.com>

Reinette



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-08 19:16 ` Reinette Chatre
@ 2022-03-09  8:01   ` Jarkko Sakkinen
  2022-03-10 18:00     ` Jarkko Sakkinen
  0 siblings, 1 reply; 7+ messages in thread
From: Jarkko Sakkinen @ 2022-03-09  8:01 UTC (permalink / raw)
  To: Reinette Chatre
  Cc: linux-sgx, Dave Hansen, stable, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	H. Peter Anvin, Jethro Beekman, Sean Christopherson,
	open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Tue, Mar 08, 2022 at 11:16:19AM -0800, Reinette Chatre wrote:
> Hi,
> 
> On 3/3/2022 2:38 PM, Jarkko Sakkinen wrote:
> > There is a limited amount of SGX memory (EPC) on each system.  When that
> > memory is used up, SGX has its own swapping mechanism which is similar
> > in concept but totally separate from the core mm/* code.  Instead of
> > swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> > comes from a shared memory pseudo-file and can itself be swapped by the
> > core mm code.  There is a hierarchy like this:
> > 
> > 	EPC <-> shmem <-> disk
> > 
> > After data is swapped back in from shmem to EPC, the shmem backing
> > storage needs to be freed.  Currently, the backing shmem is not freed.
> > This effectively wastes the shmem while the enclave is running.  The
> > memory is recovered when the enclave is destroyed and the backing
> > storage freed.
> > 
> > Sort this out by freeing memory with shmem_truncate_range(), as soon as
> > a page is faulted back to the EPC.  In addition, free the memory for
> > PCMD pages as soon as all PCMD's in a page have been marked as unused
> > by zeroing its contents.
> > 
> > Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: stable@vger.kernel.org
> > Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> 
> I can reliably reproduce the issue this patch aims to solve by creating
> a virtual machine that has a significant portion of its memory consumed
> by EPC:
> 
> qemu-system-x86_64 -smp 4 -m 4G\
>  -enable-kvm \
>  -cpu host,+sgx-provisionkey \
>  -object memory-backend-ram,size=2G,host-nodes=0,policy=bind,id=node0 \
>  -object memory-backend-epc,id=mem0,size=1536M,prealloc=on,host-nodes=0,policy=bind \
>  -numa node,nodeid=0,cpus=0-1,memdev=node0 \
>  -object memory-backend-ram,size=2G,host-nodes=1,policy=bind,id=node1 \
>  -object memory-backend-epc,id=mem1,size=1536M,prealloc=on,host-nodes=1,policy=bind \
>  -numa node,nodeid=1,cpus=2-3,memdev=node1 \
>  -M sgx-epc.0.memdev=mem0,sgx-epc.0.node=0,sgx-epc.1.memdev=mem1,sgx-epc.1.node=1 \
>  ...
> 
> Before this patch, running the very stressful SGX2 over subscription test case
> (unclobbered_vdso_oversubscribed_remove) in this environment always triggers 
> the oom-killer but no amount of tasks killed can save the system
> with it always ending deadlocked on memory:
> 
> [   58.642719] Tasks state (memory values in pages):
> [   58.644324] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
> [   58.647237] [    195]     0   195     3153      197    45056        0         -1000 systemd-udevd
> [   58.650238] [    281]     0   281  1836367        0 10817536        0             0 test_sgx
> [   58.653088] Out of memory and no killable processes...
> [   58.654832] Kernel panic - not syncing: System is deadlocked on memory
> 
> After applying this patch I was able to run SGX2 selftest
> unclobbered_vdso_oversubscribed_remove ten times successfully.
> 
> Tested-by: Reinette Chatre <reinette.chatre@intel.com>

Thank you.

> Reinette


BR, Jarkko

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-09  8:01   ` Jarkko Sakkinen
@ 2022-03-10 18:00     ` Jarkko Sakkinen
  0 siblings, 0 replies; 7+ messages in thread
From: Jarkko Sakkinen @ 2022-03-10 18:00 UTC (permalink / raw)
  To: Reinette Chatre, Dave Hansen
  Cc: linux-sgx, stable, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	H. Peter Anvin, Jethro Beekman, Sean Christopherson,
	open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Wed, Mar 09, 2022 at 10:01:17AM +0200, Jarkko Sakkinen wrote:
> On Tue, Mar 08, 2022 at 11:16:19AM -0800, Reinette Chatre wrote:
> > Hi,
> > 
> > On 3/3/2022 2:38 PM, Jarkko Sakkinen wrote:
> > > There is a limited amount of SGX memory (EPC) on each system.  When that
> > > memory is used up, SGX has its own swapping mechanism which is similar
> > > in concept but totally separate from the core mm/* code.  Instead of
> > > swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> > > comes from a shared memory pseudo-file and can itself be swapped by the
> > > core mm code.  There is a hierarchy like this:
> > > 
> > > 	EPC <-> shmem <-> disk
> > > 
> > > After data is swapped back in from shmem to EPC, the shmem backing
> > > storage needs to be freed.  Currently, the backing shmem is not freed.
> > > This effectively wastes the shmem while the enclave is running.  The
> > > memory is recovered when the enclave is destroyed and the backing
> > > storage freed.
> > > 
> > > Sort this out by freeing memory with shmem_truncate_range(), as soon as
> > > a page is faulted back to the EPC.  In addition, free the memory for
> > > PCMD pages as soon as all PCMD's in a page have been marked as unused
> > > by zeroing its contents.
> > > 
> > > Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > Cc: stable@vger.kernel.org
> > > Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> > > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > 
> > I can reliably reproduce the issue this patch aims to solve by creating
> > a virtual machine that has a significant portion of its memory consumed
> > by EPC:
> > 
> > qemu-system-x86_64 -smp 4 -m 4G\
> >  -enable-kvm \
> >  -cpu host,+sgx-provisionkey \
> >  -object memory-backend-ram,size=2G,host-nodes=0,policy=bind,id=node0 \
> >  -object memory-backend-epc,id=mem0,size=1536M,prealloc=on,host-nodes=0,policy=bind \
> >  -numa node,nodeid=0,cpus=0-1,memdev=node0 \
> >  -object memory-backend-ram,size=2G,host-nodes=1,policy=bind,id=node1 \
> >  -object memory-backend-epc,id=mem1,size=1536M,prealloc=on,host-nodes=1,policy=bind \
> >  -numa node,nodeid=1,cpus=2-3,memdev=node1 \
> >  -M sgx-epc.0.memdev=mem0,sgx-epc.0.node=0,sgx-epc.1.memdev=mem1,sgx-epc.1.node=1 \
> >  ...
> > 
> > Before this patch, running the very stressful SGX2 over subscription test case
> > (unclobbered_vdso_oversubscribed_remove) in this environment always triggers 
> > the oom-killer but no amount of tasks killed can save the system
> > with it always ending deadlocked on memory:
> > 
> > [   58.642719] Tasks state (memory values in pages):
> > [   58.644324] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
> > [   58.647237] [    195]     0   195     3153      197    45056        0         -1000 systemd-udevd
> > [   58.650238] [    281]     0   281  1836367        0 10817536        0             0 test_sgx
> > [   58.653088] Out of memory and no killable processes...
> > [   58.654832] Kernel panic - not syncing: System is deadlocked on memory
> > 
> > After applying this patch I was able to run SGX2 selftest
> > unclobbered_vdso_oversubscribed_remove ten times successfully.
> > 
> > Tested-by: Reinette Chatre <reinette.chatre@intel.com>
> 
> Thank you.
> 
> > Reinette

Dave, can this be picked up?

BR, Jarkko

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [tip: x86/sgx] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-03 22:38 [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page Jarkko Sakkinen
  2022-03-08 11:36 ` Jarkko Sakkinen
  2022-03-08 19:16 ` Reinette Chatre
@ 2022-03-11  0:15 ` tip-bot2 for Jarkko Sakkinen
  2022-03-11 19:09 ` [tip: x86/urgent] " tip-bot2 for Jarkko Sakkinen
  3 siblings, 0 replies; 7+ messages in thread
From: tip-bot2 for Jarkko Sakkinen @ 2022-03-11  0:15 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: stable, Dave Hansen, Jarkko Sakkinen, x86, linux-kernel

The following commit has been merged into the x86/sgx branch of tip:

Commit-ID:     ed83935a9af088439462ef3f2efcf1ef62b05dfd
Gitweb:        https://git.kernel.org/tip/ed83935a9af088439462ef3f2efcf1ef62b05dfd
Author:        Jarkko Sakkinen <jarkko@kernel.org>
AuthorDate:    Fri, 04 Mar 2022 00:38:58 +02:00
Committer:     Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Thu, 10 Mar 2022 16:09:40 -08:00

x86/sgx: Free backing memory after faulting the enclave page

There is a limited amount of SGX memory (EPC) on each system.  When that
memory is used up, SGX has its own swapping mechanism which is similar
in concept but totally separate from the core mm/* code.  Instead of
swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
comes from a shared memory pseudo-file and can itself be swapped by the
core mm code.  There is a hierarchy like this:

	EPC <-> shmem <-> disk

After data is swapped back in from shmem to EPC, the shmem backing
storage needs to be freed.  Currently, the backing shmem is not freed.
This effectively wastes the shmem while the enclave is running.  The
memory is recovered when the enclave is destroyed and the backing
storage freed.

Sort this out by freeing memory with shmem_truncate_range(), as soon as
a page is faulted back to the EPC.  In addition, free the memory for
PCMD pages as soon as all PCMD's in a page have been marked as unused
by zeroing its contents.

Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org
---
 arch/x86/kernel/cpu/sgx/encl.c | 57 +++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 001808e..6fa3d0a 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -13,6 +13,30 @@
 #include "sgx.h"
 
 /*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+							    unsigned long page_index)
+{
+	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+	struct inode *inode = file_inode(encl->backing);
+
+	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
+/*
  * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
  * Pages" in the SDM.
  */
@@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 {
 	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
 	struct sgx_encl *encl = encl_page->encl;
+	pgoff_t page_index, page_pcmd_off;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
-	pgoff_t page_index;
+	bool pcmd_page_empty;
+	u8 *pcmd_page;
 	int ret;
 
 	if (secs_page)
@@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	else
 		page_index = PFN_DOWN(encl->size);
 
+	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
 	ret = sgx_encl_get_backing(encl, page_index, &b);
 	if (ret)
 		return ret;
 
 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
-	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
-			  b.pcmd_offset;
+	pcmd_page = kmap_atomic(b.pcmd);
+	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
 
 	if (secs_page)
 		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 		ret = -EFAULT;
 	}
 
-	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+
+	/*
+	 * The area for the PCMD in the page was zeroed above.  Check if the
+	 * whole page is now empty meaning that all PCMD's have been zeroed:
+	 */
+	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);
 
 	sgx_encl_put_backing(&b, false);
 
+	sgx_encl_truncate_backing_page(encl, page_index);
+
+	if (pcmd_page_empty)
+		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+
 	return ret;
 }
 
@@ -577,7 +618,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
-	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 	struct page *contents;
 	struct page *pcmd;
 
@@ -585,7 +626,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	if (IS_ERR(contents))
 		return PTR_ERR(contents);
 
-	pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
 	if (IS_ERR(pcmd)) {
 		put_page(contents);
 		return PTR_ERR(pcmd);
@@ -594,9 +635,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	backing->page_index = page_index;
 	backing->contents = contents;
 	backing->pcmd = pcmd;
-	backing->pcmd_offset =
-		(page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
-		sizeof(struct sgx_pcmd);
+	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
 
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [tip: x86/urgent] x86/sgx: Free backing memory after faulting the enclave page
  2022-03-03 22:38 [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page Jarkko Sakkinen
                   ` (2 preceding siblings ...)
  2022-03-11  0:15 ` [tip: x86/sgx] " tip-bot2 for Jarkko Sakkinen
@ 2022-03-11 19:09 ` tip-bot2 for Jarkko Sakkinen
  3 siblings, 0 replies; 7+ messages in thread
From: tip-bot2 for Jarkko Sakkinen @ 2022-03-11 19:09 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: stable, Dave Hansen, Jarkko Sakkinen, x86, linux-kernel

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID:     08999b2489b4c9b939d7483dbd03702ee4576d96
Gitweb:        https://git.kernel.org/tip/08999b2489b4c9b939d7483dbd03702ee4576d96
Author:        Jarkko Sakkinen <jarkko@kernel.org>
AuthorDate:    Fri, 04 Mar 2022 00:38:58 +02:00
Committer:     Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Fri, 11 Mar 2022 10:31:06 -08:00

x86/sgx: Free backing memory after faulting the enclave page

There is a limited amount of SGX memory (EPC) on each system.  When that
memory is used up, SGX has its own swapping mechanism which is similar
in concept but totally separate from the core mm/* code.  Instead of
swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
comes from a shared memory pseudo-file and can itself be swapped by the
core mm code.  There is a hierarchy like this:

	EPC <-> shmem <-> disk

After data is swapped back in from shmem to EPC, the shmem backing
storage needs to be freed.  Currently, the backing shmem is not freed.
This effectively wastes the shmem while the enclave is running.  The
memory is recovered when the enclave is destroyed and the backing
storage freed.

Sort this out by freeing memory with shmem_truncate_range(), as soon as
a page is faulted back to the EPC.  In addition, free the memory for
PCMD pages as soon as all PCMD's in a page have been marked as unused
by zeroing its contents.

Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org
---
 arch/x86/kernel/cpu/sgx/encl.c | 57 +++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 48afe96..7c63a19 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -13,6 +13,30 @@
 #include "sgx.h"
 
 /*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+							    unsigned long page_index)
+{
+	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+	struct inode *inode = file_inode(encl->backing);
+
+	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
+/*
  * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
  * Pages" in the SDM.
  */
@@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 {
 	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
 	struct sgx_encl *encl = encl_page->encl;
+	pgoff_t page_index, page_pcmd_off;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
-	pgoff_t page_index;
+	bool pcmd_page_empty;
+	u8 *pcmd_page;
 	int ret;
 
 	if (secs_page)
@@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	else
 		page_index = PFN_DOWN(encl->size);
 
+	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
 	ret = sgx_encl_get_backing(encl, page_index, &b);
 	if (ret)
 		return ret;
 
 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
-	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
-			  b.pcmd_offset;
+	pcmd_page = kmap_atomic(b.pcmd);
+	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
 
 	if (secs_page)
 		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 		ret = -EFAULT;
 	}
 
-	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+
+	/*
+	 * The area for the PCMD in the page was zeroed above.  Check if the
+	 * whole page is now empty meaning that all PCMD's have been zeroed:
+	 */
+	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);
 
 	sgx_encl_put_backing(&b, false);
 
+	sgx_encl_truncate_backing_page(encl, page_index);
+
+	if (pcmd_page_empty)
+		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+
 	return ret;
 }
 
@@ -579,7 +620,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
-	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 	struct page *contents;
 	struct page *pcmd;
 
@@ -587,7 +628,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	if (IS_ERR(contents))
 		return PTR_ERR(contents);
 
-	pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
 	if (IS_ERR(pcmd)) {
 		put_page(contents);
 		return PTR_ERR(pcmd);
@@ -596,9 +637,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	backing->page_index = page_index;
 	backing->contents = contents;
 	backing->pcmd = pcmd;
-	backing->pcmd_offset =
-		(page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
-		sizeof(struct sgx_pcmd);
+	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
 
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-03-11 19:09 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-03 22:38 [PATCH v6] x86/sgx: Free backing memory after faulting the enclave page Jarkko Sakkinen
2022-03-08 11:36 ` Jarkko Sakkinen
2022-03-08 19:16 ` Reinette Chatre
2022-03-09  8:01   ` Jarkko Sakkinen
2022-03-10 18:00     ` Jarkko Sakkinen
2022-03-11  0:15 ` [tip: x86/sgx] " tip-bot2 for Jarkko Sakkinen
2022-03-11 19:09 ` [tip: x86/urgent] " tip-bot2 for Jarkko Sakkinen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).