xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] xen: speed up grant-table reclaim
@ 2023-06-10 15:32 Demi Marie Obenour
  2023-06-12  6:27 ` Juergen Gross
  0 siblings, 1 reply; 13+ messages in thread
From: Demi Marie Obenour @ 2023-06-10 15:32 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Demi Marie Obenour, Xen developer discussion,
	Linux Kernel Mailing List, stable

When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list.  Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first.  However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first.  As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.

To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable.  The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).

Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
Cc: stable@vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
---
 drivers/xen/Kconfig       | 12 ++++++++++++
 drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
 	  This value is used to allocate enough space in internal
 	  tables needed for physical memory administration.
 
+config XEN_GRANTS_RECLAIM_PER_ITERATION
+	int "Default number of grant entries to reclaim per iteration"
+	default 10
+	range 10 4294967295
+	help
+	  This sets the default value for the grant_table.free_per_iteration
+	  kernel command line option, which sets the number of grants that
+	  Linux will try to reclaim at once.  The default is 10, but
+	  workloads that make heavy use of gntalloc will likely want to
+	  increase this.  The current value can be accessed and/or modified
+	  via /sys/module/grant_table/parameters/free_per_iteration.
+
 config XEN_SCRUB_PAGES_DEFAULT
 	bool "Scrub pages before returning them to system by default"
 	depends on XEN_BALLOON
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
 static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
+
 static void gnttab_handle_deferred(struct timer_list *unused)
 {
-	unsigned int nr = 10;
+	unsigned int nr = READ_ONCE(free_per_iteration);
+	const bool ignore_limit = nr == 0;
 	struct deferred_entry *first = NULL;
 	unsigned long flags;
+	size_t freed = 0;
 
 	spin_lock_irqsave(&gnttab_list_lock, flags);
-	while (nr--) {
+	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
 		struct deferred_entry *entry
 			= list_first_entry(&deferred_list,
 					   struct deferred_entry, list);
@@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
 		list_del(&entry->list);
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
 		if (_gnttab_end_foreign_access_ref(entry->ref)) {
+			uint64_t ret = atomic64_sub_return(1, &deferred_count);
 			put_free_entry(entry->ref);
-			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
-				 entry->ref, page_to_pfn(entry->page));
+			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+				 entry->ref, page_to_pfn(entry->page),
+				 (unsigned long long)ret);
 			put_page(entry->page);
+			freed++;
 			kfree(entry);
 			entry = NULL;
 		} else {
@@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
 		spin_lock_irqsave(&gnttab_list_lock, flags);
 		if (entry)
 			list_add_tail(&entry->list, &deferred_list);
-		else if (list_empty(&deferred_list))
-			break;
 	}
-	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+	if (list_empty(&deferred_list))
+		WARN_ON(atomic64_read(&deferred_count));
+	else if (!timer_pending(&deferred_timer)) {
 		deferred_timer.expires = jiffies + HZ;
 		add_timer(&deferred_timer);
 	}
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+	pr_debug("Freed %zu references", freed);
 }
 
 static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
 {
 	struct deferred_entry *entry;
 	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
-	const char *what = KERN_WARNING "leaking";
+	uint64_t leaked, deferred;
 
 	entry = kmalloc(sizeof(*entry), gfp);
 	if (!page) {
@@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
 			add_timer(&deferred_timer);
 		}
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
-		what = KERN_DEBUG "deferring";
+		deferred = atomic64_add_return(1, &deferred_count);
+		leaked = atomic64_read(&leaked_count);
+		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+	} else {
+		deferred = atomic64_read(&deferred_count);
+		leaked = atomic64_add_return(1, &leaked_count);
+		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
 	}
-	printk("%s g.e. %#x (pfn %#lx)\n",
-	       what, ref, page ? page_to_pfn(page) : -1);
 }
 
+module_param(free_per_iteration, uint, 0600);
+
 int gnttab_try_end_foreign_access(grant_ref_t ref)
 {
 	int ret = _gnttab_end_foreign_access_ref(ref);
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-10 15:32 [PATCH] xen: speed up grant-table reclaim Demi Marie Obenour
@ 2023-06-12  6:27 ` Juergen Gross
  2023-06-12  6:45   ` Juergen Gross
  2023-06-12 20:09   ` Demi Marie Obenour
  0 siblings, 2 replies; 13+ messages in thread
From: Juergen Gross @ 2023-06-12  6:27 UTC (permalink / raw)
  To: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Xen developer discussion, Linux Kernel Mailing List, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 7914 bytes --]

On 10.06.23 17:32, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list.

This lacks quite some context.

The main problem is related to the grant not having been unmapped after
the end of a request, but the side granting the access is assuming this
should be the case.

In general this means that the two sides implementing the protocol don't
agree how it should work, or that the protocol itself has a flaw.

 > Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first.

Normally the list is just empty. Only in very rare cases like premature
PV frontend module unloading it is expected to see cases of deferred
grant reclaims.

> However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first.  As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.

I do understand that it is difficult to change the protocol and/or
behavior after the fact, or that performance considerations are in the
way of doing so.

> To partially solve this problem, make the number of entries that the VM
> will attempt to free at each iteration tunable.  The default is still
> 10, but it can be overridden at compile-time (via Kconfig), boot-time
> (via a kernel command-line option), or runtime (via sysfs).

Is there really a need to have another Kconfig option for this? AFAICS
only QubesOS is affected by the problem you are trying to solve. I don't
see why you can't use the command-line option or sysfs node to set the
higher reclaim batch size.

> 
> Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")

I don't think this "Fixes:" tag is appropriate. The mentioned commit didn't
have a bug. You are adding new functionality on top of it.

> Cc: stable@vger.kernel.org
> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> ---
>   drivers/xen/Kconfig       | 12 ++++++++++++
>   drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
>   2 files changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
>   	  This value is used to allocate enough space in internal
>   	  tables needed for physical memory administration.
>   
> +config XEN_GRANTS_RECLAIM_PER_ITERATION
> +	int "Default number of grant entries to reclaim per iteration"
> +	default 10
> +	range 10 4294967295
> +	help
> +	  This sets the default value for the grant_table.free_per_iteration
> +	  kernel command line option, which sets the number of grants that
> +	  Linux will try to reclaim at once.  The default is 10, but
> +	  workloads that make heavy use of gntalloc will likely want to
> +	  increase this.  The current value can be accessed and/or modified
> +	  via /sys/module/grant_table/parameters/free_per_iteration.
> +

Apart from the fact that I don't like adding a new Kconfig option, the help
text is not telling the true story. Heavy use of gntalloc isn't to blame, but
the fact that your PV-device implementation is based on the reclaim
functionality. TBH, someone not familiar with the grant reclaim will have no
chance to select a sensible value based on your help text.

>   config XEN_SCRUB_PAGES_DEFAULT
>   	bool "Scrub pages before returning them to system by default"
>   	depends on XEN_BALLOON
> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
> --- a/drivers/xen/grant-table.c
> +++ b/drivers/xen/grant-table.c
> @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
>   static void gnttab_handle_deferred(struct timer_list *);
>   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>   
> +static atomic64_t deferred_count;
> +static atomic64_t leaked_count;
> +static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
> +
>   static void gnttab_handle_deferred(struct timer_list *unused)
>   {
> -	unsigned int nr = 10;
> +	unsigned int nr = READ_ONCE(free_per_iteration);
> +	const bool ignore_limit = nr == 0;
>   	struct deferred_entry *first = NULL;
>   	unsigned long flags;
> +	size_t freed = 0;
>   
>   	spin_lock_irqsave(&gnttab_list_lock, flags);
> -	while (nr--) {
> +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
>   		struct deferred_entry *entry
>   			= list_first_entry(&deferred_list,
>   					   struct deferred_entry, list);
> @@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
>   		list_del(&entry->list);
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
>   		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> +			uint64_t ret = atomic64_sub_return(1, &deferred_count);
>   			put_free_entry(entry->ref);
> -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> -				 entry->ref, page_to_pfn(entry->page));
> +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> +				 entry->ref, page_to_pfn(entry->page),
> +				 (unsigned long long)ret);

I'd prefer not to issue lots of prints (being it debug or info ones) if the
reclaim is expected to happen with a specific PV-device.

My preferred solution would be a per-device flag, but at least you should
switch to pr_*_ratelimited(). Same applies further down.

>   			put_page(entry->page);
> +			freed++;
>   			kfree(entry);
>   			entry = NULL;
>   		} else {
> @@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
>   		spin_lock_irqsave(&gnttab_list_lock, flags);
>   		if (entry)
>   			list_add_tail(&entry->list, &deferred_list);
> -		else if (list_empty(&deferred_list))
> -			break;
>   	}
> -	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
> +	if (list_empty(&deferred_list))
> +		WARN_ON(atomic64_read(&deferred_count));
> +	else if (!timer_pending(&deferred_timer)) {
>   		deferred_timer.expires = jiffies + HZ;
>   		add_timer(&deferred_timer);
>   	}
>   	spin_unlock_irqrestore(&gnttab_list_lock, flags);
> +	pr_debug("Freed %zu references", freed);
>   }
>   
>   static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
>   {
>   	struct deferred_entry *entry;
>   	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> -	const char *what = KERN_WARNING "leaking";
> +	uint64_t leaked, deferred;
>   
>   	entry = kmalloc(sizeof(*entry), gfp);
>   	if (!page) {
> @@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
>   			add_timer(&deferred_timer);
>   		}
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> -		what = KERN_DEBUG "deferring";
> +		deferred = atomic64_add_return(1, &deferred_count);
> +		leaked = atomic64_read(&leaked_count);
> +		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> +	} else {
> +		deferred = atomic64_read(&deferred_count);
> +		leaked = atomic64_add_return(1, &leaked_count);
> +		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
>   	}
> -	printk("%s g.e. %#x (pfn %#lx)\n",
> -	       what, ref, page ? page_to_pfn(page) : -1);
>   }
>   
> +module_param(free_per_iteration, uint, 0600);
> +

Could you please move the parameter definition closer to the related
variable definition?


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-12  6:27 ` Juergen Gross
@ 2023-06-12  6:45   ` Juergen Gross
  2023-06-12 20:09   ` Demi Marie Obenour
  1 sibling, 0 replies; 13+ messages in thread
From: Juergen Gross @ 2023-06-12  6:45 UTC (permalink / raw)
  To: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Xen developer discussion, Linux Kernel Mailing List, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 5190 bytes --]

On 12.06.23 08:27, Juergen Gross wrote:
> On 10.06.23 17:32, Demi Marie Obenour wrote:
>> When a grant entry is still in use by the remote domain, Linux must put
>> it on a deferred list.
> 
> This lacks quite some context.
> 
> The main problem is related to the grant not having been unmapped after
> the end of a request, but the side granting the access is assuming this
> should be the case.
> 
> In general this means that the two sides implementing the protocol don't
> agree how it should work, or that the protocol itself has a flaw.
> 
>  > Normally, this list is very short, because
>> the PV network and block protocols expect the backend to unmap the grant
>> first.
> 
> Normally the list is just empty. Only in very rare cases like premature
> PV frontend module unloading it is expected to see cases of deferred
> grant reclaims.
> 
>> However, Qubes OS's GUI protocol is subject to the constraints
>> of the X Window System, and as such winds up with the frontend unmapping
>> the window first.  As a result, the list can grow very large, resulting
>> in a massive memory leak and eventual VM freeze.
> 
> I do understand that it is difficult to change the protocol and/or
> behavior after the fact, or that performance considerations are in the
> way of doing so.
> 
>> To partially solve this problem, make the number of entries that the VM
>> will attempt to free at each iteration tunable.  The default is still
>> 10, but it can be overridden at compile-time (via Kconfig), boot-time
>> (via a kernel command-line option), or runtime (via sysfs).
> 
> Is there really a need to have another Kconfig option for this? AFAICS
> only QubesOS is affected by the problem you are trying to solve. I don't
> see why you can't use the command-line option or sysfs node to set the
> higher reclaim batch size.
> 
>>
>> Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> 
> I don't think this "Fixes:" tag is appropriate. The mentioned commit didn't
> have a bug. You are adding new functionality on top of it.
> 
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
>> ---
>>   drivers/xen/Kconfig       | 12 ++++++++++++
>>   drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
>>   2 files changed, 41 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
>> index 
>> d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
>> --- a/drivers/xen/Kconfig
>> +++ b/drivers/xen/Kconfig
>> @@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
>>         This value is used to allocate enough space in internal
>>         tables needed for physical memory administration.
>> +config XEN_GRANTS_RECLAIM_PER_ITERATION
>> +    int "Default number of grant entries to reclaim per iteration"
>> +    default 10
>> +    range 10 4294967295
>> +    help
>> +      This sets the default value for the grant_table.free_per_iteration
>> +      kernel command line option, which sets the number of grants that
>> +      Linux will try to reclaim at once.  The default is 10, but
>> +      workloads that make heavy use of gntalloc will likely want to
>> +      increase this.  The current value can be accessed and/or modified
>> +      via /sys/module/grant_table/parameters/free_per_iteration.
>> +
> 
> Apart from the fact that I don't like adding a new Kconfig option, the help
> text is not telling the true story. Heavy use of gntalloc isn't to blame, but
> the fact that your PV-device implementation is based on the reclaim
> functionality. TBH, someone not familiar with the grant reclaim will have no
> chance to select a sensible value based on your help text.
> 
>>   config XEN_SCRUB_PAGES_DEFAULT
>>       bool "Scrub pages before returning them to system by default"
>>       depends on XEN_BALLOON
>> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
>> index 
>> e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
>> --- a/drivers/xen/grant-table.c
>> +++ b/drivers/xen/grant-table.c
>> @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
>>   static void gnttab_handle_deferred(struct timer_list *);
>>   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>> +static atomic64_t deferred_count;
>> +static atomic64_t leaked_count;
>> +static unsigned int free_per_iteration = 
>> CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
>> +
>>   static void gnttab_handle_deferred(struct timer_list *unused)
>>   {
>> -    unsigned int nr = 10;
>> +    unsigned int nr = READ_ONCE(free_per_iteration);

Two additional remarks:

I wouldn't be opposed to raise the default number of reclaims to a higher
number, e.g. 100.

Another option would be to move the reclaim handling into a workqueue and
try to reclaim as many grants as possible in one go with a cond_resched()
call every e.g. 100 reclaims. This would remove the need for having a
reclaim upper bound.


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-12  6:27 ` Juergen Gross
  2023-06-12  6:45   ` Juergen Gross
@ 2023-06-12 20:09   ` Demi Marie Obenour
  2023-06-13  6:45     ` Juergen Gross
  1 sibling, 1 reply; 13+ messages in thread
From: Demi Marie Obenour @ 2023-06-12 20:09 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Xen developer discussion, Linux Kernel Mailing List, stable

[-- Attachment #1: Type: text/plain, Size: 10778 bytes --]

On Mon, Jun 12, 2023 at 08:27:59AM +0200, Juergen Gross wrote:
> On 10.06.23 17:32, Demi Marie Obenour wrote:
> > When a grant entry is still in use by the remote domain, Linux must put
> > it on a deferred list.
> 
> This lacks quite some context.
> 
> The main problem is related to the grant not having been unmapped after
> the end of a request, but the side granting the access is assuming this
> should be the case.

The GUI agent has relied on deferred grant reclaim for as long as it has
existed.  One could argue that doing so means that the agent is misusing
gntalloc, but this is not documented anywhere.  A better fix would be to
use IOCTL_GNTDEV_SET_UNMAP_NOTIFY in the GUI daemon.

> In general this means that the two sides implementing the protocol don't
> agree how it should work, or that the protocol itself has a flaw.

What would a better solution be?  This is going to be particularly
tricky with Wayland, as the wl_shm protocol makes absolutely no
guarantees that compositors will promptly release the mapping and
provides no way whatsoever for Wayland clients to know when this has
happened.  Relying on an LD_PRELOAD hack is not sustainable.

> > Normally, this list is very short, because
> > the PV network and block protocols expect the backend to unmap the grant
> > first.
> 
> Normally the list is just empty. Only in very rare cases like premature
> PV frontend module unloading it is expected to see cases of deferred
> grant reclaims.

In the case of a system using only properly-designed PV protocols
implemented in kernel mode I agree.  However, both libxenvchan and the
Qubes GUI protocol are implemented in user mode and this means that if
the frontend process (the one that uses gntalloc) crashes, deferred
grant reclaims will occur.  Worse, it is possible for the domain to use
the grant in a PV protocol.  If the PV backend driver then maps and
unmaps the grant and then tells the frontend driver to reclaim it, but
the backend userspace process (the one using gntdev) maps it before the
frontend does reclaim it, the frontend will think the backend is trying
to exploit XSA-396 and freeze the connection.

> > However, Qubes OS's GUI protocol is subject to the constraints
> > of the X Window System, and as such winds up with the frontend unmapping
> > the window first.  As a result, the list can grow very large, resulting
> > in a massive memory leak and eventual VM freeze.
> 
> I do understand that it is difficult to change the protocol and/or
> behavior after the fact, or that performance considerations are in the
> way of doing so.

Would the correct fix be to use IOCTL_GNTDEV_SET_UNMAP_NOTIFY?  That
would require that the agent either create a new event channel for each
window or maintain a pool of event channels, but that should be doable.
This still does not solve the problem of the frontend exiting
unexpectedly, though.

> > To partially solve this problem, make the number of entries that the VM
> > will attempt to free at each iteration tunable.  The default is still
> > 10, but it can be overridden at compile-time (via Kconfig), boot-time
> > (via a kernel command-line option), or runtime (via sysfs).
> 
> Is there really a need to have another Kconfig option for this? AFAICS
> only QubesOS is affected by the problem you are trying to solve. I don't
> see why you can't use the command-line option or sysfs node to set the
> higher reclaim batch size.

Fair.  In practice, Qubes OS will need to use the sysfs node, since
the other two do not work with in-VM kernels.

> > Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> 
> I don't think this "Fixes:" tag is appropriate. The mentioned commit didn't
> have a bug. You are adding new functionality on top of it.

I’ll drop the "Fixes:" tag, but I will keep the "Cc: stable@vger.kernel.org"
as I believe this patch meets the following criterion for stable
backport (from Documentation/process/stable-kernel-rules.rst):

    Serious issues as reported by a user of a distribution kernel may also
    be considered if they fix a notable performance or interactivity issue.

> > Cc: stable@vger.kernel.org
> > Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> > ---
> >   drivers/xen/Kconfig       | 12 ++++++++++++
> >   drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
> >   2 files changed, 41 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> > index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
> > --- a/drivers/xen/Kconfig
> > +++ b/drivers/xen/Kconfig
> > @@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
> >   	  This value is used to allocate enough space in internal
> >   	  tables needed for physical memory administration.
> > +config XEN_GRANTS_RECLAIM_PER_ITERATION
> > +	int "Default number of grant entries to reclaim per iteration"
> > +	default 10
> > +	range 10 4294967295
> > +	help
> > +	  This sets the default value for the grant_table.free_per_iteration
> > +	  kernel command line option, which sets the number of grants that
> > +	  Linux will try to reclaim at once.  The default is 10, but
> > +	  workloads that make heavy use of gntalloc will likely want to
> > +	  increase this.  The current value can be accessed and/or modified
> > +	  via /sys/module/grant_table/parameters/free_per_iteration.
> > +
> 
> Apart from the fact that I don't like adding a new Kconfig option, the help
> text is not telling the true story. Heavy use of gntalloc isn't to blame, but
> the fact that your PV-device implementation is based on the reclaim
> functionality. TBH, someone not familiar with the grant reclaim will have no
> chance to select a sensible value based on your help text.

That’s a good point.  Qubes OS will need to use the sysfs value anyway
in order to support in-VM kernels, so the Kconfig option is not really
useful.

> >   config XEN_SCRUB_PAGES_DEFAULT
> >   	bool "Scrub pages before returning them to system by default"
> >   	depends on XEN_BALLOON
> > diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> > index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
> > --- a/drivers/xen/grant-table.c
> > +++ b/drivers/xen/grant-table.c
> > @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
> >   static void gnttab_handle_deferred(struct timer_list *);
> >   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
> > +static atomic64_t deferred_count;
> > +static atomic64_t leaked_count;
> > +static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
> > +
> >   static void gnttab_handle_deferred(struct timer_list *unused)
> >   {
> > -	unsigned int nr = 10;
> > +	unsigned int nr = READ_ONCE(free_per_iteration);
> > +	const bool ignore_limit = nr == 0;
> >   	struct deferred_entry *first = NULL;
> >   	unsigned long flags;
> > +	size_t freed = 0;
> >   	spin_lock_irqsave(&gnttab_list_lock, flags);
> > -	while (nr--) {
> > +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
> >   		struct deferred_entry *entry
> >   			= list_first_entry(&deferred_list,
> >   					   struct deferred_entry, list);
> > @@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
> >   		list_del(&entry->list);
> >   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> >   		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> > +			uint64_t ret = atomic64_sub_return(1, &deferred_count);
> >   			put_free_entry(entry->ref);
> > -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> > -				 entry->ref, page_to_pfn(entry->page));
> > +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> > +				 entry->ref, page_to_pfn(entry->page),
> > +				 (unsigned long long)ret);
> 
> I'd prefer not to issue lots of prints (being it debug or info ones) if the
> reclaim is expected to happen with a specific PV-device.
> 
> My preferred solution would be a per-device flag, but at least you should
> switch to pr_*_ratelimited(). Same applies further down.

What do you mean by “per-device flag”?  These grants are allocated by
userspace using gntalloc, so there is no PV device on which the flag
could be set.

> >   			put_page(entry->page);
> > +			freed++;
> >   			kfree(entry);
> >   			entry = NULL;
> >   		} else {
> > @@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
> >   		spin_lock_irqsave(&gnttab_list_lock, flags);
> >   		if (entry)
> >   			list_add_tail(&entry->list, &deferred_list);
> > -		else if (list_empty(&deferred_list))
> > -			break;
> >   	}
> > -	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
> > +	if (list_empty(&deferred_list))
> > +		WARN_ON(atomic64_read(&deferred_count));
> > +	else if (!timer_pending(&deferred_timer)) {
> >   		deferred_timer.expires = jiffies + HZ;
> >   		add_timer(&deferred_timer);
> >   	}
> >   	spin_unlock_irqrestore(&gnttab_list_lock, flags);
> > +	pr_debug("Freed %zu references", freed);
> >   }
> >   static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
> >   {
> >   	struct deferred_entry *entry;
> >   	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> > -	const char *what = KERN_WARNING "leaking";
> > +	uint64_t leaked, deferred;
> >   	entry = kmalloc(sizeof(*entry), gfp);
> >   	if (!page) {
> > @@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
> >   			add_timer(&deferred_timer);
> >   		}
> >   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> > -		what = KERN_DEBUG "deferring";
> > +		deferred = atomic64_add_return(1, &deferred_count);
> > +		leaked = atomic64_read(&leaked_count);
> > +		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> > +			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> > +	} else {
> > +		deferred = atomic64_read(&deferred_count);
> > +		leaked = atomic64_add_return(1, &leaked_count);
> > +		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> > +			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> >   	}
> > -	printk("%s g.e. %#x (pfn %#lx)\n",
> > -	       what, ref, page ? page_to_pfn(page) : -1);
> >   }
> > +module_param(free_per_iteration, uint, 0600);
> > +
> 
> Could you please move the parameter definition closer to the related
> variable definition?

Will fix in the next version.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-12 20:09   ` Demi Marie Obenour
@ 2023-06-13  6:45     ` Juergen Gross
  2023-06-13 15:11       ` Demi Marie Obenour
  2023-06-19 13:23       ` Marek Marczykowski-Górecki
  0 siblings, 2 replies; 13+ messages in thread
From: Juergen Gross @ 2023-06-13  6:45 UTC (permalink / raw)
  To: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Xen developer discussion, Linux Kernel Mailing List, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 10030 bytes --]

On 12.06.23 22:09, Demi Marie Obenour wrote:
> On Mon, Jun 12, 2023 at 08:27:59AM +0200, Juergen Gross wrote:
>> On 10.06.23 17:32, Demi Marie Obenour wrote:
>>> When a grant entry is still in use by the remote domain, Linux must put
>>> it on a deferred list.
>>
>> This lacks quite some context.
>>
>> The main problem is related to the grant not having been unmapped after
>> the end of a request, but the side granting the access is assuming this
>> should be the case.
> 
> The GUI agent has relied on deferred grant reclaim for as long as it has
> existed.  One could argue that doing so means that the agent is misusing
> gntalloc, but this is not documented anywhere.  A better fix would be to
> use IOCTL_GNTDEV_SET_UNMAP_NOTIFY in the GUI daemon.

I don't think this is a gntalloc specific problem. You could create the same
problem with a kernel implementation.

And yes, using IOCTL_GNTDEV_SET_UNMAP_NOTIFY might be a good idea.

>> In general this means that the two sides implementing the protocol don't
>> agree how it should work, or that the protocol itself has a flaw.
> 
> What would a better solution be?  This is going to be particularly
> tricky with Wayland, as the wl_shm protocol makes absolutely no
> guarantees that compositors will promptly release the mapping and
> provides no way whatsoever for Wayland clients to know when this has
> happened.  Relying on an LD_PRELOAD hack is not sustainable.
> 
>>> Normally, this list is very short, because
>>> the PV network and block protocols expect the backend to unmap the grant
>>> first.
>>
>> Normally the list is just empty. Only in very rare cases like premature
>> PV frontend module unloading it is expected to see cases of deferred
>> grant reclaims.
> 
> In the case of a system using only properly-designed PV protocols
> implemented in kernel mode I agree.  However, both libxenvchan and the
> Qubes GUI protocol are implemented in user mode and this means that if
> the frontend process (the one that uses gntalloc) crashes, deferred
> grant reclaims will occur.

This would be the user space variant of frontend driver unloading.

> Worse, it is possible for the domain to use
> the grant in a PV protocol.  If the PV backend driver then maps and
> unmaps the grant and then tells the frontend driver to reclaim it, but
> the backend userspace process (the one using gntdev) maps it before the
> frontend does reclaim it, the frontend will think the backend is trying
> to exploit XSA-396 and freeze the connection.

Let me sum up how I understand above statement:

1. Frontend F in DomA is granting DomB access via grant X for PV-device Y.
2. DomB backend B for PV-device Y is mapping the page using grant X and uses it.
3. DomB backend B unmaps grant X and signals end of usage to DomA frontend F.
4. DomB userspace process maps grant X
5. DomA frontend F tries to reclaim grant X and fails due to DomB userspace
    mapping

So why would DomB userspace process map grant X? This would imply a malicious
process in DomB. This might be possible, but I don't see how such an attack
would relate to your problem. It could happen with any malicious userspace
program with root access in a domain running a PV backend.

> 
>>> However, Qubes OS's GUI protocol is subject to the constraints
>>> of the X Window System, and as such winds up with the frontend unmapping
>>> the window first.  As a result, the list can grow very large, resulting
>>> in a massive memory leak and eventual VM freeze.
>>
>> I do understand that it is difficult to change the protocol and/or
>> behavior after the fact, or that performance considerations are in the
>> way of doing so.
> 
> Would the correct fix be to use IOCTL_GNTDEV_SET_UNMAP_NOTIFY?  That
> would require that the agent either create a new event channel for each
> window or maintain a pool of event channels, but that should be doable.

I think this sounds like a promising idea.

> This still does not solve the problem of the frontend exiting
> unexpectedly, though.

Such cases need to be handled via deferred reclaim. You could add a flag
to struct deferred_entry when called through gntalloc_vma_close(),
indicating that this is an expected deferred reclaim not leading to
error messages.

> 
>>> To partially solve this problem, make the number of entries that the VM
>>> will attempt to free at each iteration tunable.  The default is still
>>> 10, but it can be overridden at compile-time (via Kconfig), boot-time
>>> (via a kernel command-line option), or runtime (via sysfs).
>>
>> Is there really a need to have another Kconfig option for this? AFAICS
>> only QubesOS is affected by the problem you are trying to solve. I don't
>> see why you can't use the command-line option or sysfs node to set the
>> higher reclaim batch size.
> 
> Fair.  In practice, Qubes OS will need to use the sysfs node, since
> the other two do not work with in-VM kernels.
> 
>>> Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
>>
>> I don't think this "Fixes:" tag is appropriate. The mentioned commit didn't
>> have a bug. You are adding new functionality on top of it.
> 
> I’ll drop the "Fixes:" tag, but I will keep the "Cc: stable@vger.kernel.org"
> as I believe this patch meets the following criterion for stable
> backport (from Documentation/process/stable-kernel-rules.rst):
> 
>      Serious issues as reported by a user of a distribution kernel may also
>      be considered if they fix a notable performance or interactivity issue.
> 

Sure, this seems appropriate.

>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
>>> ---
>>>    drivers/xen/Kconfig       | 12 ++++++++++++
>>>    drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
>>>    2 files changed, 41 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
>>> index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
>>> --- a/drivers/xen/Kconfig
>>> +++ b/drivers/xen/Kconfig
>>> @@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
>>>    	  This value is used to allocate enough space in internal
>>>    	  tables needed for physical memory administration.
>>> +config XEN_GRANTS_RECLAIM_PER_ITERATION
>>> +	int "Default number of grant entries to reclaim per iteration"
>>> +	default 10
>>> +	range 10 4294967295
>>> +	help
>>> +	  This sets the default value for the grant_table.free_per_iteration
>>> +	  kernel command line option, which sets the number of grants that
>>> +	  Linux will try to reclaim at once.  The default is 10, but
>>> +	  workloads that make heavy use of gntalloc will likely want to
>>> +	  increase this.  The current value can be accessed and/or modified
>>> +	  via /sys/module/grant_table/parameters/free_per_iteration.
>>> +
>>
>> Apart from the fact that I don't like adding a new Kconfig option, the help
>> text is not telling the true story. Heavy use of gntalloc isn't to blame, but
>> the fact that your PV-device implementation is based on the reclaim
>> functionality. TBH, someone not familiar with the grant reclaim will have no
>> chance to select a sensible value based on your help text.
> 
> That’s a good point.  Qubes OS will need to use the sysfs value anyway
> in order to support in-VM kernels, so the Kconfig option is not really
> useful.

Nice.

> 
>>>    config XEN_SCRUB_PAGES_DEFAULT
>>>    	bool "Scrub pages before returning them to system by default"
>>>    	depends on XEN_BALLOON
>>> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
>>> index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
>>> --- a/drivers/xen/grant-table.c
>>> +++ b/drivers/xen/grant-table.c
>>> @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
>>>    static void gnttab_handle_deferred(struct timer_list *);
>>>    static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>>> +static atomic64_t deferred_count;
>>> +static atomic64_t leaked_count;
>>> +static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
>>> +
>>>    static void gnttab_handle_deferred(struct timer_list *unused)
>>>    {
>>> -	unsigned int nr = 10;
>>> +	unsigned int nr = READ_ONCE(free_per_iteration);
>>> +	const bool ignore_limit = nr == 0;
>>>    	struct deferred_entry *first = NULL;
>>>    	unsigned long flags;
>>> +	size_t freed = 0;
>>>    	spin_lock_irqsave(&gnttab_list_lock, flags);
>>> -	while (nr--) {
>>> +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
>>>    		struct deferred_entry *entry
>>>    			= list_first_entry(&deferred_list,
>>>    					   struct deferred_entry, list);
>>> @@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
>>>    		list_del(&entry->list);
>>>    		spin_unlock_irqrestore(&gnttab_list_lock, flags);
>>>    		if (_gnttab_end_foreign_access_ref(entry->ref)) {
>>> +			uint64_t ret = atomic64_sub_return(1, &deferred_count);
>>>    			put_free_entry(entry->ref);
>>> -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
>>> -				 entry->ref, page_to_pfn(entry->page));
>>> +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
>>> +				 entry->ref, page_to_pfn(entry->page),
>>> +				 (unsigned long long)ret);
>>
>> I'd prefer not to issue lots of prints (being it debug or info ones) if the
>> reclaim is expected to happen with a specific PV-device.
>>
>> My preferred solution would be a per-device flag, but at least you should
>> switch to pr_*_ratelimited(). Same applies further down.
> 
> What do you mean by “per-device flag”?  These grants are allocated by
> userspace using gntalloc, so there is no PV device on which the flag
> could be set.

In this case the flag should relate to the file pointer used for
gntalloc_open().


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-13  6:45     ` Juergen Gross
@ 2023-06-13 15:11       ` Demi Marie Obenour
  2023-06-19 13:23       ` Marek Marczykowski-Górecki
  1 sibling, 0 replies; 13+ messages in thread
From: Demi Marie Obenour @ 2023-06-13 15:11 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk
  Cc: Xen developer discussion, Linux Kernel Mailing List, stable

[-- Attachment #1: Type: text/plain, Size: 14478 bytes --]

On Tue, Jun 13, 2023 at 08:45:31AM +0200, Juergen Gross wrote:
> On 12.06.23 22:09, Demi Marie Obenour wrote:
> > On Mon, Jun 12, 2023 at 08:27:59AM +0200, Juergen Gross wrote:
> > > On 10.06.23 17:32, Demi Marie Obenour wrote:
> > > > When a grant entry is still in use by the remote domain, Linux must put
> > > > it on a deferred list.
> > > 
> > > This lacks quite some context.
> > > 
> > > The main problem is related to the grant not having been unmapped after
> > > the end of a request, but the side granting the access is assuming this
> > > should be the case.
> > 
> > The GUI agent has relied on deferred grant reclaim for as long as it has
> > existed.  One could argue that doing so means that the agent is misusing
> > gntalloc, but this is not documented anywhere.  A better fix would be to
> > use IOCTL_GNTDEV_SET_UNMAP_NOTIFY in the GUI daemon.
> 
> I don't think this is a gntalloc specific problem. You could create the same
> problem with a kernel implementation.

That is true.

> And yes, using IOCTL_GNTDEV_SET_UNMAP_NOTIFY might be a good idea.
> 
> > > In general this means that the two sides implementing the protocol don't
> > > agree how it should work, or that the protocol itself has a flaw.
> > 
> > What would a better solution be?  This is going to be particularly
> > tricky with Wayland, as the wl_shm protocol makes absolutely no
> > guarantees that compositors will promptly release the mapping and
> > provides no way whatsoever for Wayland clients to know when this has
> > happened.  Relying on an LD_PRELOAD hack is not sustainable.
> > 
> > > > Normally, this list is very short, because
> > > > the PV network and block protocols expect the backend to unmap the grant
> > > > first.
> > > 
> > > Normally the list is just empty. Only in very rare cases like premature
> > > PV frontend module unloading it is expected to see cases of deferred
> > > grant reclaims.
> > 
> > In the case of a system using only properly-designed PV protocols
> > implemented in kernel mode I agree.  However, both libxenvchan and the
> > Qubes GUI protocol are implemented in user mode and this means that if
> > the frontend process (the one that uses gntalloc) crashes, deferred
> > grant reclaims will occur.
> 
> This would be the user space variant of frontend driver unloading.

Yes and no.  The main difference is that a frontend driver can prevent
itself from being unloaded while it still has grants outstanding,
whereas a user space process cannot prevent itself from being killed.
The closest a user space process can get to making itself unkillable is
to use mlockall() and mark itself ineligible for the OOM killer.  This
would likely require anything that used grants (including vchans!) to be
a privileged daemon that exposes an API over an AF_UNIX socket, which
would be a substantial change in the case of Qubes OS.

> > Worse, it is possible for the domain to use
> > the grant in a PV protocol.  If the PV backend driver then maps and
> > unmaps the grant and then tells the frontend driver to reclaim it, but
> > the backend userspace process (the one using gntdev) maps it before the
> > frontend does reclaim it, the frontend will think the backend is trying
> > to exploit XSA-396 and freeze the connection.
> 
> Let me sum up how I understand above statement:
> 
> 1. Frontend F in DomA is granting DomB access via grant X for PV-device Y.
> 2. DomB backend B for PV-device Y is mapping the page using grant X and uses it.
> 3. DomB backend B unmaps grant X and signals end of usage to DomA frontend F.
> 4. DomB userspace process maps grant X
> 5. DomA frontend F tries to reclaim grant X and fails due to DomB userspace
>    mapping

All of these are correct, but there is some context that is missing.

> So why would DomB userspace process map grant X? This would imply a malicious
> process in DomB. This might be possible, but I don't see how such an attack
> would relate to your problem. It could happen with any malicious userspace
> program with root access in a domain running a PV backend.

The complete sequence of events is as follows:

 1. DomA userspace process C allocates grant X via gntalloc.
 2. DomA userspace process C tells DomB userspace process to map grant X
    via e.g. a vchan message.
 3. DomB userspace process S receives message telling it to map grant X.
 4. (optional) DomB userspace process S checks that DomA userspace
    process C is still alive and finds that it is.
 5. DomB userspace process S gets preempted.
 6. DomA userspace process C exits or is killed.
 7. DomA kernel KA deallocates grant X.
 8. Frontend F in DomA grants DomB access via grant X for PV-device Y.
 9. DomB backend B for PV-device Y is mapping the page using grant X and uses it.
10. DomB backend B unmaps grant X and signals end of usage to DomA frontend F.
11. DomB userspace process S is scheduled again.  It finally gets the
    chance to use gntdev to map grant X.
12. DomA frontend F tries to reclaim grant X and fails due to DomB userspace
    mapping by S.

Given the current kernel API, I’m not aware of any way to avoid this race
unless C (and possibly S) are unkillable.  Even with the check in step 4
there is still a TOCTOU problem.  Neither kernel has any visibility into
the userspace messaging protocol, so DomA has no way of knowing that S
has been told to map the grant in the future.

There are four other potential solutions I can think of.

1. If a gntalloc handle is closed while it still has mapped grants, leak
   the remaining grants.  It’s okay to reclaim the pages those grants
   refer to, but the grants themselves would not be usable until reboot.

2. Assume that “backend has not unmapped grant” is due to this race, and
   fall back to deferred reclaim instead of closing the connection.  For
   this to work, either the data would need to be discarded by F, or S
   would need to follow some sort of protocol where it always checks
   that C is alive after mapping the grant but before writing to the
   newly mapped page.

3. Implement AF_VSOCK or a similar protocol for kernel-mediated inter-VM
   communication, and send grants as control messages.  This means that
   both kernels are aware of grant lifetimes and can avoid this race.

4. Same as 3, except that grant table entries are replaced by
   capabilities that must be sent over Argo.  This is obviously the best
   long-term solution, but it also requires massive changes in all PV
   protocols, so it isn’t practical in the short term.

Additionally, I wonder if grant operations should require CAP_SYS_RAWIO.
Userspace use of grants very much seems like a raw I/O operation,
inasmuch as it is bypassing the protections Linux normally provides.

> > > > However, Qubes OS's GUI protocol is subject to the constraints
> > > > of the X Window System, and as such winds up with the frontend unmapping
> > > > the window first.  As a result, the list can grow very large, resulting
> > > > in a massive memory leak and eventual VM freeze.
> > > 
> > > I do understand that it is difficult to change the protocol and/or
> > > behavior after the fact, or that performance considerations are in the
> > > way of doing so.
> > 
> > Would the correct fix be to use IOCTL_GNTDEV_SET_UNMAP_NOTIFY?  That
> > would require that the agent either create a new event channel for each
> > window or maintain a pool of event channels, but that should be doable.
> 
> I think this sounds like a promising idea.

Me too.

> > This still does not solve the problem of the frontend exiting
> > unexpectedly, though.
> 
> Such cases need to be handled via deferred reclaim. You could add a flag
> to struct deferred_entry when called through gntalloc_vma_close(),
> indicating that this is an expected deferred reclaim not leading to
> error messages.

Deferred reclaim can handle this most of the time, but there is still a
race (see above).

> > > > To partially solve this problem, make the number of entries that the VM
> > > > will attempt to free at each iteration tunable.  The default is still
> > > > 10, but it can be overridden at compile-time (via Kconfig), boot-time
> > > > (via a kernel command-line option), or runtime (via sysfs).
> > > 
> > > Is there really a need to have another Kconfig option for this? AFAICS
> > > only QubesOS is affected by the problem you are trying to solve. I don't
> > > see why you can't use the command-line option or sysfs node to set the
> > > higher reclaim batch size.
> > 
> > Fair.  In practice, Qubes OS will need to use the sysfs node, since
> > the other two do not work with in-VM kernels.
> > 
> > > > Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> > > 
> > > I don't think this "Fixes:" tag is appropriate. The mentioned commit didn't
> > > have a bug. You are adding new functionality on top of it.
> > 
> > I’ll drop the "Fixes:" tag, but I will keep the "Cc: stable@vger.kernel.org"
> > as I believe this patch meets the following criterion for stable
> > backport (from Documentation/process/stable-kernel-rules.rst):
> > 
> >      Serious issues as reported by a user of a distribution kernel may also
> >      be considered if they fix a notable performance or interactivity issue.
> > 
> 
> Sure, this seems appropriate.

Thank you.

> > > > Cc: stable@vger.kernel.org
> > > > Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> > > > ---
> > > >    drivers/xen/Kconfig       | 12 ++++++++++++
> > > >    drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
> > > >    2 files changed, 41 insertions(+), 11 deletions(-)
> > > > 
> > > > diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> > > > index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
> > > > --- a/drivers/xen/Kconfig
> > > > +++ b/drivers/xen/Kconfig
> > > > @@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
> > > >    	  This value is used to allocate enough space in internal
> > > >    	  tables needed for physical memory administration.
> > > > +config XEN_GRANTS_RECLAIM_PER_ITERATION
> > > > +	int "Default number of grant entries to reclaim per iteration"
> > > > +	default 10
> > > > +	range 10 4294967295
> > > > +	help
> > > > +	  This sets the default value for the grant_table.free_per_iteration
> > > > +	  kernel command line option, which sets the number of grants that
> > > > +	  Linux will try to reclaim at once.  The default is 10, but
> > > > +	  workloads that make heavy use of gntalloc will likely want to
> > > > +	  increase this.  The current value can be accessed and/or modified
> > > > +	  via /sys/module/grant_table/parameters/free_per_iteration.
> > > > +
> > > 
> > > Apart from the fact that I don't like adding a new Kconfig option, the help
> > > text is not telling the true story. Heavy use of gntalloc isn't to blame, but
> > > the fact that your PV-device implementation is based on the reclaim
> > > functionality. TBH, someone not familiar with the grant reclaim will have no
> > > chance to select a sensible value based on your help text.
> > 
> > That’s a good point.  Qubes OS will need to use the sysfs value anyway
> > in order to support in-VM kernels, so the Kconfig option is not really
> > useful.
> 
> Nice.

Yeah, having one less Kconfig option is nice.

> > > >    config XEN_SCRUB_PAGES_DEFAULT
> > > >    	bool "Scrub pages before returning them to system by default"
> > > >    	depends on XEN_BALLOON
> > > > diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> > > > index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
> > > > --- a/drivers/xen/grant-table.c
> > > > +++ b/drivers/xen/grant-table.c
> > > > @@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
> > > >    static void gnttab_handle_deferred(struct timer_list *);
> > > >    static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
> > > > +static atomic64_t deferred_count;
> > > > +static atomic64_t leaked_count;
> > > > +static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
> > > > +
> > > >    static void gnttab_handle_deferred(struct timer_list *unused)
> > > >    {
> > > > -	unsigned int nr = 10;
> > > > +	unsigned int nr = READ_ONCE(free_per_iteration);
> > > > +	const bool ignore_limit = nr == 0;
> > > >    	struct deferred_entry *first = NULL;
> > > >    	unsigned long flags;
> > > > +	size_t freed = 0;
> > > >    	spin_lock_irqsave(&gnttab_list_lock, flags);
> > > > -	while (nr--) {
> > > > +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
> > > >    		struct deferred_entry *entry
> > > >    			= list_first_entry(&deferred_list,
> > > >    					   struct deferred_entry, list);
> > > > @@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
> > > >    		list_del(&entry->list);
> > > >    		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> > > >    		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> > > > +			uint64_t ret = atomic64_sub_return(1, &deferred_count);
> > > >    			put_free_entry(entry->ref);
> > > > -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> > > > -				 entry->ref, page_to_pfn(entry->page));
> > > > +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> > > > +				 entry->ref, page_to_pfn(entry->page),
> > > > +				 (unsigned long long)ret);
> > > 
> > > I'd prefer not to issue lots of prints (being it debug or info ones) if the
> > > reclaim is expected to happen with a specific PV-device.
> > > 
> > > My preferred solution would be a per-device flag, but at least you should
> > > switch to pr_*_ratelimited(). Same applies further down.
> > 
> > What do you mean by “per-device flag”?  These grants are allocated by
> > userspace using gntalloc, so there is no PV device on which the flag
> > could be set.
> 
> In this case the flag should relate to the file pointer used for
> gntalloc_open().

What about adding a warning if a userspace process exits without having
cleaned up all of its grants, and a module parameter that controls
whether the remaining grants are reclaimed or leaked?
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-06-13  6:45     ` Juergen Gross
  2023-06-13 15:11       ` Demi Marie Obenour
@ 2023-06-19 13:23       ` Marek Marczykowski-Górecki
  1 sibling, 0 replies; 13+ messages in thread
From: Marek Marczykowski-Górecki @ 2023-06-19 13:23 UTC (permalink / raw)
  To: Juergen Gross
  Cc: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko,
	Jan Beulich, Konrad Rzeszutek Wilk, Xen developer discussion,
	Linux Kernel Mailing List, stable

[-- Attachment #1: Type: text/plain, Size: 3138 bytes --]

On Tue, Jun 13, 2023 at 08:45:31AM +0200, Juergen Gross wrote:
> On 12.06.23 22:09, Demi Marie Obenour wrote:
> > On Mon, Jun 12, 2023 at 08:27:59AM +0200, Juergen Gross wrote:
> > > On 10.06.23 17:32, Demi Marie Obenour wrote:
> > > > When a grant entry is still in use by the remote domain, Linux must put
> > > > it on a deferred list.
> > > 
> > > This lacks quite some context.
> > > 
> > > The main problem is related to the grant not having been unmapped after
> > > the end of a request, but the side granting the access is assuming this
> > > should be the case.
> > 
> > The GUI agent has relied on deferred grant reclaim for as long as it has
> > existed.  One could argue that doing so means that the agent is misusing
> > gntalloc, but this is not documented anywhere.  A better fix would be to
> > use IOCTL_GNTDEV_SET_UNMAP_NOTIFY in the GUI daemon.
> 
> I don't think this is a gntalloc specific problem. You could create the same
> problem with a kernel implementation.
> 
> And yes, using IOCTL_GNTDEV_SET_UNMAP_NOTIFY might be a good idea.

Just a comment to this one: while in practice it might work most of the
time, in theory there could be hundreds or even thousands of mappings at
the same time and event channels seems to be too scarce resource to get
unmap notification for all of those (even if you'd use an event channel
per grant area, not individual page).
As said to Demi elsewhere, our GUI protocol might have a way to signal
when unmapping happened, but the backend (the side mapping the grants)
would need to know when they actually were unmapped by another process
(in some configurations, like Wayland mentioned below, the process
itself doesn't give us this info). Ideally, for this we'd need something
similar to IOCTL_GNTDEV_SET_UNMAP_NOTIFY that notifies the mapping side, not
the other domain. I have no idea how such API should look (just
poll()/read() on the gntdev FD used for that IOCTL?) or whether it's
really worth adding such API.
Or maybe something like this already exists? In worst case, even polling
for this info (so - a way to check if process P has grant G from domain
D still mapped) would do, although that wouldn't be ideal.

> > > In general this means that the two sides implementing the protocol don't
> > > agree how it should work, or that the protocol itself has a flaw.
> > 
> > What would a better solution be?  This is going to be particularly
> > tricky with Wayland, as the wl_shm protocol makes absolutely no
> > guarantees that compositors will promptly release the mapping and
> > provides no way whatsoever for Wayland clients to know when this has
> > happened.  Relying on an LD_PRELOAD hack is not sustainable.


Anyway, besides trying to fix our usage of grant tables, I think it's
still worthwhile to improve deferred reclaim, either by making the limit
per iteration configurable (sysfs node / writable module parameter is
fine) or by making the limit not needed by using workqueue. The former
sounds like a smaller code change.

-- 
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-02-14  7:51     ` Juergen Gross
@ 2023-02-14 15:08       ` Demi Marie Obenour
  0 siblings, 0 replies; 13+ messages in thread
From: Demi Marie Obenour @ 2023-02-14 15:08 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Marek Marczykowski-Górecki, xen-devel, linux-kernel, stable

[-- Attachment #1: Type: text/plain, Size: 4802 bytes --]

On Tue, Feb 14, 2023 at 08:51:09AM +0100, Juergen Gross wrote:
> On 13.02.23 22:01, Demi Marie Obenour wrote:
> > On Mon, Feb 13, 2023 at 10:26:11AM +0100, Juergen Gross wrote:
> > > On 07.02.23 03:10, Demi Marie Obenour wrote:
> > > > When a grant entry is still in use by the remote domain, Linux must put
> > > > it on a deferred list.  Normally, this list is very short, because
> > > > the PV network and block protocols expect the backend to unmap the grant
> > > > first.  However, Qubes OS's GUI protocol is subject to the constraints
> > > > of the X Window System, and as such winds up with the frontend unmapping
> > > > the window first.  As a result, the list can grow very large, resulting
> > > > in a massive memory leak and eventual VM freeze.
> > > > 
> > > > Fix this problem by bumping the number of entries that the VM will
> > > > attempt to free at each iteration to 10000.  This is an ugly hack that
> > > > may well make a denial of service easier, but for Qubes OS that is less
> > > > bad than the problem Qubes OS users are facing today.
> > > 
> > > > There really
> > > > needs to be a way for a frontend to be notified when the backend has
> > > > unmapped the grants.
> > > 
> > > Please remove this sentence from the commit message, or move it below the
> > > "---" marker.
> > 
> > Will fix in v2.
> > 
> > > There are still some flag bits unallocated in struct grant_entry_v1 or
> > > struct grant_entry_header. You could suggest some patches for Xen to use
> > > one of the bits as a marker to get an event from the hypervisor if a
> > > grant with such a bit set has been unmapped.
> > 
> > That is indeed a good idea.  There are other problems with the grant
> > interface as well, but those can be dealt with later.
> > 
> > > I have no idea, whether such an interface would be accepted by the
> > > maintainers, though.
> > > 
> > > > Additionally, a module parameter is provided to
> > > > allow tuning the reclaim speed.
> > > > 
> > > > The code previously used printk(KERN_DEBUG) whenever it had to defer
> > > > reclaiming a page because the grant was still mapped.  This resulted in
> > > > a large volume of log messages that bothered users.  Use pr_debug
> > > > instead, which suppresses the messages by default.  Developers can
> > > > enable them using the dynamic debug mechanism.
> > > > 
> > > > Fixes: QubesOS/qubes-issues#7410 (memory leak)
> > > > Fixes: QubesOS/qubes-issues#7359 (excessive logging)
> > > > Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> > > > Cc: stable@vger.kernel.org
> > > > Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> > > > ---
> > > > Anyone have suggestions for improving the grant mechanism?  Argo isn't
> > > > a good option, as in the GUI protocol there are substantial performance
> > > > wins to be had by using true shared memory.  Resending as I forgot the
> > > > Signed-off-by on the first submission.  Sorry about that.
> > > > 
> > > >    drivers/xen/grant-table.c | 2 +-
> > > >    1 file changed, 1 insertion(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> > > > index 5c83d41..2c2faa7 100644
> > > > --- a/drivers/xen/grant-table.c
> > > > +++ b/drivers/xen/grant-table.c
> > > > @@ -355,14 +355,20 @@
> > > >    static void gnttab_handle_deferred(struct timer_list *);
> > > >    static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
> > > > +static atomic64_t deferred_count;
> > > > +static atomic64_t leaked_count;
> > > > +static unsigned int free_per_iteration = 10000;
> > > 
> > > As you are adding a kernel parameter to change this value, please set the
> > > default to a value not potentially causing any DoS problems. Qubes OS can
> > > still use a higher value then.
> > 
> > Do you have any suggestions?  I don’t know if this is actually a DoS
> > concern anymore.  Shrinking the interval between iterations would be.
> 
> Why don't you use today's value of 10 for the default?

Will do.  I now remember that the DoS concern is that the kernel could
be made to use excess CPU trying and failing to reclaim memory.

> > > > +
> > > >    static void gnttab_handle_deferred(struct timer_list *unused)
> > > >    {
> > > > -	unsigned int nr = 10;
> > > > +	unsigned int nr = READ_ONCE(free_per_iteration);
> > > 
> > > I don't see why you are needing READ_ONCE() here.
> > 
> > free_per_iteration can be concurrently modified via sysfs.
> 
> My remark was based on the wrong assumption that ignore_limit could be
> dropped.

Even if ignore_limit could not be dropped, READ_ONCE is still necessary
to avoid a data race.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-02-13 21:01   ` Demi Marie Obenour
@ 2023-02-14  7:51     ` Juergen Gross
  2023-02-14 15:08       ` Demi Marie Obenour
  0 siblings, 1 reply; 13+ messages in thread
From: Juergen Gross @ 2023-02-14  7:51 UTC (permalink / raw)
  To: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Marek Marczykowski-Górecki, xen-devel, linux-kernel, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 4668 bytes --]

On 13.02.23 22:01, Demi Marie Obenour wrote:
> On Mon, Feb 13, 2023 at 10:26:11AM +0100, Juergen Gross wrote:
>> On 07.02.23 03:10, Demi Marie Obenour wrote:
>>> When a grant entry is still in use by the remote domain, Linux must put
>>> it on a deferred list.  Normally, this list is very short, because
>>> the PV network and block protocols expect the backend to unmap the grant
>>> first.  However, Qubes OS's GUI protocol is subject to the constraints
>>> of the X Window System, and as such winds up with the frontend unmapping
>>> the window first.  As a result, the list can grow very large, resulting
>>> in a massive memory leak and eventual VM freeze.
>>>
>>> Fix this problem by bumping the number of entries that the VM will
>>> attempt to free at each iteration to 10000.  This is an ugly hack that
>>> may well make a denial of service easier, but for Qubes OS that is less
>>> bad than the problem Qubes OS users are facing today.
>>
>>> There really
>>> needs to be a way for a frontend to be notified when the backend has
>>> unmapped the grants.
>>
>> Please remove this sentence from the commit message, or move it below the
>> "---" marker.
> 
> Will fix in v2.
> 
>> There are still some flag bits unallocated in struct grant_entry_v1 or
>> struct grant_entry_header. You could suggest some patches for Xen to use
>> one of the bits as a marker to get an event from the hypervisor if a
>> grant with such a bit set has been unmapped.
> 
> That is indeed a good idea.  There are other problems with the grant
> interface as well, but those can be dealt with later.
> 
>> I have no idea, whether such an interface would be accepted by the
>> maintainers, though.
>>
>>> Additionally, a module parameter is provided to
>>> allow tuning the reclaim speed.
>>>
>>> The code previously used printk(KERN_DEBUG) whenever it had to defer
>>> reclaiming a page because the grant was still mapped.  This resulted in
>>> a large volume of log messages that bothered users.  Use pr_debug
>>> instead, which suppresses the messages by default.  Developers can
>>> enable them using the dynamic debug mechanism.
>>>
>>> Fixes: QubesOS/qubes-issues#7410 (memory leak)
>>> Fixes: QubesOS/qubes-issues#7359 (excessive logging)
>>> Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
>>> ---
>>> Anyone have suggestions for improving the grant mechanism?  Argo isn't
>>> a good option, as in the GUI protocol there are substantial performance
>>> wins to be had by using true shared memory.  Resending as I forgot the
>>> Signed-off-by on the first submission.  Sorry about that.
>>>
>>>    drivers/xen/grant-table.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
>>> index 5c83d41..2c2faa7 100644
>>> --- a/drivers/xen/grant-table.c
>>> +++ b/drivers/xen/grant-table.c
>>> @@ -355,14 +355,20 @@
>>>    static void gnttab_handle_deferred(struct timer_list *);
>>>    static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>>> +static atomic64_t deferred_count;
>>> +static atomic64_t leaked_count;
>>> +static unsigned int free_per_iteration = 10000;
>>
>> As you are adding a kernel parameter to change this value, please set the
>> default to a value not potentially causing any DoS problems. Qubes OS can
>> still use a higher value then.
> 
> Do you have any suggestions?  I don’t know if this is actually a DoS
> concern anymore.  Shrinking the interval between iterations would be.

Why don't you use today's value of 10 for the default?

> 
>>> +
>>>    static void gnttab_handle_deferred(struct timer_list *unused)
>>>    {
>>> -	unsigned int nr = 10;
>>> +	unsigned int nr = READ_ONCE(free_per_iteration);
>>
>> I don't see why you are needing READ_ONCE() here.
> 
> free_per_iteration can be concurrently modified via sysfs.

My remark was based on the wrong assumption that ignore_limit could be
dropped.

> 
>>> +	const bool ignore_limit = nr == 0;
>>
>> I don't think you need this extra variable, if you would ...
>>
>>>    	struct deferred_entry *first = NULL;
>>>    	unsigned long flags;
>>> +	size_t freed = 0;
>>>    	spin_lock_irqsave(&gnttab_list_lock, flags);
>>> -	while (nr--) {
>>> +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
>>
>> ... change this to "while ((!nr || nr--) ...".
> 
> nr-- evaluates to the old value of nr, so "while ((!nr | nr--)) ..." is
> an infinite loop.

Ah, right.


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-02-13  9:26 ` Juergen Gross
@ 2023-02-13 21:01   ` Demi Marie Obenour
  2023-02-14  7:51     ` Juergen Gross
  0 siblings, 1 reply; 13+ messages in thread
From: Demi Marie Obenour @ 2023-02-13 21:01 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Marek Marczykowski-Górecki, xen-devel, linux-kernel, stable

[-- Attachment #1: Type: text/plain, Size: 7621 bytes --]

On Mon, Feb 13, 2023 at 10:26:11AM +0100, Juergen Gross wrote:
> On 07.02.23 03:10, Demi Marie Obenour wrote:
> > When a grant entry is still in use by the remote domain, Linux must put
> > it on a deferred list.  Normally, this list is very short, because
> > the PV network and block protocols expect the backend to unmap the grant
> > first.  However, Qubes OS's GUI protocol is subject to the constraints
> > of the X Window System, and as such winds up with the frontend unmapping
> > the window first.  As a result, the list can grow very large, resulting
> > in a massive memory leak and eventual VM freeze.
> > 
> > Fix this problem by bumping the number of entries that the VM will
> > attempt to free at each iteration to 10000.  This is an ugly hack that
> > may well make a denial of service easier, but for Qubes OS that is less
> > bad than the problem Qubes OS users are facing today.
> 
> > There really
> > needs to be a way for a frontend to be notified when the backend has
> > unmapped the grants.
> 
> Please remove this sentence from the commit message, or move it below the
> "---" marker.

Will fix in v2.

> There are still some flag bits unallocated in struct grant_entry_v1 or
> struct grant_entry_header. You could suggest some patches for Xen to use
> one of the bits as a marker to get an event from the hypervisor if a
> grant with such a bit set has been unmapped.

That is indeed a good idea.  There are other problems with the grant
interface as well, but those can be dealt with later.

> I have no idea, whether such an interface would be accepted by the
> maintainers, though.
> 
> > Additionally, a module parameter is provided to
> > allow tuning the reclaim speed.
> > 
> > The code previously used printk(KERN_DEBUG) whenever it had to defer
> > reclaiming a page because the grant was still mapped.  This resulted in
> > a large volume of log messages that bothered users.  Use pr_debug
> > instead, which suppresses the messages by default.  Developers can
> > enable them using the dynamic debug mechanism.
> > 
> > Fixes: QubesOS/qubes-issues#7410 (memory leak)
> > Fixes: QubesOS/qubes-issues#7359 (excessive logging)
> > Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> > ---
> > Anyone have suggestions for improving the grant mechanism?  Argo isn't
> > a good option, as in the GUI protocol there are substantial performance
> > wins to be had by using true shared memory.  Resending as I forgot the
> > Signed-off-by on the first submission.  Sorry about that.
> > 
> >   drivers/xen/grant-table.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> > index 5c83d41..2c2faa7 100644
> > --- a/drivers/xen/grant-table.c
> > +++ b/drivers/xen/grant-table.c
> > @@ -355,14 +355,20 @@
> >   static void gnttab_handle_deferred(struct timer_list *);
> >   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
> > +static atomic64_t deferred_count;
> > +static atomic64_t leaked_count;
> > +static unsigned int free_per_iteration = 10000;
> 
> As you are adding a kernel parameter to change this value, please set the
> default to a value not potentially causing any DoS problems. Qubes OS can
> still use a higher value then.

Do you have any suggestions?  I don’t know if this is actually a DoS
concern anymore.  Shrinking the interval between iterations would be.

> > +
> >   static void gnttab_handle_deferred(struct timer_list *unused)
> >   {
> > -	unsigned int nr = 10;
> > +	unsigned int nr = READ_ONCE(free_per_iteration);
> 
> I don't see why you are needing READ_ONCE() here.

free_per_iteration can be concurrently modified via sysfs.

> > +	const bool ignore_limit = nr == 0;
> 
> I don't think you need this extra variable, if you would ...
> 
> >   	struct deferred_entry *first = NULL;
> >   	unsigned long flags;
> > +	size_t freed = 0;
> >   	spin_lock_irqsave(&gnttab_list_lock, flags);
> > -	while (nr--) {
> > +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
> 
> ... change this to "while ((!nr || nr--) ...".

nr-- evaluates to the old value of nr, so "while ((!nr | nr--)) ..." is
an infinite loop.

> >   		struct deferred_entry *entry
> >   			= list_first_entry(&deferred_list,
> >   					   struct deferred_entry, list);
> > @@ -372,10 +378,13 @@
> >   		list_del(&entry->list);
> >   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> >   		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> > +			uint64_t ret = atomic64_sub_return(1, &deferred_count);
> 
> Use atomic64_dec_return()?

Will fix in v2.

> >   			put_free_entry(entry->ref);
> > -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> > -				 entry->ref, page_to_pfn(entry->page));
> > +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> > +				 entry->ref, page_to_pfn(entry->page),
> > +				 (unsigned long long)ret);
> 
> Is each single instance of this message really needed?

I’m not sure.

> If not, I'd suggest to use pr_debug_ratelimited() instead.

Fair.

> >   			put_page(entry->page);
> > +			freed++;
> >   			kfree(entry);
> >   			entry = NULL;
> >   		} else {
> > @@ -387,14 +396,15 @@
> >   		spin_lock_irqsave(&gnttab_list_lock, flags);
> >   		if (entry)
> >   			list_add_tail(&entry->list, &deferred_list);
> > -		else if (list_empty(&deferred_list))
> > -			break;
> >   	}
> > -	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
> > +	if (list_empty(&deferred_list))
> > +		WARN_ON(atomic64_read(&deferred_count));
> > +	else if (!timer_pending(&deferred_timer)) {
> >   		deferred_timer.expires = jiffies + HZ;
> >   		add_timer(&deferred_timer);
> >   	}
> >   	spin_unlock_irqrestore(&gnttab_list_lock, flags);
> > +	pr_debug("Freed %zu references", freed);
> >   }
> >   static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
> > @@ -402,7 +412,7 @@
> >   {
> >   	struct deferred_entry *entry;
> >   	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> > -	const char *what = KERN_WARNING "leaking";
> > +	uint64_t leaked, deferred;
> >   	entry = kmalloc(sizeof(*entry), gfp);
> >   	if (!page) {
> > @@ -426,12 +436,20 @@
> >   			add_timer(&deferred_timer);
> >   		}
> >   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> > -		what = KERN_DEBUG "deferring";
> > +		deferred = atomic64_add_return(1, &deferred_count);
> > +		leaked = atomic64_read(&leaked_count);
> > +		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> > +			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> > +	} else {
> > +		deferred = atomic64_read(&deferred_count);
> > +		leaked = atomic64_add_return(1, &leaked_count);
> > +		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> > +			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> 
> Again, maybe use the ratelimited variants of pr_debug() and pr_warn()?

Will fix in v2 unless someone objects.

> >   	}
> > -	printk("%s g.e. %#x (pfn %#lx)\n",
> > -	       what, ref, page ? page_to_pfn(page) : -1);
> >   }
> > +module_param(free_per_iteration, uint, 0600);
> 
> Please add the parameter to Documentation/admin-guide/kernel-parameters.txt
> 
> 
> Juergen

Will fix in v2.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] xen: speed up grant-table reclaim
  2023-02-07  2:10 Demi Marie Obenour
@ 2023-02-13  9:26 ` Juergen Gross
  2023-02-13 21:01   ` Demi Marie Obenour
  0 siblings, 1 reply; 13+ messages in thread
From: Juergen Gross @ 2023-02-13  9:26 UTC (permalink / raw)
  To: Demi Marie Obenour, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Marek Marczykowski-Górecki, xen-devel, linux-kernel, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 6640 bytes --]

On 07.02.23 03:10, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list.  Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first.  However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first.  As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.
> 
> Fix this problem by bumping the number of entries that the VM will
> attempt to free at each iteration to 10000.  This is an ugly hack that
> may well make a denial of service easier, but for Qubes OS that is less
> bad than the problem Qubes OS users are facing today.

> There really
> needs to be a way for a frontend to be notified when the backend has
> unmapped the grants.

Please remove this sentence from the commit message, or move it below the
"---" marker.

There are still some flag bits unallocated in struct grant_entry_v1 or
struct grant_entry_header. You could suggest some patches for Xen to use
one of the bits as a marker to get an event from the hypervisor if a
grant with such a bit set has been unmapped.

I have no idea, whether such an interface would be accepted by the
maintainers, though.

> Additionally, a module parameter is provided to
> allow tuning the reclaim speed.
> 
> The code previously used printk(KERN_DEBUG) whenever it had to defer
> reclaiming a page because the grant was still mapped.  This resulted in
> a large volume of log messages that bothered users.  Use pr_debug
> instead, which suppresses the messages by default.  Developers can
> enable them using the dynamic debug mechanism.
> 
> Fixes: QubesOS/qubes-issues#7410 (memory leak)
> Fixes: QubesOS/qubes-issues#7359 (excessive logging)
> Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
> Cc: stable@vger.kernel.org
> Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
> ---
> Anyone have suggestions for improving the grant mechanism?  Argo isn't
> a good option, as in the GUI protocol there are substantial performance
> wins to be had by using true shared memory.  Resending as I forgot the
> Signed-off-by on the first submission.  Sorry about that.
> 
>   drivers/xen/grant-table.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
> index 5c83d41..2c2faa7 100644
> --- a/drivers/xen/grant-table.c
> +++ b/drivers/xen/grant-table.c
> @@ -355,14 +355,20 @@
>   static void gnttab_handle_deferred(struct timer_list *);
>   static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
>   
> +static atomic64_t deferred_count;
> +static atomic64_t leaked_count;
> +static unsigned int free_per_iteration = 10000;

As you are adding a kernel parameter to change this value, please set the
default to a value not potentially causing any DoS problems. Qubes OS can
still use a higher value then.

> +
>   static void gnttab_handle_deferred(struct timer_list *unused)
>   {
> -	unsigned int nr = 10;
> +	unsigned int nr = READ_ONCE(free_per_iteration);

I don't see why you are needing READ_ONCE() here.

> +	const bool ignore_limit = nr == 0;

I don't think you need this extra variable, if you would ...

>   	struct deferred_entry *first = NULL;
>   	unsigned long flags;
> +	size_t freed = 0;
>   
>   	spin_lock_irqsave(&gnttab_list_lock, flags);
> -	while (nr--) {
> +	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {

... change this to "while ((!nr || nr--) ...".

>   		struct deferred_entry *entry
>   			= list_first_entry(&deferred_list,
>   					   struct deferred_entry, list);
> @@ -372,10 +378,13 @@
>   		list_del(&entry->list);
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
>   		if (_gnttab_end_foreign_access_ref(entry->ref)) {
> +			uint64_t ret = atomic64_sub_return(1, &deferred_count);

Use atomic64_dec_return()?

>   			put_free_entry(entry->ref);
> -			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
> -				 entry->ref, page_to_pfn(entry->page));
> +			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
> +				 entry->ref, page_to_pfn(entry->page),
> +				 (unsigned long long)ret);

Is each single instance of this message really needed?

If not, I'd suggest to use pr_debug_ratelimited() instead.

>   			put_page(entry->page);
> +			freed++;
>   			kfree(entry);
>   			entry = NULL;
>   		} else {
> @@ -387,14 +396,15 @@
>   		spin_lock_irqsave(&gnttab_list_lock, flags);
>   		if (entry)
>   			list_add_tail(&entry->list, &deferred_list);
> -		else if (list_empty(&deferred_list))
> -			break;
>   	}
> -	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
> +	if (list_empty(&deferred_list))
> +		WARN_ON(atomic64_read(&deferred_count));
> +	else if (!timer_pending(&deferred_timer)) {
>   		deferred_timer.expires = jiffies + HZ;
>   		add_timer(&deferred_timer);
>   	}
>   	spin_unlock_irqrestore(&gnttab_list_lock, flags);
> +	pr_debug("Freed %zu references", freed);
>   }
>   
>   static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
> @@ -402,7 +412,7 @@
>   {
>   	struct deferred_entry *entry;
>   	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> -	const char *what = KERN_WARNING "leaking";
> +	uint64_t leaked, deferred;
>   
>   	entry = kmalloc(sizeof(*entry), gfp);
>   	if (!page) {
> @@ -426,12 +436,20 @@
>   			add_timer(&deferred_timer);
>   		}
>   		spin_unlock_irqrestore(&gnttab_list_lock, flags);
> -		what = KERN_DEBUG "deferring";
> +		deferred = atomic64_add_return(1, &deferred_count);
> +		leaked = atomic64_read(&leaked_count);
> +		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
> +	} else {
> +		deferred = atomic64_read(&deferred_count);
> +		leaked = atomic64_add_return(1, &leaked_count);
> +		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
> +			ref, page ? page_to_pfn(page) : -1, deferred, leaked);

Again, maybe use the ratelimited variants of pr_debug() and pr_warn()?

>   	}
> -	printk("%s g.e. %#x (pfn %#lx)\n",
> -	       what, ref, page ? page_to_pfn(page) : -1);
>   }
>   
> +module_param(free_per_iteration, uint, 0600);

Please add the parameter to Documentation/admin-guide/kernel-parameters.txt


Juergen

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] xen: speed up grant-table reclaim
@ 2023-02-07  2:10 Demi Marie Obenour
  2023-02-13  9:26 ` Juergen Gross
  0 siblings, 1 reply; 13+ messages in thread
From: Demi Marie Obenour @ 2023-02-07  2:10 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Demi Marie Obenour, Marek Marczykowski-Górecki, xen-devel,
	linux-kernel, stable

When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list.  Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first.  However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first.  As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.

Fix this problem by bumping the number of entries that the VM will
attempt to free at each iteration to 10000.  This is an ugly hack that
may well make a denial of service easier, but for Qubes OS that is less
bad than the problem Qubes OS users are facing today.  There really
needs to be a way for a frontend to be notified when the backend has
unmapped the grants.  Additionally, a module parameter is provided to
allow tuning the reclaim speed.

The code previously used printk(KERN_DEBUG) whenever it had to defer
reclaiming a page because the grant was still mapped.  This resulted in
a large volume of log messages that bothered users.  Use pr_debug
instead, which suppresses the messages by default.  Developers can
enable them using the dynamic debug mechanism.

Fixes: QubesOS/qubes-issues#7410 (memory leak)
Fixes: QubesOS/qubes-issues#7359 (excessive logging)
Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
Cc: stable@vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi@invisiblethingslab.com>
---
Anyone have suggestions for improving the grant mechanism?  Argo isn't
a good option, as in the GUI protocol there are substantial performance
wins to be had by using true shared memory.  Resending as I forgot the
Signed-off-by on the first submission.  Sorry about that.

 drivers/xen/grant-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 5c83d41..2c2faa7 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -355,14 +355,20 @@
 static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10000;
+
 static void gnttab_handle_deferred(struct timer_list *unused)
 {
-	unsigned int nr = 10;
+	unsigned int nr = READ_ONCE(free_per_iteration);
+	const bool ignore_limit = nr == 0;
 	struct deferred_entry *first = NULL;
 	unsigned long flags;
+	size_t freed = 0;
 
 	spin_lock_irqsave(&gnttab_list_lock, flags);
-	while (nr--) {
+	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
 		struct deferred_entry *entry
 			= list_first_entry(&deferred_list,
 					   struct deferred_entry, list);
@@ -372,10 +378,13 @@
 		list_del(&entry->list);
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
 		if (_gnttab_end_foreign_access_ref(entry->ref)) {
+			uint64_t ret = atomic64_sub_return(1, &deferred_count);
 			put_free_entry(entry->ref);
-			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
-				 entry->ref, page_to_pfn(entry->page));
+			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+				 entry->ref, page_to_pfn(entry->page),
+				 (unsigned long long)ret);
 			put_page(entry->page);
+			freed++;
 			kfree(entry);
 			entry = NULL;
 		} else {
@@ -387,14 +396,15 @@
 		spin_lock_irqsave(&gnttab_list_lock, flags);
 		if (entry)
 			list_add_tail(&entry->list, &deferred_list);
-		else if (list_empty(&deferred_list))
-			break;
 	}
-	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+	if (list_empty(&deferred_list))
+		WARN_ON(atomic64_read(&deferred_count));
+	else if (!timer_pending(&deferred_timer)) {
 		deferred_timer.expires = jiffies + HZ;
 		add_timer(&deferred_timer);
 	}
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+	pr_debug("Freed %zu references", freed);
 }
 
 static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
@@ -402,7 +412,7 @@
 {
 	struct deferred_entry *entry;
 	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
-	const char *what = KERN_WARNING "leaking";
+	uint64_t leaked, deferred;
 
 	entry = kmalloc(sizeof(*entry), gfp);
 	if (!page) {
@@ -426,12 +436,20 @@
 			add_timer(&deferred_timer);
 		}
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
-		what = KERN_DEBUG "deferring";
+		deferred = atomic64_add_return(1, &deferred_count);
+		leaked = atomic64_read(&leaked_count);
+		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+	} else {
+		deferred = atomic64_read(&deferred_count);
+		leaked = atomic64_add_return(1, &leaked_count);
+		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
 	}
-	printk("%s g.e. %#x (pfn %#lx)\n",
-	       what, ref, page ? page_to_pfn(page) : -1);
 }
 
+module_param(free_per_iteration, uint, 0600);
+
 int gnttab_try_end_foreign_access(grant_ref_t ref)
 {
 	int ret = _gnttab_end_foreign_access_ref(ref);
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH] xen: speed up grant-table reclaim
@ 2023-02-07  1:38 Demi Marie Obenour
  0 siblings, 0 replies; 13+ messages in thread
From: Demi Marie Obenour @ 2023-02-07  1:38 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Demi Marie Obenour, Marek Marczykowski-Górecki, xen-devel,
	linux-kernel

When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list.  Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first.  However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first.  As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.

Fix this problem by bumping the number of entries that the VM will
attempt to free at each iteration to 10000.  This is an ugly hack that
may well make a denial of service easier, but for Qubes OS that is less
bad than the problem Qubes OS users are facing today.  There really
needs to be a way for a frontend to be notified when the backend has
unmapped the grants.  Additionally, a module parameter is provided to
allow tuning the reclaim speed.

The code previously used printk(KERN_DEBUG) whenever it had to defer
reclaiming a page because the grant was still mapped.  This resulted in
a large volume of log messages that bothered users.  Use pr_debug
instead, which suppresses the messages by default.  Developers can
enable them using the dynamic debug mechanism.

Fixes QubesOS/qubes-issues#7410 (memory leak)
Fixes QubesOS/qubes-issues#7359 (excessive logging)
---
Anyone have suggestions for improving the grant mechanism?  Argo isn't
a good option, as in the GUI protocol there are substantial performance
wins to be had by using true shared memory.

 drivers/xen/grant-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 5c83d41..2c2faa7 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -355,14 +355,20 @@
 static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10000;
+
 static void gnttab_handle_deferred(struct timer_list *unused)
 {
-	unsigned int nr = 10;
+	unsigned int nr = READ_ONCE(free_per_iteration);
+	const bool ignore_limit = nr == 0;
 	struct deferred_entry *first = NULL;
 	unsigned long flags;
+	size_t freed = 0;
 
 	spin_lock_irqsave(&gnttab_list_lock, flags);
-	while (nr--) {
+	while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
 		struct deferred_entry *entry
 			= list_first_entry(&deferred_list,
 					   struct deferred_entry, list);
@@ -372,10 +378,13 @@
 		list_del(&entry->list);
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
 		if (_gnttab_end_foreign_access_ref(entry->ref)) {
+			uint64_t ret = atomic64_sub_return(1, &deferred_count);
 			put_free_entry(entry->ref);
-			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
-				 entry->ref, page_to_pfn(entry->page));
+			pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+				 entry->ref, page_to_pfn(entry->page),
+				 (unsigned long long)ret);
 			put_page(entry->page);
+			freed++;
 			kfree(entry);
 			entry = NULL;
 		} else {
@@ -387,14 +396,15 @@
 		spin_lock_irqsave(&gnttab_list_lock, flags);
 		if (entry)
 			list_add_tail(&entry->list, &deferred_list);
-		else if (list_empty(&deferred_list))
-			break;
 	}
-	if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+	if (list_empty(&deferred_list))
+		WARN_ON(atomic64_read(&deferred_count));
+	else if (!timer_pending(&deferred_timer)) {
 		deferred_timer.expires = jiffies + HZ;
 		add_timer(&deferred_timer);
 	}
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+	pr_debug("Freed %zu references", freed);
 }
 
 static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
@@ -402,7 +412,7 @@
 {
 	struct deferred_entry *entry;
 	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
-	const char *what = KERN_WARNING "leaking";
+	uint64_t leaked, deferred;
 
 	entry = kmalloc(sizeof(*entry), gfp);
 	if (!page) {
@@ -426,12 +436,20 @@
 			add_timer(&deferred_timer);
 		}
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
-		what = KERN_DEBUG "deferring";
+		deferred = atomic64_add_return(1, &deferred_count);
+		leaked = atomic64_read(&leaked_count);
+		pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			 ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+	} else {
+		deferred = atomic64_read(&deferred_count);
+		leaked = atomic64_add_return(1, &leaked_count);
+		pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+			ref, page ? page_to_pfn(page) : -1, deferred, leaked);
 	}
-	printk("%s g.e. %#x (pfn %#lx)\n",
-	       what, ref, page ? page_to_pfn(page) : -1);
 }
 
+module_param(free_per_iteration, uint, 0600);
+
 int gnttab_try_end_foreign_access(grant_ref_t ref)
 {
 	int ret = _gnttab_end_foreign_access_ref(ref);
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2023-06-19 13:24 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-10 15:32 [PATCH] xen: speed up grant-table reclaim Demi Marie Obenour
2023-06-12  6:27 ` Juergen Gross
2023-06-12  6:45   ` Juergen Gross
2023-06-12 20:09   ` Demi Marie Obenour
2023-06-13  6:45     ` Juergen Gross
2023-06-13 15:11       ` Demi Marie Obenour
2023-06-19 13:23       ` Marek Marczykowski-Górecki
  -- strict thread matches above, loose matches on Subject: below --
2023-02-07  2:10 Demi Marie Obenour
2023-02-13  9:26 ` Juergen Gross
2023-02-13 21:01   ` Demi Marie Obenour
2023-02-14  7:51     ` Juergen Gross
2023-02-14 15:08       ` Demi Marie Obenour
2023-02-07  1:38 Demi Marie Obenour

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).