linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] Increase page and bit waitqueue hash size
@ 2021-03-17  7:54 Nicholas Piggin
  2021-03-17  8:38 ` Ingo Molnar
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Nicholas Piggin @ 2021-03-17  7:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nicholas Piggin, Andrew Morton, Linus Torvalds, linux-mm,
	Anton Blanchard, Ingo Molnar

The page waitqueue hash is a bit small (256 entries) on very big systems. A
16 socket 1536 thread POWER9 system was found to encounter hash collisions
and excessive time in waitqueue locking at times. This was intermittent and
hard to reproduce easily with the setup we had (very little real IO
capacity). The theory is that sometimes (depending on allocation luck)
important pages would happen to collide a lot in the hash, slowing down page
locking, causing the problem to snowball.

A small test case was made where threads would write and fsync different
pages, generating just a small amount of contention across many pages.

Increasing page waitqueue hash size to 262144 entries increased throughput
by 182% while also reducing standard deviation 3x. perf before the increase:

  36.23%  [k] _raw_spin_lock_irqsave                -      -
              |
              |--34.60%--wake_up_page_bit
              |          0
              |          iomap_write_end.isra.38
              |          iomap_write_actor
              |          iomap_apply
              |          iomap_file_buffered_write
              |          xfs_file_buffered_aio_write
              |          new_sync_write

  17.93%  [k] native_queued_spin_lock_slowpath      -      -
              |
              |--16.74%--_raw_spin_lock_irqsave
              |          |
              |           --16.44%--wake_up_page_bit
              |                     iomap_write_end.isra.38
              |                     iomap_write_actor
              |                     iomap_apply
              |                     iomap_file_buffered_write
              |                     xfs_file_buffered_aio_write

This patch uses alloc_large_system_hash to allocate a bigger system hash
that scales somewhat with memory size. The bit/var wait-queue is also
changed to keep code matching, albeit with a smaller scale factor.

A very small CONFIG_BASE_SMALL option is also added because these are two
of the biggest static objects in the image on very small systems.

This hash could be made per-node, which may help reduce remote accesses
on well localised workloads, but that adds some complexity with indexing
and hotplug, so until we get a less artificial workload to test with,
keep it simple.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 kernel/sched/wait_bit.c | 30 +++++++++++++++++++++++-------
 mm/filemap.c            | 24 +++++++++++++++++++++---
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..dba73dec17c4 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -2,19 +2,24 @@
 /*
  * The implementation of the wait_bit*() and related waiting APIs:
  */
+#include <linux/memblock.h>
 #include "sched.h"
 
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+#define BIT_WAIT_TABLE_SIZE (1 << bit_wait_table_bits)
+#if CONFIG_BASE_SMALL
+static const unsigned int bit_wait_table_bits = 3;
+static wait_queue_head_t bit_wait_table[BIT_WAIT_TABLE_SIZE] __cacheline_aligned;
+#else
+static unsigned int bit_wait_table_bits __ro_after_init;
+static wait_queue_head_t *bit_wait_table __ro_after_init;
+#endif
 
 wait_queue_head_t *bit_waitqueue(void *word, int bit)
 {
 	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
 	unsigned long val = (unsigned long)word << shift | bit;
 
-	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_long(val, bit_wait_table_bits);
 }
 EXPORT_SYMBOL(bit_waitqueue);
 
@@ -152,7 +157,7 @@ EXPORT_SYMBOL(wake_up_bit);
 
 wait_queue_head_t *__var_waitqueue(void *p)
 {
-	return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_ptr(p, bit_wait_table_bits);
 }
 EXPORT_SYMBOL(__var_waitqueue);
 
@@ -246,6 +251,17 @@ void __init wait_bit_init(void)
 {
 	int i;
 
-	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+	if (!CONFIG_BASE_SMALL) {
+		bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
+							sizeof(wait_queue_head_t),
+							0,
+							22,
+							0,
+							&bit_wait_table_bits,
+							NULL,
+							0,
+							0);
+	}
+	for (i = 0; i < BIT_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(bit_wait_table + i);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..dbbb5b9d951d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
 #include <linux/shmem_fs.h>
@@ -990,19 +991,36 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-#define PAGE_WAIT_TABLE_BITS 8
-#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+#define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
+#if CONFIG_BASE_SMALL
+static const unsigned int page_wait_table_bits = 4;
 static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+#else
+static unsigned int page_wait_table_bits __ro_after_init;
+static wait_queue_head_t *page_wait_table __ro_after_init;
+#endif
 
 static wait_queue_head_t *page_waitqueue(struct page *page)
 {
-	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+	return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
 }
 
 void __init pagecache_init(void)
 {
 	int i;
 
+	if (!CONFIG_BASE_SMALL) {
+		page_wait_table = alloc_large_system_hash("page waitqueue hash",
+							sizeof(wait_queue_head_t),
+							0,
+							21,
+							0,
+							&page_wait_table_bits,
+							NULL,
+							0,
+							0);
+	}
+
 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(&page_wait_table[i]);
 
-- 
2.23.0



* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
@ 2021-03-17  8:38 ` Ingo Molnar
  2021-03-17 10:02   ` Nicholas Piggin
  2021-03-17 10:12 ` Rasmus Villemoes
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2021-03-17  8:38 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-kernel, Andrew Morton, Linus Torvalds, linux-mm, Anton Blanchard


* Nicholas Piggin <npiggin@gmail.com> wrote:

> The page waitqueue hash is a bit small (256 entries) on very big systems. A
> 16 socket 1536 thread POWER9 system was found to encounter hash collisions
> and excessive time in waitqueue locking at times. This was intermittent and
> hard to reproduce easily with the setup we had (very little real IO
> capacity). The theory is that sometimes (depending on allocation luck)
> important pages would happen to collide a lot in the hash, slowing down page
> locking, causing the problem to snowball.
> 
> A small test case was made where threads would write and fsync different
> pages, generating just a small amount of contention across many pages.
> 
> Increasing page waitqueue hash size to 262144 entries increased throughput
> by 182% while also reducing standard deviation 3x. perf before the increase:
> 
>   36.23%  [k] _raw_spin_lock_irqsave                -      -
>               |
>               |--34.60%--wake_up_page_bit
>               |          0
>               |          iomap_write_end.isra.38
>               |          iomap_write_actor
>               |          iomap_apply
>               |          iomap_file_buffered_write
>               |          xfs_file_buffered_aio_write
>               |          new_sync_write
> 
>   17.93%  [k] native_queued_spin_lock_slowpath      -      -
>               |
>               |--16.74%--_raw_spin_lock_irqsave
>               |          |
>               |           --16.44%--wake_up_page_bit
>               |                     iomap_write_end.isra.38
>               |                     iomap_write_actor
>               |                     iomap_apply
>               |                     iomap_file_buffered_write
>               |                     xfs_file_buffered_aio_write
> 
> This patch uses alloc_large_system_hash to allocate a bigger system hash
> that scales somewhat with memory size. The bit/var wait-queue is also
> changed to keep code matching, albeit with a smaller scale factor.
> 
> A very small CONFIG_BASE_SMALL option is also added because these are two
> of the biggest static objects in the image on very small systems.
> 
> This hash could be made per-node, which may help reduce remote accesses
> on well localised workloads, but that adds some complexity with indexing
> and hotplug, so until we get a less artificial workload to test with,
> keep it simple.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  kernel/sched/wait_bit.c | 30 +++++++++++++++++++++++-------
>  mm/filemap.c            | 24 +++++++++++++++++++++---
>  2 files changed, 44 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
> index 02ce292b9bc0..dba73dec17c4 100644
> --- a/kernel/sched/wait_bit.c
> +++ b/kernel/sched/wait_bit.c
> @@ -2,19 +2,24 @@
>  /*
>   * The implementation of the wait_bit*() and related waiting APIs:
>   */
> +#include <linux/memblock.h>
>  #include "sched.h"
>  
> -#define WAIT_TABLE_BITS 8
> -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)

Ugh, 256 entries is almost embarrassingly small indeed.

I've put your patch into sched/core, unless Andrew is objecting.

> -	for (i = 0; i < WAIT_TABLE_SIZE; i++)
> +	if (!CONFIG_BASE_SMALL) {
> +		bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
> +							sizeof(wait_queue_head_t),
> +							0,
> +							22,
> +							0,
> +							&bit_wait_table_bits,
> +							NULL,
> +							0,
> +							0);
> +	}
> +	for (i = 0; i < BIT_WAIT_TABLE_SIZE; i++)
>  		init_waitqueue_head(bit_wait_table + i);


Meta suggestion: maybe the CONFIG_BASE_SMALL ugliness could be folded 
into alloc_large_system_hash() itself?

> --- a/mm/filemap.c
> +++ b/mm/filemap.c

>  static wait_queue_head_t *page_waitqueue(struct page *page)
>  {
> -	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
> +	return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
>  }

I'm wondering whether you've tried to make this NUMA aware through 
page->node?

Seems like another useful step when having a global hash ...

Thanks,

	Ingo


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17  8:38 ` Ingo Molnar
@ 2021-03-17 10:02   ` Nicholas Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nicholas Piggin @ 2021-03-17 10:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Anton Blanchard, linux-kernel, linux-mm, Linus Torvalds

Excerpts from Ingo Molnar's message of March 17, 2021 6:38 pm:
> 
> * Nicholas Piggin <npiggin@gmail.com> wrote:
> 
>> The page waitqueue hash is a bit small (256 entries) on very big systems. A
>> 16 socket 1536 thread POWER9 system was found to encounter hash collisions
>> and excessive time in waitqueue locking at times. This was intermittent and
>> hard to reproduce easily with the setup we had (very little real IO
>> capacity). The theory is that sometimes (depending on allocation luck)
>> important pages would happen to collide a lot in the hash, slowing down page
>> locking, causing the problem to snowball.
>> 
>> A small test case was made where threads would write and fsync different
>> pages, generating just a small amount of contention across many pages.
>> 
>> Increasing page waitqueue hash size to 262144 entries increased throughput
>> by 182% while also reducing standard deviation 3x. perf before the increase:
>> 
>>   36.23%  [k] _raw_spin_lock_irqsave                -      -
>>               |
>>               |--34.60%--wake_up_page_bit
>>               |          0
>>               |          iomap_write_end.isra.38
>>               |          iomap_write_actor
>>               |          iomap_apply
>>               |          iomap_file_buffered_write
>>               |          xfs_file_buffered_aio_write
>>               |          new_sync_write
>> 
>>   17.93%  [k] native_queued_spin_lock_slowpath      -      -
>>               |
>>               |--16.74%--_raw_spin_lock_irqsave
>>               |          |
>>               |           --16.44%--wake_up_page_bit
>>               |                     iomap_write_end.isra.38
>>               |                     iomap_write_actor
>>               |                     iomap_apply
>>               |                     iomap_file_buffered_write
>>               |                     xfs_file_buffered_aio_write
>> 
>> This patch uses alloc_large_system_hash to allocate a bigger system hash
>> that scales somewhat with memory size. The bit/var wait-queue is also
>> changed to keep code matching, albeit with a smaller scale factor.
>> 
>> A very small CONFIG_BASE_SMALL option is also added because these are two
>> of the biggest static objects in the image on very small systems.
>> 
>> This hash could be made per-node, which may help reduce remote accesses
>> on well localised workloads, but that adds some complexity with indexing
>> and hotplug, so until we get a less artificial workload to test with,
>> keep it simple.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  kernel/sched/wait_bit.c | 30 +++++++++++++++++++++++-------
>>  mm/filemap.c            | 24 +++++++++++++++++++++---
>>  2 files changed, 44 insertions(+), 10 deletions(-)
>> 
>> diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
>> index 02ce292b9bc0..dba73dec17c4 100644
>> --- a/kernel/sched/wait_bit.c
>> +++ b/kernel/sched/wait_bit.c
>> @@ -2,19 +2,24 @@
>>  /*
>>   * The implementation of the wait_bit*() and related waiting APIs:
>>   */
>> +#include <linux/memblock.h>
>>  #include "sched.h"
>>  
>> -#define WAIT_TABLE_BITS 8
>> -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
> 
> Ugh, 256 entries is almost embarrassingly small indeed.
> 
> I've put your patch into sched/core, unless Andrew is objecting.

Thanks. Andrew and Linus might have some opinions on it, but if it's 
just in a testing branch for now that's okay.


> 
>> -	for (i = 0; i < WAIT_TABLE_SIZE; i++)
>> +	if (!CONFIG_BASE_SMALL) {
>> +		bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
>> +							sizeof(wait_queue_head_t),
>> +							0,
>> +							22,
>> +							0,
>> +							&bit_wait_table_bits,
>> +							NULL,
>> +							0,
>> +							0);
>> +	}
>> +	for (i = 0; i < BIT_WAIT_TABLE_SIZE; i++)
>>  		init_waitqueue_head(bit_wait_table + i);
> 
> 
> Meta suggestion: maybe the CONFIG_BASE_SMALL ugliness could be folded 
> into alloc_large_system_hash() itself?

I don't like the ugliness, and that's a good suggestion in some ways, but 
having the constant size and static table is nice for the small system. I 
don't know, maybe we need to revise the alloc_large_system_hash API slightly.

Perhaps with some kind of DEFINE_LARGE_ARRAY you could have both static
and dynamic? I'll think about it.
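
Something like this is what I have in mind -- only a rough sketch of the
idea, no such macro exists today and the name and arguments are made up:

#if CONFIG_BASE_SMALL
/* Small systems: fixed-size static table, size known at compile time. */
#define DEFINE_LARGE_ARRAY(type, name, small_bits)			\
	static const unsigned int name##_bits = (small_bits);		\
	static type name[1 << (small_bits)] __cacheline_aligned
#else
/* Everyone else: bits and table get filled in at boot, e.g. from
 * alloc_large_system_hash(). */
#define DEFINE_LARGE_ARRAY(type, name, small_bits)			\
	static unsigned int name##_bits __ro_after_init;		\
	static type *name __ro_after_init
#endif

/* usage would be something like:
 * DEFINE_LARGE_ARRAY(wait_queue_head_t, page_wait_table, 4);
 */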

> 
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
> 
>>  static wait_queue_head_t *page_waitqueue(struct page *page)
>>  {
>> -	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
>> +	return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
>>  }
> 
> I'm wondering whether you've tried to make this NUMA aware through 
> page->node?
> 
> Seems like another useful step when having a global hash ...

Yes I have patches for that on the back burner. Just wanted to try one
step at a time, but I think we should be able to justify it (a well 
NUMAified workload will tend to store mostly to the local page waitqueue, 
keeping cacheline contention within the node). We need to get some access 
to a big system again and try to get some more IO on it at some point, so 
stay tuned for that.
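
To give a rough idea, the lookup side of the per-node variant would look
something like this (hand-waving the allocation and hotplug parts, and
node_page_wait_tables is a made-up name, not from the actual patches):

/* One page waitqueue hash per NUMA node, allocated at boot / hotplug. */
static wait_queue_head_t **node_page_wait_tables __ro_after_init;

static wait_queue_head_t *page_waitqueue(struct page *page)
{
	int nid = page_to_nid(page);	/* node backing this page */

	return &node_page_wait_tables[nid][hash_ptr(page, page_wait_table_bits)];
}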

We actually used to have something similar to this, but Linus removed it
with 9dcb8b685fc30.

The difference now is that the page waitqueue has been split out from
the bit waitqueue. Doing the page waitqueue is much easier because we
don't have the vmalloc problem to deal with. But still it's some
complexity.

We also do have the page contention bit that Linus refers to, which takes 
pressure off the waitqueues (which is probably why 256 entries has held 
up surprisingly well), but as we can see we do need a larger table at the
high end.

Thanks,
Nick


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
  2021-03-17  8:38 ` Ingo Molnar
@ 2021-03-17 10:12 ` Rasmus Villemoes
  2021-03-17 10:44   ` Nicholas Piggin
  2021-03-17 11:25 ` kernel test robot
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Rasmus Villemoes @ 2021-03-17 10:12 UTC (permalink / raw)
  To: Nicholas Piggin, linux-kernel
  Cc: Andrew Morton, Linus Torvalds, linux-mm, Anton Blanchard, Ingo Molnar

On 17/03/2021 08.54, Nicholas Piggin wrote:

> +#if CONFIG_BASE_SMALL
> +static const unsigned int page_wait_table_bits = 4;
>  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

>  
> +	if (!CONFIG_BASE_SMALL) {
> +		page_wait_table = alloc_large_system_hash("page waitqueue hash",
> +							sizeof(wait_queue_head_t),
> +							0,

So, how does the compiler not scream at you for assigning to an array,
even if it's inside an if (0)?
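
For reference, a minimal userspace example of why I'd expect it to scream --
nothing kernel-specific about it, the dead branch still has to type-check
and an array is not a modifiable lvalue:

#include <stdlib.h>

static int table[16];

void init(void)
{
	if (0)	/* never taken, but still compiled and type-checked */
		table = calloc(16, sizeof(int)); /* error: assignment to expression with array type */
}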

Rasmus


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17 10:12 ` Rasmus Villemoes
@ 2021-03-17 10:44   ` Nicholas Piggin
  2021-03-17 19:26     ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2021-03-17 10:44 UTC (permalink / raw)
  To: linux-kernel, Rasmus Villemoes
  Cc: Andrew Morton, Anton Blanchard, linux-mm, Ingo Molnar, Linus Torvalds

Excerpts from Rasmus Villemoes's message of March 17, 2021 8:12 pm:
> On 17/03/2021 08.54, Nicholas Piggin wrote:
> 
>> +#if CONFIG_BASE_SMALL
>> +static const unsigned int page_wait_table_bits = 4;
>>  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
> 
>>  
>> +	if (!CONFIG_BASE_SMALL) {
>> +		page_wait_table = alloc_large_system_hash("page waitqueue hash",
>> +							sizeof(wait_queue_head_t),
>> +							0,
> 
> So, how does the compiler not scream at you for assigning to an array,
> even if it's inside an if (0)?
> 

Argh, because I didn't test small. Sorry, I had the BASE_SMALL setting in 
another patch and thought it would be a good idea to mash them together. 
In hindsight, probably not, even if it did build.

Thanks,
Nick

--
[PATCH v3] Increase page and bit waitqueue hash size

The page waitqueue hash is a bit small (256 entries) on very big systems. A
16 socket 1536 thread POWER9 system was found to encounter hash collisions
and excessive time in waitqueue locking at times. This was intermittent and
hard to reproduce easily with the setup we had (very little real IO
capacity). The theory is that sometimes (depending on allocation luck)
important pages would happen to collide a lot in the hash, slowing down page
locking, causing the problem to snowball.

A small test case was made where threads would write and fsync different
pages, generating just a small amount of contention across many pages.

Increasing page waitqueue hash size to 262144 entries increased throughput
by 182% while also reducing standard deviation 3x. perf before the increase:

  36.23%  [k] _raw_spin_lock_irqsave                -      -
              |
              |--34.60%--wake_up_page_bit
              |          0
              |          iomap_write_end.isra.38
              |          iomap_write_actor
              |          iomap_apply
              |          iomap_file_buffered_write
              |          xfs_file_buffered_aio_write
              |          new_sync_write

  17.93%  [k] native_queued_spin_lock_slowpath      -      -
              |
              |--16.74%--_raw_spin_lock_irqsave
              |          |
              |           --16.44%--wake_up_page_bit
              |                     iomap_write_end.isra.38
              |                     iomap_write_actor
              |                     iomap_apply
              |                     iomap_file_buffered_write
              |                     xfs_file_buffered_aio_write

This patch uses alloc_large_system_hash to allocate a bigger system hash
that scales somewhat with memory size. The bit/var wait-queue is also
changed to keep code matching, albeit with a smaller scale factor.

This hash could be made per-node, which may help reduce remote accesses
on well localised workloads, but that adds some complexity with indexing
and hotplug, so until we get a less artificial workload to test with,
keep it simple.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 kernel/sched/wait_bit.c | 25 ++++++++++++++++++-------
 mm/filemap.c            | 16 ++++++++++++++--
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..3cc5fa552516 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -2,19 +2,20 @@
 /*
  * The implementation of the wait_bit*() and related waiting APIs:
  */
+#include <linux/memblock.h>
 #include "sched.h"
 
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+#define BIT_WAIT_TABLE_SIZE (1 << BIT_WAIT_TABLE_BITS)
+#define BIT_WAIT_TABLE_BITS bit_wait_table_bits
+static unsigned int bit_wait_table_bits __ro_after_init;
+static wait_queue_head_t *bit_wait_table __ro_after_init;
 
 wait_queue_head_t *bit_waitqueue(void *word, int bit)
 {
 	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
 	unsigned long val = (unsigned long)word << shift | bit;
 
-	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_long(val, BIT_WAIT_TABLE_BITS);
 }
 EXPORT_SYMBOL(bit_waitqueue);
 
@@ -152,7 +153,7 @@ EXPORT_SYMBOL(wake_up_bit);
 
 wait_queue_head_t *__var_waitqueue(void *p)
 {
-	return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_ptr(p, BIT_WAIT_TABLE_BITS);
 }
 EXPORT_SYMBOL(__var_waitqueue);
 
@@ -246,6 +247,16 @@ void __init wait_bit_init(void)
 {
 	int i;
 
-	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+	bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
+						sizeof(wait_queue_head_t),
+						0,
+						22,
+						0,
+						&bit_wait_table_bits,
+						NULL,
+						0,
+						0);
+
+	for (i = 0; i < BIT_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(bit_wait_table + i);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..df5a3ef4031b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
 #include <linux/shmem_fs.h>
@@ -990,9 +991,10 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-#define PAGE_WAIT_TABLE_BITS 8
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+#define PAGE_WAIT_TABLE_BITS page_wait_table_bits
+static unsigned int page_wait_table_bits __ro_after_init;
+static wait_queue_head_t *page_wait_table __ro_after_init;
 
 static wait_queue_head_t *page_waitqueue(struct page *page)
 {
@@ -1003,6 +1005,16 @@ void __init pagecache_init(void)
 {
 	int i;
 
+	page_wait_table = alloc_large_system_hash("page waitqueue hash",
+						sizeof(wait_queue_head_t),
+						0,
+						21,
+						0,
+						&page_wait_table_bits,
+						NULL,
+						0,
+						0);
+
 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(&page_wait_table[i]);
 
-- 
2.23.0

> Rasmus
> 


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
  2021-03-17  8:38 ` Ingo Molnar
  2021-03-17 10:12 ` Rasmus Villemoes
@ 2021-03-17 11:25 ` kernel test robot
  2021-03-17 11:30 ` kernel test robot
  2021-03-17 12:38 ` [tip: sched/core] sched/wait_bit, mm/filemap: " tip-bot2 for Nicholas Piggin
  4 siblings, 0 replies; 13+ messages in thread
From: kernel test robot @ 2021-03-17 11:25 UTC (permalink / raw)
  To: Nicholas Piggin, linux-kernel
  Cc: kbuild-all, Nicholas Piggin, Andrew Morton,
	Linux Memory Management List, Anton Blanchard, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 7285 bytes --]

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on tip/sched/core linus/master v5.12-rc3 next-20210317]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/Increase-page-and-bit-waitqueue-hash-size/20210317-155619
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git a74e6a014c9d4d4161061f770c9b4f98372ac778
config: nds32-randconfig-r016-20210317 (attached as .config)
compiler: nds32le-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/d741b0903849631440ae34fb070178e9743c6928
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/Increase-page-and-bit-waitqueue-hash-size/20210317-155619
        git checkout d741b0903849631440ae34fb070178e9743c6928
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=nds32 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

>> mm/filemap.c:997:26: error: variably modified 'page_wait_table' at file scope
     997 | static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
         |                          ^~~~~~~~~~~~~~~
   mm/filemap.c: In function 'pagecache_init':
>> mm/filemap.c:1018:8: warning: passing argument 6 of 'alloc_large_system_hash' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
    1018 |        &page_wait_table_bits,
         |        ^~~~~~~~~~~~~~~~~~~~~
   In file included from mm/filemap.c:37:
   include/linux/memblock.h:574:14: note: expected 'unsigned int *' but argument is of type 'const unsigned int *'
     574 | extern void *alloc_large_system_hash(const char *tablename,
         |              ^~~~~~~~~~~~~~~~~~~~~~~
>> mm/filemap.c:1013:19: error: assignment to expression with array type
    1013 |   page_wait_table = alloc_large_system_hash("page waitqueue hash",
         |                   ^
--
>> kernel/sched/wait_bit.c:11:26: error: variably modified 'bit_wait_table' at file scope
      11 | static wait_queue_head_t bit_wait_table[BIT_WAIT_TABLE_SIZE] __cacheline_aligned;
         |                          ^~~~~~~~~~~~~~
   kernel/sched/wait_bit.c: In function 'wait_bit_init':
>> kernel/sched/wait_bit.c:260:8: warning: passing argument 6 of 'alloc_large_system_hash' discards 'const' qualifier from pointer target type [-Wdiscarded-qualifiers]
     260 |        &bit_wait_table_bits,
         |        ^~~~~~~~~~~~~~~~~~~~
   In file included from kernel/sched/wait_bit.c:5:
   include/linux/memblock.h:574:14: note: expected 'unsigned int *' but argument is of type 'const unsigned int *'
     574 | extern void *alloc_large_system_hash(const char *tablename,
         |              ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/sched/wait_bit.c:255:18: error: assignment to expression with array type
     255 |   bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
         |                  ^


vim +/page_wait_table +997 mm/filemap.c

44110fe385af23 Paul Jackson    2006-03-24   983  
^1da177e4c3f41 Linus Torvalds  2005-04-16   984  /*
^1da177e4c3f41 Linus Torvalds  2005-04-16   985   * In order to wait for pages to become available there must be
^1da177e4c3f41 Linus Torvalds  2005-04-16   986   * waitqueues associated with pages. By using a hash table of
^1da177e4c3f41 Linus Torvalds  2005-04-16   987   * waitqueues where the bucket discipline is to maintain all
^1da177e4c3f41 Linus Torvalds  2005-04-16   988   * waiters on the same queue and wake all when any of the pages
^1da177e4c3f41 Linus Torvalds  2005-04-16   989   * become available, and for the woken contexts to check to be
^1da177e4c3f41 Linus Torvalds  2005-04-16   990   * sure the appropriate page became available, this saves space
^1da177e4c3f41 Linus Torvalds  2005-04-16   991   * at a cost of "thundering herd" phenomena during rare hash
^1da177e4c3f41 Linus Torvalds  2005-04-16   992   * collisions.
^1da177e4c3f41 Linus Torvalds  2005-04-16   993   */
d741b090384963 Nicholas Piggin 2021-03-17   994  #define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
d741b090384963 Nicholas Piggin 2021-03-17   995  #if CONFIG_BASE_SMALL
d741b090384963 Nicholas Piggin 2021-03-17   996  static const unsigned int page_wait_table_bits = 4;
62906027091f1d Nicholas Piggin 2016-12-25  @997  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
d741b090384963 Nicholas Piggin 2021-03-17   998  #else
d741b090384963 Nicholas Piggin 2021-03-17   999  static unsigned int page_wait_table_bits __ro_after_init;
d741b090384963 Nicholas Piggin 2021-03-17  1000  static wait_queue_head_t *page_wait_table __ro_after_init;
d741b090384963 Nicholas Piggin 2021-03-17  1001  #endif
62906027091f1d Nicholas Piggin 2016-12-25  1002  
62906027091f1d Nicholas Piggin 2016-12-25  1003  static wait_queue_head_t *page_waitqueue(struct page *page)
^1da177e4c3f41 Linus Torvalds  2005-04-16  1004  {
d741b090384963 Nicholas Piggin 2021-03-17  1005  	return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
^1da177e4c3f41 Linus Torvalds  2005-04-16  1006  }
^1da177e4c3f41 Linus Torvalds  2005-04-16  1007  
62906027091f1d Nicholas Piggin 2016-12-25  1008  void __init pagecache_init(void)
^1da177e4c3f41 Linus Torvalds  2005-04-16  1009  {
62906027091f1d Nicholas Piggin 2016-12-25  1010  	int i;
62906027091f1d Nicholas Piggin 2016-12-25  1011  
d741b090384963 Nicholas Piggin 2021-03-17  1012  	if (!CONFIG_BASE_SMALL) {
d741b090384963 Nicholas Piggin 2021-03-17 @1013  		page_wait_table = alloc_large_system_hash("page waitqueue hash",
d741b090384963 Nicholas Piggin 2021-03-17  1014  							sizeof(wait_queue_head_t),
d741b090384963 Nicholas Piggin 2021-03-17  1015  							0,
d741b090384963 Nicholas Piggin 2021-03-17  1016  							21,
d741b090384963 Nicholas Piggin 2021-03-17  1017  							0,
d741b090384963 Nicholas Piggin 2021-03-17 @1018  							&page_wait_table_bits,
d741b090384963 Nicholas Piggin 2021-03-17  1019  							NULL,
d741b090384963 Nicholas Piggin 2021-03-17  1020  							0,
d741b090384963 Nicholas Piggin 2021-03-17  1021  							0);
d741b090384963 Nicholas Piggin 2021-03-17  1022  	}
d741b090384963 Nicholas Piggin 2021-03-17  1023  
62906027091f1d Nicholas Piggin 2016-12-25  1024  	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
62906027091f1d Nicholas Piggin 2016-12-25  1025  		init_waitqueue_head(&page_wait_table[i]);
^1da177e4c3f41 Linus Torvalds  2005-04-16  1026  
62906027091f1d Nicholas Piggin 2016-12-25  1027  	page_writeback_init();
^1da177e4c3f41 Linus Torvalds  2005-04-16  1028  }
^1da177e4c3f41 Linus Torvalds  2005-04-16  1029  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 25520 bytes --]


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
                   ` (2 preceding siblings ...)
  2021-03-17 11:25 ` kernel test robot
@ 2021-03-17 11:30 ` kernel test robot
  2021-03-17 12:38 ` [tip: sched/core] sched/wait_bit, mm/filemap: " tip-bot2 for Nicholas Piggin
  4 siblings, 0 replies; 13+ messages in thread
From: kernel test robot @ 2021-03-17 11:30 UTC (permalink / raw)
  To: Nicholas Piggin, linux-kernel
  Cc: kbuild-all, clang-built-linux, Nicholas Piggin, Andrew Morton,
	Linux Memory Management List, Anton Blanchard, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 6165 bytes --]

Hi Nicholas,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master hnaz-linux-mm/master v5.12-rc3 next-20210317]
[cannot apply to tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/Increase-page-and-bit-waitqueue-hash-size/20210317-155619
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git a74e6a014c9d4d4161061f770c9b4f98372ac778
config: x86_64-randconfig-a015-20210317 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 8ef111222a3dd12a9175f69c3bff598c46e8bdf7)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/d741b0903849631440ae34fb070178e9743c6928
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/Increase-page-and-bit-waitqueue-hash-size/20210317-155619
        git checkout d741b0903849631440ae34fb070178e9743c6928
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> mm/filemap.c:997:42: warning: variable length arrays are a C99 feature [-Wvla-extension]
   static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
                                            ^~~~~~~~~~~~~~~~~~~~
   mm/filemap.c:994:30: note: expanded from macro 'PAGE_WAIT_TABLE_SIZE'
   #define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   mm/filemap.c:1018:8: error: passing 'const unsigned int *' to parameter of type 'unsigned int *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers]
                                                           &page_wait_table_bits,
                                                           ^~~~~~~~~~~~~~~~~~~~~
   include/linux/memblock.h:579:24: note: passing argument to parameter '_hash_shift' here
                                        unsigned int *_hash_shift,
                                                      ^
   mm/filemap.c:1013:19: error: array type 'wait_queue_head_t [16]' is not assignable
                   page_wait_table = alloc_large_system_hash("page waitqueue hash",
                   ~~~~~~~~~~~~~~~ ^
   1 warning and 2 errors generated.
--
>> kernel/sched/wait_bit.c:11:41: warning: variable length arrays are a C99 feature [-Wvla-extension]
   static wait_queue_head_t bit_wait_table[BIT_WAIT_TABLE_SIZE] __cacheline_aligned;
                                           ^~~~~~~~~~~~~~~~~~~
   kernel/sched/wait_bit.c:8:29: note: expanded from macro 'BIT_WAIT_TABLE_SIZE'
   #define BIT_WAIT_TABLE_SIZE (1 << bit_wait_table_bits)
                               ^~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/sched/wait_bit.c:260:8: error: passing 'const unsigned int *' to parameter of type 'unsigned int *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers]
                                                           &bit_wait_table_bits,
                                                           ^~~~~~~~~~~~~~~~~~~~
   include/linux/memblock.h:579:24: note: passing argument to parameter '_hash_shift' here
                                        unsigned int *_hash_shift,
                                                      ^
   kernel/sched/wait_bit.c:255:18: error: array type 'wait_queue_head_t [8]' is not assignable
                   bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
                   ~~~~~~~~~~~~~~ ^
   1 warning and 2 errors generated.


vim +997 mm/filemap.c

44110fe385af23 Paul Jackson    2006-03-24   983  
^1da177e4c3f41 Linus Torvalds  2005-04-16   984  /*
^1da177e4c3f41 Linus Torvalds  2005-04-16   985   * In order to wait for pages to become available there must be
^1da177e4c3f41 Linus Torvalds  2005-04-16   986   * waitqueues associated with pages. By using a hash table of
^1da177e4c3f41 Linus Torvalds  2005-04-16   987   * waitqueues where the bucket discipline is to maintain all
^1da177e4c3f41 Linus Torvalds  2005-04-16   988   * waiters on the same queue and wake all when any of the pages
^1da177e4c3f41 Linus Torvalds  2005-04-16   989   * become available, and for the woken contexts to check to be
^1da177e4c3f41 Linus Torvalds  2005-04-16   990   * sure the appropriate page became available, this saves space
^1da177e4c3f41 Linus Torvalds  2005-04-16   991   * at a cost of "thundering herd" phenomena during rare hash
^1da177e4c3f41 Linus Torvalds  2005-04-16   992   * collisions.
^1da177e4c3f41 Linus Torvalds  2005-04-16   993   */
d741b090384963 Nicholas Piggin 2021-03-17   994  #define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
d741b090384963 Nicholas Piggin 2021-03-17   995  #if CONFIG_BASE_SMALL
d741b090384963 Nicholas Piggin 2021-03-17   996  static const unsigned int page_wait_table_bits = 4;
62906027091f1d Nicholas Piggin 2016-12-25  @997  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
d741b090384963 Nicholas Piggin 2021-03-17   998  #else
d741b090384963 Nicholas Piggin 2021-03-17   999  static unsigned int page_wait_table_bits __ro_after_init;
d741b090384963 Nicholas Piggin 2021-03-17  1000  static wait_queue_head_t *page_wait_table __ro_after_init;
d741b090384963 Nicholas Piggin 2021-03-17  1001  #endif
62906027091f1d Nicholas Piggin 2016-12-25  1002  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 38529 bytes --]


* [tip: sched/core] sched/wait_bit, mm/filemap: Increase page and bit waitqueue hash size
  2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
                   ` (3 preceding siblings ...)
  2021-03-17 11:30 ` kernel test robot
@ 2021-03-17 12:38 ` tip-bot2 for Nicholas Piggin
  2021-03-17 15:16   ` Thomas Gleixner
  4 siblings, 1 reply; 13+ messages in thread
From: tip-bot2 for Nicholas Piggin @ 2021-03-17 12:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Nicholas Piggin, Ingo Molnar, Peter Zijlstra, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     873d7c4c6a920d43ff82e44121e54053d4edba93
Gitweb:        https://git.kernel.org/tip/873d7c4c6a920d43ff82e44121e54053d4edba93
Author:        Nicholas Piggin <npiggin@gmail.com>
AuthorDate:    Wed, 17 Mar 2021 17:54:27 +10:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 17 Mar 2021 09:32:30 +01:00

sched/wait_bit, mm/filemap: Increase page and bit waitqueue hash size

The page waitqueue hash is a bit small (256 entries) on very big systems. A
16 socket 1536 thread POWER9 system was found to encounter hash collisions
and excessive time in waitqueue locking at times. This was intermittent and
hard to reproduce easily with the setup we had (very little real IO
capacity). The theory is that sometimes (depending on allocation luck)
important pages would happen to collide a lot in the hash, slowing down page
locking, causing the problem to snowball.

A small test case was made where threads would write and fsync different
pages, generating just a small amount of contention across many pages.

Increasing page waitqueue hash size to 262144 entries increased throughput
by 182% while also reducing standard deviation 3x. perf before the increase:

  36.23%  [k] _raw_spin_lock_irqsave                -      -
              |
              |--34.60%--wake_up_page_bit
              |          0
              |          iomap_write_end.isra.38
              |          iomap_write_actor
              |          iomap_apply
              |          iomap_file_buffered_write
              |          xfs_file_buffered_aio_write
              |          new_sync_write

  17.93%  [k] native_queued_spin_lock_slowpath      -      -
              |
              |--16.74%--_raw_spin_lock_irqsave
              |          |
              |           --16.44%--wake_up_page_bit
              |                     iomap_write_end.isra.38
              |                     iomap_write_actor
              |                     iomap_apply
              |                     iomap_file_buffered_write
              |                     xfs_file_buffered_aio_write

This patch uses alloc_large_system_hash to allocate a bigger system hash
that scales somewhat with memory size. The bit/var wait-queue is also
changed to keep code matching, albeit with a smaller scale factor.

A very small CONFIG_BASE_SMALL option is also added because these are two
of the biggest static objects in the image on very small systems.

This hash could be made per-node, which may help reduce remote accesses
on well localised workloads, but that adds some complexity with indexing
and hotplug, so until we get a less artificial workload to test with,
keep it simple.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: https://lore.kernel.org/r/20210317075427.587806-1-npiggin@gmail.com
---
 kernel/sched/wait_bit.c | 30 +++++++++++++++++++++++-------
 mm/filemap.c            | 24 +++++++++++++++++++++---
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292..dba73de 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -2,19 +2,24 @@
 /*
  * The implementation of the wait_bit*() and related waiting APIs:
  */
+#include <linux/memblock.h>
 #include "sched.h"
 
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+#define BIT_WAIT_TABLE_SIZE (1 << bit_wait_table_bits)
+#if CONFIG_BASE_SMALL
+static const unsigned int bit_wait_table_bits = 3;
+static wait_queue_head_t bit_wait_table[BIT_WAIT_TABLE_SIZE] __cacheline_aligned;
+#else
+static unsigned int bit_wait_table_bits __ro_after_init;
+static wait_queue_head_t *bit_wait_table __ro_after_init;
+#endif
 
 wait_queue_head_t *bit_waitqueue(void *word, int bit)
 {
 	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
 	unsigned long val = (unsigned long)word << shift | bit;
 
-	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_long(val, bit_wait_table_bits);
 }
 EXPORT_SYMBOL(bit_waitqueue);
 
@@ -152,7 +157,7 @@ EXPORT_SYMBOL(wake_up_bit);
 
 wait_queue_head_t *__var_waitqueue(void *p)
 {
-	return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
+	return bit_wait_table + hash_ptr(p, bit_wait_table_bits);
 }
 EXPORT_SYMBOL(__var_waitqueue);
 
@@ -246,6 +251,17 @@ void __init wait_bit_init(void)
 {
 	int i;
 
-	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+	if (!CONFIG_BASE_SMALL) {
+		bit_wait_table = alloc_large_system_hash("bit waitqueue hash",
+							sizeof(wait_queue_head_t),
+							0,
+							22,
+							0,
+							&bit_wait_table_bits,
+							NULL,
+							0,
+							0);
+	}
+	for (i = 0; i < BIT_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(bit_wait_table + i);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 4370048..dbbb5b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
+#include <linux/memblock.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
 #include <linux/shmem_fs.h>
@@ -990,19 +991,36 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-#define PAGE_WAIT_TABLE_BITS 8
-#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+#define PAGE_WAIT_TABLE_SIZE (1 << page_wait_table_bits)
+#if CONFIG_BASE_SMALL
+static const unsigned int page_wait_table_bits = 4;
 static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+#else
+static unsigned int page_wait_table_bits __ro_after_init;
+static wait_queue_head_t *page_wait_table __ro_after_init;
+#endif
 
 static wait_queue_head_t *page_waitqueue(struct page *page)
 {
-	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+	return &page_wait_table[hash_ptr(page, page_wait_table_bits)];
 }
 
 void __init pagecache_init(void)
 {
 	int i;
 
+	if (!CONFIG_BASE_SMALL) {
+		page_wait_table = alloc_large_system_hash("page waitqueue hash",
+							sizeof(wait_queue_head_t),
+							0,
+							21,
+							0,
+							&page_wait_table_bits,
+							NULL,
+							0,
+							0);
+	}
+
 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
 		init_waitqueue_head(&page_wait_table[i]);
 


* Re: [tip: sched/core] sched/wait_bit, mm/filemap: Increase page and bit waitqueue hash size
  2021-03-17 12:38 ` [tip: sched/core] sched/wait_bit, mm/filemap: " tip-bot2 for Nicholas Piggin
@ 2021-03-17 15:16   ` Thomas Gleixner
  2021-03-17 19:54     ` Ingo Molnar
  0 siblings, 1 reply; 13+ messages in thread
From: Thomas Gleixner @ 2021-03-17 15:16 UTC (permalink / raw)
  To: tip-bot2 for Nicholas Piggin, linux-tip-commits
  Cc: Nicholas Piggin, Ingo Molnar, Peter Zijlstra, x86, linux-kernel

On Wed, Mar 17 2021 at 12:38, tip-bot wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID:     873d7c4c6a920d43ff82e44121e54053d4edba93
> Gitweb:        https://git.kernel.org/tip/873d7c4c6a920d43ff82e44121e54053d4edba93
> Author:        Nicholas Piggin <npiggin@gmail.com>
> AuthorDate:    Wed, 17 Mar 2021 17:54:27 +10:00
> Committer:     Ingo Molnar <mingo@kernel.org>
> CommitterDate: Wed, 17 Mar 2021 09:32:30 +01:00

Groan. This does not even compile and Nicholas already sent a V3 in the
very same thread. Zapped ...


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17 10:44   ` Nicholas Piggin
@ 2021-03-17 19:26     ` Linus Torvalds
  2021-03-17 22:22       ` Nicholas Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2021-03-17 19:26 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Linux Kernel Mailing List, Rasmus Villemoes, Andrew Morton,
	Anton Blanchard, Linux-MM, Ingo Molnar

On Wed, Mar 17, 2021 at 3:44 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Argh, because I didn't test small. Sorry I had the BASE_SMALL setting in
> another patch and thought it would be a good idea to mash them together.
> In hindsight probably not even if it did build.

I was going to complain about that code in general.

First complaining about the hash being small, and then adding a config
option to make it ridiculously much *smaller* seemed wrong to begin
with, and didn't make any sense.

So no, please don't smash together.

In fact, I'd like to see this split up, and with more numbers:

 - separate out the bit_waitqueue thing that is almost certainly not
remotely as critical (and maybe not needed at all)

 - show the profile number _after_ the patch(es)

 - explain why you picked the random scaling numbers (21 and 22 for
the two different cases)?

 - give an estimate of how big the array now ends up being for
different configurations.

I think it ends up using that "scale" factor of 21, and basically
being "memory size >> 21" and then rounding up to a power of two.

And honestly, I'm not sure that makes much sense. So for a 1GB machine
we get the same as we used to for the bit waitqueue (twice as many for
the page waitqueue) , but if you run on some smaller setup, you
apparently can end up with just a couple of buckets.

So I'd feel a lot better about this if I saw the numbers, and got the
feeling that the patch actually tries to take legacy machines into
account.

And even on a big machine, what's the advantage of scaling perfectly
with memory. If you have a terabyte of RAM, why would you need half a
million hash entries (if I did the math right), and use 4GB of memory
on it? The contention doesn't go up by amount of memory, it goes up
roughly by number of threads, and the two are very seldom really all
that linearly connected.
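
Checking my own arithmetic there, and assuming 4kB pages and that
alloc_large_system_hash() basically ends up shifting total memory right by
the scale factor and rounding to a power of two (my reading of it, not
something I've verified carefully):

  1GB:  2^30 >> 21 =    512 page waitqueue buckets (2^30 >> 22 = 256 for the bit waitqueue)
  1TB:  2^40 >> 21 = 524288 page waitqueue buckets, i.e. that half a million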

So honestly, I'd like to see more reasonable numbers. I'd like to see
what the impact of just raising the hash bit size from 8 to 16 is on
that big machine. Maybe still using alloc_large_system_hash(), but
using a low-limit of 8 (our traditional very old number that hasn't
been a problem even on small machines), and a high-limit of 16 or
something.

And if you want even more, I really really want that justified by the
performance / profile numbers.

And does that "bit_waitqueue" really merit updating AT ALL? It's
almost entirely unused these days. I think maybe the page lock code
used to use that, but then realized it had more specialized needs, so
now it's separate.

So can we split that bit-waitqueue thing up from the page waitqueue
changes? They have basically nothing in common except for a history,
and I think they should be treated separately (including the
explanation for what actually hits the bottleneck).

          Linus


* Re: [tip: sched/core] sched/wait_bit, mm/filemap: Increase page and bit waitqueue hash size
  2021-03-17 15:16   ` Thomas Gleixner
@ 2021-03-17 19:54     ` Ingo Molnar
  0 siblings, 0 replies; 13+ messages in thread
From: Ingo Molnar @ 2021-03-17 19:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: tip-bot2 for Nicholas Piggin, linux-tip-commits, Nicholas Piggin,
	Peter Zijlstra, x86, linux-kernel


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Wed, Mar 17 2021 at 12:38, tip-bot wrote:
> > The following commit has been merged into the sched/core branch of tip:
> >
> > Commit-ID:     873d7c4c6a920d43ff82e44121e54053d4edba93
> > Gitweb:        https://git.kernel.org/tip/873d7c4c6a920d43ff82e44121e54053d4edba93
> > Author:        Nicholas Piggin <npiggin@gmail.com>
> > AuthorDate:    Wed, 17 Mar 2021 17:54:27 +10:00
> > Committer:     Ingo Molnar <mingo@kernel.org>
> > CommitterDate: Wed, 17 Mar 2021 09:32:30 +01:00
> 
> Groan. This does not even compile and Nicholas already sent a V3 in the
> very same thread. Zapped ...

Yeah, thanks - got that too late and got distracted, groan #2.

Thanks!

	Ingo


* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17 19:26     ` Linus Torvalds
@ 2021-03-17 22:22       ` Nicholas Piggin
  2021-03-17 23:13         ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Nicholas Piggin @ 2021-03-17 22:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Anton Blanchard, Linux Kernel Mailing List,
	Linux-MM, Rasmus Villemoes, Ingo Molnar

Excerpts from Linus Torvalds's message of March 18, 2021 5:26 am:
> On Wed, Mar 17, 2021 at 3:44 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> Argh, because I didn't test small. Sorry I had the BASE_SMALL setting in
>> another patch and thought it would be a good idea to mash them together.
>> In hindsight probably not even if it did build.
> 
> I was going to complain about that code in general.
> 
> First complaining about the hash being small, and then adding a config
> option to make it ridiculously much *smaller* seemed wrong to begin
> with, and didn't make any sense.
> 
> So no, please don't smash together.

Fair point, fixed.

> 
> In fact, I'd like to see this split up, and with more numbers:
> 
>  - separate out the bit_waitqueue thing that is almost certainly not
> remotely as critical (and maybe not needed at all)
> 
>  - show the profile number _after_ the patch(es)

Might take some time to get a system and run tests. We actually had 
difficulty recreating it before this patch too, so it's kind of
hard to say _that_ was the exact case that previously ran badly and
is now fixed. We thought just the statistical nature of collisions
and page / lock contention made things occasionally line up and
tank.

>  - explain why you picked the random scaling numbers (21 and 22 for
> the two different cases)?
> 
>  - give an estimate of how big the array now ends up being for
> different configurations.
> 
> I think it ends up using that "scale" factor of 21, and basically
> being "memory size >> 21" and then rounding up to a power of two.
> 
> And honestly, I'm not sure that makes much sense. So for a 1GB machine
> we get the same as we used to for the bit waitqueue (twice as many for
> the page waitqueue) , but if you run on some smaller setup, you
> apparently can end up with just a couple of buckets.
> 
> So I'd feel a lot better about this if I saw the numbers, and got the
> feeling that the patch actually tries to take legacy machines into
> account.
>
> And even on a big machine, what's the advantage of scaling perfectly
> with memory. If you have a terabyte of RAM, why would you need half a
> million hash entries (if I did the math right), and use 4GB of memory
> on it? The contention doesn't go up by amount of memory, it goes up
> roughly by number of threads, and the two are very seldom really all
> that linearly connected.
> 
> So honestly, I'd like to see more reasonable numbers. I'd like to see
> what the impact of just raising the hash bit size from 8 to 16 is on
> that big machine. Maybe still using alloc_large_system_hash(), but
> using a low-limit of 8 (our traditional very old number that hasn't
> been a problem even on small machines), and a high-limit of 16 or
> something.
> 
> And if you want even more, I really really want that justified by the
> performance / profile numbers.

Yes, all good points, I'll add those numbers. It may need a floor and
ceiling or something like that. We may not need quite so many entries.
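
A minimal sketch of what I mean, assuming we just use the existing
low_limit/high_limit arguments of alloc_large_system_hash() for the
clamping (i.e. the last two arguments currently passed as 0; the exact
numbers here are placeholders, nothing measured):

	page_wait_table = alloc_large_system_hash("page waitqueue hash",
						sizeof(wait_queue_head_t),
						0,
						21,		/* scale */
						0,
						&page_wait_table_bits,
						NULL,
						1UL << 8,	/* floor: the old 256 entries */
						1UL << 16);	/* ceiling: 64k entries */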

> 
> And does that "bit_waitqueue" really merit updating AT ALL? It's
> almost entirely unused these days.

I updated it mainly because keeping the code more similar ends up being 
easier than unnecessarily diverging. The memory cost is no big deal (once 
limits are fixed) so I prefer not to encounter some case where it falls 
over.

> I think maybe the page lock code
> used to use that, but then realized it had more specialized needs, so
> now it's separate.
> 
> So can we split that bit-waitqueue thing up from the page waitqueue
> changes? They have basically nothing in common except for a history,
> and I think they should be treated separately (including the
> explanation for what actually hits the bottleneck).

It's still used. Buffer heads are an obvious and widely used one that
follows a similar usage pattern to page lock / writeback in some cases.
Several other filesystems seem to use it for similar block / IO
tracking structures by the looks (md, btrfs, nfs).
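
Something like this, simplified along the lines of lock_buffer() /
unlock_buffer() (not the actual buffer_head code; io_thing is a made-up
name for illustration):

#include <linux/wait_bit.h>
#include <linux/bitops.h>
#include <linux/sched.h>

struct io_thing {
        unsigned long state;    /* bit 0 acts as a busy/lock flag */
};

static void io_thing_lock(struct io_thing *t)
{
        /* sleeps on the shared bit waitqueue hash, keyed by (&t->state, 0) */
        wait_on_bit_lock(&t->state, 0, TASK_UNINTERRUPTIBLE);
}

static void io_thing_unlock(struct io_thing *t)
{
        clear_bit_unlock(0, &t->state);
        smp_mb__after_atomic();
        wake_up_bit(&t->state, 0);
}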

Thanks,
Nick

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] Increase page and bit waitqueue hash size
  2021-03-17 22:22       ` Nicholas Piggin
@ 2021-03-17 23:13         ` Linus Torvalds
  0 siblings, 0 replies; 13+ messages in thread
From: Linus Torvalds @ 2021-03-17 23:13 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andrew Morton, Anton Blanchard, Linux Kernel Mailing List,
	Linux-MM, Rasmus Villemoes, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 3117 bytes --]

On Wed, Mar 17, 2021 at 3:23 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Might take some time to get a system and run tests. We actually had
> difficulty recreating it before this patch too, so it's kind of
> hard to say _that_ was the exact case that previously ran badly and
> is now fixed. We thought just the statistical nature of collisions
> and page / lock contention made things occasionally line up and
> tank.

Yeah, it probably depends a lot on the exact page allocation patterns
and just bad luck.  Plus the actual spinlocks will end up with false
sharing anyway, so you might need to have both contention on multiple
pages on the same hash queue _and_ perhaps some action on other pages
that just hash close-by so that they make the cache coherency protocol
have to work harder..

And then 99% of the time it all just works out fine, because we only
need to look at the hashed spinlocks when we have any contention on
the page lock in the first place, and that generally should be
something we should avoid for other reasons.
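
For context, the bucket lookup on the page side is roughly just a
pointer hash into the shared table:

static wait_queue_head_t *page_waitqueue(struct page *page)
{
        return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}

so two hot pages only collide when hash_ptr() happens to send them to
the same one of the 256 buckets, and the waker then serialises on that
bucket's spinlock.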

But it is perhaps also a sign that while 256 entries is "ridiculously
small", it might be the case that it's not necessarily much of a real
problem in the end, and I get the feeling that growing it by several
orders of magnitude is overkill.

In fact, the problems we have had with the page wait queue have often
been because we did too much page locking in the first place.

I actually have a couple of patches in my "still thinking about it
tree" (that have been there for six months by now, so maybe not
thinking too _actively_ about it) which remove some overly stupid page
locking.

For example, look at "do_shared_fault()". Currently __do_fault()
always locks the result page. Then if you have a page_mkwrite()
function, we immediately unlock it again. And then we lock it again in
do_page_mkwrite().

Does that particular case matter? I really have no idea. But we
basically lock it twice for what looks like absolutely zero reason
other than calling conventions.
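
In outline (from memory, simplified, with error paths elided):

/*
 *   do_shared_fault()
 *     __do_fault()                ->fault() returns the page, and we
 *                                 lock_page() it unless the handler
 *                                 already returned VM_FAULT_LOCKED
 *     if ->page_mkwrite exists:
 *       unlock_page(page)         immediately dropped again...
 *       do_page_mkwrite(vmf)      ...and the page comes back locked,
 *                                 either by ->page_mkwrite() or by
 *                                 do_page_mkwrite() itself
 *     finish_fault(vmf)
 *     ...                         page eventually unlocked further down
 */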

Bah. I'll just attach my three "not very actively thinking about it"
patches here in case anybody else wants to not actively think about
them.

I've actually been running these on my own machine since October,
since the kernel I actually boot on my main desktop always includes my
"thinking about it" patches.

The two first patches should fix the double lock thing I mentioned.

The third patch should be a no-op right now, but the thinking is
outlined in the commit message: why do we even lock pages in
filemap_fault? I'm not actually convinced we need to. I think we could
return an un-uptodate page unlocked, and instead do the checking at pte
install time (I had some discussions with Kirill about instead doing
it at pte install time, and depending entirely on memory ordering
constraints wrt page truncation that *does* take the page lock, and
then does various other checks - including the page mapcount and
taking the ptl lock - under that lock).

Anyway, I don't mind the "make the hash table larger" regardless of
this, but I do want it to be documented a bit more.

               Linus

[-- Attachment #2: 0001-mm-move-final-page-locking-out-of-__do_fault-helper-.patch --]
[-- Type: text/x-patch, Size: 2744 bytes --]

From efdc3feadb493ae7f24c573c8b863d7a51117391 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 13 Oct 2020 10:22:00 -0700
Subject: [PATCH 1/3] mm: move final page locking out of __do_fault() helper
 into callers

The old semantics of our __do_fault() helper were that it always locked
the page unless there was an error (or unless the fault handling had
already handled a COW event).

That turns out to be a mistake.  Not all callers actually want the page
locked at all, and they might as well check the same VM_FAULT_LOCKED bit
that __do_fault() itself checked to see whether the page is already
locked or not.

This change only moves that final page locking out into the callers, but
intentionally does not actually change any of the locking semantics: the
callers will now just do that final page locking themselves instead.

That means that future patches may then decide to not lock the page
after all, but this is just preparation for any such future change.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 5efa07fb6cdc..1e2796d68e43 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3647,11 +3647,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 		return VM_FAULT_HWPOISON;
 	}
 
-	if (unlikely(!(ret & VM_FAULT_LOCKED)))
-		lock_page(vmf->page);
-	else
-		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
-
 	return ret;
 }
 
@@ -3940,6 +3935,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf->page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
 	ret |= finish_fault(vmf);
 	unlock_page(vmf->page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3971,6 +3971,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
 	if (ret & VM_FAULT_DONE_COW)
 		return ret;
 
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf->page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
 	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
 	__SetPageUptodate(vmf->cow_page);
 
@@ -3994,6 +3999,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf->page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
 	/*
 	 * Check if the backing address space wants to know that the page is
 	 * about to become writable
-- 
2.30.0.352.gf6a05684e6


[-- Attachment #3: 0002-mm-don-t-lock-the-page-only-to-immediately-unlock-it.patch --]
[-- Type: text/x-patch, Size: 2276 bytes --]

From 1b2b061f622ea60924eba886975183306aa56972 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 13 Oct 2020 10:43:05 -0700
Subject: [PATCH 2/3] mm: don't lock the page, only to immediately unlock it
 again for do_page_mkwrite()

Our page locking during fault handling is a bit messy, and the shared fault
code in particular was locking the page only to immediately unlock it
again because do_page_mkwrite() wanted it unlocked.

We keep the "did we lock it" state around in the VM_FAULT_LOCKED bit, so
let's just use that knowledge, and not first lock it if it wasn't
locked, only to then unlock it again.

It would be even better to transfer the "did we already lock this page"
information into do_page_mkwrite(), because that function will actually
want to lock it eventually anyway, but let's just clean up one thing at
a time.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1e2796d68e43..d7e30273bef1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3999,25 +3999,33 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-	if (unlikely(!(ret & VM_FAULT_LOCKED)))
-		lock_page(vmf->page);
-	else
-		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
-
 	/*
 	 * Check if the backing address space wants to know that the page is
 	 * about to become writable
 	 */
 	if (vma->vm_ops->page_mkwrite) {
-		unlock_page(vmf->page);
+		/* do_page_mkwrite() wants the page unlocked */
+		if (ret & VM_FAULT_LOCKED) {
+			unlock_page(vmf->page);
+			ret &= ~VM_FAULT_LOCKED;
+		}
+
 		tmp = do_page_mkwrite(vmf);
 		if (unlikely(!tmp ||
 				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
 			put_page(vmf->page);
 			return tmp;
 		}
+
+		/* Did do_page_mkwrite() lock the page again? */
+		ret |= tmp & VM_FAULT_LOCKED;
 	}
 
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf->page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
 	ret |= finish_fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
 					VM_FAULT_RETRY))) {
-- 
2.30.0.352.gf6a05684e6


[-- Attachment #4: 0003-mm-do_cow_fault-does-not-need-the-source-page-to-be-.patch --]
[-- Type: text/x-patch, Size: 1984 bytes --]

From 7cf5ad81641214d2c4bb6404b64436a787791749 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 13 Oct 2020 11:00:33 -0700
Subject: [PATCH 3/3] mm: do_cow_fault() does not need the source page to be
 locked

This removes the "lock if it wasn't locked" logic from do_cow_fault(),
since we're not even going to install that page into the destination
address space (finish_fault() will use ->cow_page rather than ->page),
and copying the source page does not need the source to be locked.

So instead of doing "lock if it wasn't locked" followed by an
unconditional unlock of the page, just do "unlock if it was locked".

Of course, since all the normal file mapping ->fault() handlers
currently lock the page they return (see filemap_fault() for details),
all of this is pretty much theoretical.

But this is the right thing to do - making sure we hold the page lock
when we really don't need to is just confusing and wrong.  And this prepares the
way for any future changes to filemap_fault() where we go "Oh, we
actually _don't_ need to lock the page for this case at all".

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index d7e30273bef1..44653b5b2af6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3971,16 +3971,13 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
 	if (ret & VM_FAULT_DONE_COW)
 		return ret;
 
-	if (unlikely(!(ret & VM_FAULT_LOCKED)))
-		lock_page(vmf->page);
-	else
-		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
-
 	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
 	__SetPageUptodate(vmf->cow_page);
 
+	if (ret & VM_FAULT_LOCKED)
+		unlock_page(vmf->page);
+
 	ret |= finish_fault(vmf);
-	unlock_page(vmf->page);
 	put_page(vmf->page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
-- 
2.30.0.352.gf6a05684e6


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-03-17 23:14 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17  7:54 [PATCH v2] Increase page and bit waitqueue hash size Nicholas Piggin
2021-03-17  8:38 ` Ingo Molnar
2021-03-17 10:02   ` Nicholas Piggin
2021-03-17 10:12 ` Rasmus Villemoes
2021-03-17 10:44   ` Nicholas Piggin
2021-03-17 19:26     ` Linus Torvalds
2021-03-17 22:22       ` Nicholas Piggin
2021-03-17 23:13         ` Linus Torvalds
2021-03-17 11:25 ` kernel test robot
2021-03-17 11:30 ` kernel test robot
2021-03-17 12:38 ` [tip: sched/core] sched/wait_bit, mm/filemap: " tip-bot2 for Nicholas Piggin
2021-03-17 15:16   ` Thomas Gleixner
2021-03-17 19:54     ` Ingo Molnar
