All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-07  1:15 Shaohua Li
  2010-07-07  1:32   ` Andrew Morton
  2010-07-09  1:13   ` Hugh Dickins
  0 siblings, 2 replies; 11+ messages in thread
From: Shaohua Li @ 2010-07-07  1:15 UTC (permalink / raw)
  To: lkml, linux-mm; +Cc: Andrew Morton, Andi Kleen, Zhang, Yanmin

[-- Attachment #1: Type: text/plain, Size: 3922 bytes --]

I'm running a shmem pagefault test case (see attached file) under a 64 CPU
system. Profile shows shmem_inode_info->lock is heavily contended and 100%
of CPU time is spent trying to get the lock. In the pagefault (no swap) case,
shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
page so we could reduce one time of locking. This is what below patch does.

The result of the test case:
2.6.35-rc3: ~20s
2.6.35-rc3 + patch: ~12s
so this is 40% improvement.

One might argue if we could have better locking for shmem. But even if shmem were lockless,
the pagefault will soon have the pagecache lock heavily contended because shmem must add
new page to pagecache. So before we have better locking for pagecache, improving shmem
locking doesn't have too much improvement. I did a similar pagefault test against
a ramfs file, the test result is ~10.5s.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

diff --git a/mm/shmem.c b/mm/shmem.c
index f65f840..c5f2939 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 	struct shmem_sb_info *sbinfo;
 	struct page *filepage = *pagep;
 	struct page *swappage;
+	struct page *prealloc_page = NULL;
 	swp_entry_t *entry;
 	swp_entry_t swap;
 	gfp_t gfp;
@@ -1247,7 +1248,6 @@ repeat:
 		filepage = find_lock_page(mapping, idx);
 	if (filepage && PageUptodate(filepage))
 		goto done;
-	error = 0;
 	gfp = mapping_gfp_mask(mapping);
 	if (!filepage) {
 		/*
@@ -1258,7 +1258,19 @@ repeat:
 		if (error)
 			goto failed;
 		radix_tree_preload_end();
+		if (sgp != SGP_READ) {
+			/* don't care if this successes */
+			prealloc_page = shmem_alloc_page(gfp, info, idx);
+			if (prealloc_page) {
+				if (mem_cgroup_cache_charge(prealloc_page,
+				    current->mm, GFP_KERNEL)) {
+					page_cache_release(prealloc_page);
+					prealloc_page = NULL;
+				}
+			}
+		}
 	}
+	error = 0;
 
 	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
@@ -1407,28 +1419,37 @@ repeat:
 		if (!filepage) {
 			int ret;
 
-			spin_unlock(&info->lock);
-			filepage = shmem_alloc_page(gfp, info, idx);
-			if (!filepage) {
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				error = -ENOMEM;
-				goto failed;
-			}
-			SetPageSwapBacked(filepage);
+			if (!prealloc_page) {
+				spin_unlock(&info->lock);
+				filepage = shmem_alloc_page(gfp, info, idx);
+				if (!filepage) {
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					error = -ENOMEM;
+					goto failed;
+				}
+				SetPageSwapBacked(filepage);
 
-			/* Precharge page while we can wait, compensate after */
-			error = mem_cgroup_cache_charge(filepage, current->mm,
-					GFP_KERNEL);
-			if (error) {
-				page_cache_release(filepage);
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				filepage = NULL;
-				goto failed;
+				/* Precharge page while we can wait, compensate
+				 * after
+				 */
+				error = mem_cgroup_cache_charge(filepage,
+					current->mm, GFP_KERNEL);
+				if (error) {
+					page_cache_release(filepage);
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					filepage = NULL;
+					goto failed;
+				}
+
+				spin_lock(&info->lock);
+			} else {
+				filepage = prealloc_page;
+				prealloc_page = NULL;
+				SetPageSwapBacked(filepage);
 			}
 
-			spin_lock(&info->lock);
 			entry = shmem_swp_alloc(info, idx, sgp);
 			if (IS_ERR(entry))
 				error = PTR_ERR(entry);
@@ -1469,6 +1490,10 @@ repeat:
 	}
 done:
 	*pagep = filepage;
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return 0;
 
 failed:
@@ -1476,6 +1501,10 @@ failed:
 		unlock_page(filepage);
 		page_cache_release(filepage);
 	}
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return error;
 }
 


[-- Attachment #2: shmem-test.c --]
[-- Type: text/x-csrc, Size: 1293 bytes --]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#define THREAD_NUM (64L)
#define MEM_SIZE (1024*1024*1024*16L)
#define PER_TH_SIZE (MEM_SIZE/THREAD_NUM)

/*
 * Touch one byte in every page of this thread's slice so each access
 * takes a page fault.  @data is the base address of the slice
 * (PER_TH_SIZE bytes).  Returns NULL: pthread_create() requires a
 * void * return value, and falling off the end without a return is
 * undefined behavior if the value is ever consumed by pthread_join().
 */
void *thread_func(void *data)
{
	/* volatile read keeps the compiler from eliding the page touch */
	volatile char *addr = data;
	unsigned long size = PER_TH_SIZE, index = 0;

	while (index < size) {
		(void)*(addr + index);
		index += 4096;
	}
	return NULL;
}

/*
 * Spawn THREAD_NUM threads, each faulting in its own disjoint
 * PER_TH_SIZE slice of one shared anonymous mapping, and report the
 * elapsed wall-clock time for all faults to complete.
 */
int main(int argc, char *argv[])
{
	int i;
	pthread_t threads[THREAD_NUM];
	pthread_attr_t attr;
	struct timeval start, stop, diff;
	char *mem;

	/* fd must be -1 for portable anonymous mappings */
	mem = mmap(NULL, MEM_SIZE, PROT_READ|PROT_WRITE,
		MAP_SHARED|MAP_ANON, -1, 0);
	/* mmap() reports failure with MAP_FAILED, not NULL */
	if (mem == MAP_FAILED) {
		perror("mmap error");
		exit(1);
	}

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
	gettimeofday(&start, NULL);
	for (i = 0; i < THREAD_NUM; i++)
		if (pthread_create(&threads[i], &attr, thread_func,
			mem + PER_TH_SIZE * i)) {
			perror("thread create error");
			exit(1);
		}

	for (i = 0; i < THREAD_NUM; i++)
		pthread_join(threads[i], NULL);

	gettimeofday(&stop, NULL);
	timersub(&stop, &start, &diff);
	/* THREAD_NUM and MEM_SIZE/... are long: %ld, not %d */
	printf("Thread %ld Mem %ldG time %lu.%03lusec\n",
		THREAD_NUM, MEM_SIZE/1024/1024/1024,
		diff.tv_sec, diff.tv_usec/1000);

	pthread_attr_destroy(&attr);
	munmap(mem, MEM_SIZE);
	return 0;
}

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
  2010-07-07  1:15 [PATCH]shmem: reduce one time of locking in pagefault Shaohua Li
@ 2010-07-07  1:32   ` Andrew Morton
  2010-07-09  1:13   ` Hugh Dickins
  1 sibling, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2010-07-07  1:32 UTC (permalink / raw)
  To: Shaohua Li; +Cc: lkml, linux-mm, Andi Kleen, Zhang, Yanmin, Hugh Dickins

On Wed, 07 Jul 2010 09:15:46 +0800 Shaohua Li <shaohua.li@intel.com> wrote:

> I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> CPUs time are trying to get the lock.

I seem to remember complaining about that in 2002 ;) Faulting in a
mapping of /dev/zero is just awful on a 4-way(!).

> In the pagefault (no swap) case,
> shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> page so we could reduce one time of locking. This is what below patch does.
> 
> The result of the test case:
> 2.6.35-rc3: ~20s
> 2.6.35-rc3 + patch: ~12s
> so this is 40% improvement.
> 
> One might argue if we could have better locking for shmem. But even shmem is lockless,
> the pagefault will soon have pagecache lock heavily contented because shmem must add
> new page to pagecache. So before we have better locking for pagecache, improving shmem
> locking doesn't have too much improvement. I did a similar pagefault test against
> a ramfs file, the test result is ~10.5s.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index f65f840..c5f2939 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c

The patch doesn't make shmem_getpage() any clearer :(

shmem_inode_info.lock appears to be held too much.  Surely
lookup_swap_cache() didn't need it (for example).

What data does shmem_inode_info.lock actually protect?



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-07  1:32   ` Andrew Morton
  0 siblings, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2010-07-07  1:32 UTC (permalink / raw)
  To: Shaohua Li; +Cc: lkml, linux-mm, Andi Kleen, Zhang, Yanmin, Hugh Dickins

On Wed, 07 Jul 2010 09:15:46 +0800 Shaohua Li <shaohua.li@intel.com> wrote:

> I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> CPUs time are trying to get the lock.

I seem to remember complaining about that in 2002 ;) Faulting in a
mapping of /dev/zero is just awful on a 4-way(!).

> In the pagefault (no swap) case,
> shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> page so we could reduce one time of locking. This is what below patch does.
> 
> The result of the test case:
> 2.6.35-rc3: ~20s
> 2.6.35-rc3 + patch: ~12s
> so this is 40% improvement.
> 
> One might argue if we could have better locking for shmem. But even shmem is lockless,
> the pagefault will soon have pagecache lock heavily contented because shmem must add
> new page to pagecache. So before we have better locking for pagecache, improving shmem
> locking doesn't have too much improvement. I did a similar pagefault test against
> a ramfs file, the test result is ~10.5s.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index f65f840..c5f2939 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c

The patch doesn't make shmem_getpage() any clearer :(

shmem_inode_info.lock appears to be held too much.  Surely
lookup_swap_cache() didn't need it (for example).

What data does shmem_inode_info.lock actually protect?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
  2010-07-07  1:32   ` Andrew Morton
@ 2010-07-07  1:39     ` Shaohua Li
  -1 siblings, 0 replies; 11+ messages in thread
From: Shaohua Li @ 2010-07-07  1:39 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, linux-mm, Andi Kleen, Zhang, Yanmin, Hugh Dickins

On Wed, Jul 07, 2010 at 09:32:54AM +0800, Andrew Morton wrote:
> On Wed, 07 Jul 2010 09:15:46 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> 
> > I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> > system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> > CPUs time are trying to get the lock.
> 
> I seem to remember complaining about that in 2002 ;) Faulting in a
> mapping of /dev/zero is just awful on a 4-way(!).
> 
> > In the pagefault (no swap) case,
> > shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> > page so we could reduce one time of locking. This is what below patch does.
> > 
> > The result of the test case:
> > 2.6.35-rc3: ~20s
> > 2.6.35-rc3 + patch: ~12s
> > so this is 40% improvement.
> > 
> > One might argue if we could have better locking for shmem. But even shmem is lockless,
> > the pagefault will soon have pagecache lock heavily contented because shmem must add
> > new page to pagecache. So before we have better locking for pagecache, improving shmem
> > locking doesn't have too much improvement. I did a similar pagefault test against
> > a ramfs file, the test result is ~10.5s.
> > 
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index f65f840..c5f2939 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> 
> The patch doesn't make shmem_getpage() any clearer :(
> 
> shmem_inode_info.lock appears to be held too much.  Surely
> lookup_swap_cache() didn't need it (for example).
> 
> What data does shmem_inode_info.lock actually protect?
As far as my understanding, it protects shmem swp_entry, which is most used
to support swap. It also protects some accounting. If no swap, the lock almost
can be removed like tiny-shmem.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-07  1:39     ` Shaohua Li
  0 siblings, 0 replies; 11+ messages in thread
From: Shaohua Li @ 2010-07-07  1:39 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml, linux-mm, Andi Kleen, Zhang, Yanmin, Hugh Dickins

On Wed, Jul 07, 2010 at 09:32:54AM +0800, Andrew Morton wrote:
> On Wed, 07 Jul 2010 09:15:46 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> 
> > I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> > system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> > CPUs time are trying to get the lock.
> 
> I seem to remember complaining about that in 2002 ;) Faulting in a
> mapping of /dev/zero is just awful on a 4-way(!).
> 
> > In the pagefault (no swap) case,
> > shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> > page so we could reduce one time of locking. This is what below patch does.
> > 
> > The result of the test case:
> > 2.6.35-rc3: ~20s
> > 2.6.35-rc3 + patch: ~12s
> > so this is 40% improvement.
> > 
> > One might argue if we could have better locking for shmem. But even shmem is lockless,
> > the pagefault will soon have pagecache lock heavily contented because shmem must add
> > new page to pagecache. So before we have better locking for pagecache, improving shmem
> > locking doesn't have too much improvement. I did a similar pagefault test against
> > a ramfs file, the test result is ~10.5s.
> > 
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index f65f840..c5f2939 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> 
> The patch doesn't make shmem_getpage() any clearer :(
> 
> shmem_inode_info.lock appears to be held too much.  Surely
> lookup_swap_cache() didn't need it (for example).
> 
> What data does shmem_inode_info.lock actually protect?
As far as my understanding, it protects shmem swp_entry, which is most used
to support swap. It also protects some accounting. If no swap, the lock almost
can be removed like tiny-shmem.

Thanks,
Shaohua

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
  2010-07-07  1:15 [PATCH]shmem: reduce one time of locking in pagefault Shaohua Li
@ 2010-07-09  1:13   ` Hugh Dickins
  2010-07-09  1:13   ` Hugh Dickins
  1 sibling, 0 replies; 11+ messages in thread
From: Hugh Dickins @ 2010-07-09  1:13 UTC (permalink / raw)
  To: Shaohua Li
  Cc: lkml, linux-mm, Andrew Morton, Andi Kleen, Zhang, Yanmin, Tim Chen

On Wed, 7 Jul 2010, Shaohua Li wrote:

> I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> CPUs time are trying to get the lock. In the pagefault (no swap) case,
> shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> page so we could reduce one time of locking. This is what below patch does.

Right.  As usual, I'm rather unenthusiastic about a patch which has to
duplicate code paths to satisfy an artificial testcase; but I can see
the appeal.

We can ignore that you're making the swap path slower, that will be lost
in its noise.  I did like the way the old code checked the max_blocks
limit before it let you allocate the page: whereas you might have many
threads simultaneously over-allocating before reaching that check; but
I guess we can live with that.

> 
> The result of the test case:
> 2.6.35-rc3: ~20s
> 2.6.35-rc3 + patch: ~12s
> so this is 40% improvement.

Was that with or without Tim's shmem_sb_info max_blocks scalability
changes (that I've still not studied)?  Or max_blocks 0 (unlimited)?

I notice your test case lets each thread fault in from its own
disjoint part of the whole area.  Please also test with each thread
touching each page in the whole area at the same time: which I think
is just as likely a case, but not obvious to me how well it would
work with your changes - what numbers does it show?

> 
> One might argue if we could have better locking for shmem. But even shmem is lockless,
> the pagefault will soon have pagecache lock heavily contented because shmem must add
> new page to pagecache. So before we have better locking for pagecache, improving shmem
> locking doesn't have too much improvement. I did a similar pagefault test against
> a ramfs file, the test result is ~10.5s.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index f65f840..c5f2939 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
...
> @@ -1258,7 +1258,19 @@ repeat:
>  		if (error)
>  			goto failed;
>  		radix_tree_preload_end();
> +		if (sgp != SGP_READ) {

Don't you need to check that prealloc_page is not already set there?
There are several places in the swap path where it has to goto repeat.

> +			/* don't care if this successes */
> +			prealloc_page = shmem_alloc_page(gfp, info, idx);
> +			if (prealloc_page) {
> +				if (mem_cgroup_cache_charge(prealloc_page,
> +				    current->mm, GFP_KERNEL)) {
> +					page_cache_release(prealloc_page);
> +					prealloc_page = NULL;
> +				}
> +			}
> +		}
>  	}

Hugh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-09  1:13   ` Hugh Dickins
  0 siblings, 0 replies; 11+ messages in thread
From: Hugh Dickins @ 2010-07-09  1:13 UTC (permalink / raw)
  To: Shaohua Li
  Cc: lkml, linux-mm, Andrew Morton, Andi Kleen, Zhang, Yanmin, Tim Chen

On Wed, 7 Jul 2010, Shaohua Li wrote:

> I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> CPUs time are trying to get the lock. In the pagefault (no swap) case,
> shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> page so we could reduce one time of locking. This is what below patch does.

Right.  As usual, I'm rather unenthusiastic about a patch which has to
duplicate code paths to satisfy an artificial testcase; but I can see
the appeal.

We can ignore that you're making the swap path slower, that will be lost
in its noise.  I did like the way the old code checked the max_blocks
limit before it let you allocate the page: whereas you might have many
threads simultaneously over-allocating before reaching that check; but
I guess we can live with that.

> 
> The result of the test case:
> 2.6.35-rc3: ~20s
> 2.6.35-rc3 + patch: ~12s
> so this is 40% improvement.

Was that with or without Tim's shmem_sb_info max_blocks scalability
changes (that I've still not studied)?  Or max_blocks 0 (unlimited)?

I notice your test case lets each thread fault in from its own
disjoint part of the whole area.  Please also test with each thread
touching each page in the whole area at the same time: which I think
is just as likely a case, but not obvious to me how well it would
work with your changes - what numbers does it show?

> 
> One might argue if we could have better locking for shmem. But even shmem is lockless,
> the pagefault will soon have pagecache lock heavily contented because shmem must add
> new page to pagecache. So before we have better locking for pagecache, improving shmem
> locking doesn't have too much improvement. I did a similar pagefault test against
> a ramfs file, the test result is ~10.5s.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index f65f840..c5f2939 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
...
> @@ -1258,7 +1258,19 @@ repeat:
>  		if (error)
>  			goto failed;
>  		radix_tree_preload_end();
> +		if (sgp != SGP_READ) {

Don't you need to check that prealloc_page is not already set there?
There are several places in the swap path where it has to goto repeat.

> +			/* don't care if this successes */
> +			prealloc_page = shmem_alloc_page(gfp, info, idx);
> +			if (prealloc_page) {
> +				if (mem_cgroup_cache_charge(prealloc_page,
> +				    current->mm, GFP_KERNEL)) {
> +					page_cache_release(prealloc_page);
> +					prealloc_page = NULL;
> +				}
> +			}
> +		}
>  	}

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
  2010-07-07  1:39     ` Shaohua Li
@ 2010-07-09  1:28       ` Hugh Dickins
  -1 siblings, 0 replies; 11+ messages in thread
From: Hugh Dickins @ 2010-07-09  1:28 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Andrew Morton, lkml, linux-mm, Andi Kleen, Zhang, Yanmin

On Wed, 7 Jul 2010, Shaohua Li wrote:
> On Wed, Jul 07, 2010 at 09:32:54AM +0800, Andrew Morton wrote:
> > 
> > The patch doesn't make shmem_getpage() any clearer :(

:)

> > 
> > shmem_inode_info.lock appears to be held too much.  Surely
> > lookup_swap_cache() didn't need it (for example).
> > 
> > What data does shmem_inode_info.lock actually protect?
> As far as my understanding, it protects shmem swp_entry, which is most used
> to support swap. It also protects some accounting. If no swap, the lock almost
> can be removed like tiny-shmem.

That's right: shmem_info_info.lock protects what's in shmem_inode_info,
plus what hangs off it (the shmem_swp blocks).

We want that lock across the lookup_swap_cache() to be sure that what we
find is still what we want (otherwise another thread might bring it out
of swap and that swap be reused for something else) - the page lock is
good once you have a page to lock, but until then....  I guess could be
done by dropping the lock then retaking and rechecking after, but that
would go right against the grain of this patch.

Hugh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-09  1:28       ` Hugh Dickins
  0 siblings, 0 replies; 11+ messages in thread
From: Hugh Dickins @ 2010-07-09  1:28 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Andrew Morton, lkml, linux-mm, Andi Kleen, Zhang, Yanmin

On Wed, 7 Jul 2010, Shaohua Li wrote:
> On Wed, Jul 07, 2010 at 09:32:54AM +0800, Andrew Morton wrote:
> > 
> > The patch doesn't make shmem_getpage() any clearer :(

:)

> > 
> > shmem_inode_info.lock appears to be held too much.  Surely
> > lookup_swap_cache() didn't need it (for example).
> > 
> > What data does shmem_inode_info.lock actually protect?
> As far as my understanding, it protects shmem swp_entry, which is most used
> to support swap. It also protects some accounting. If no swap, the lock almost
> can be removed like tiny-shmem.

That's right: shmem_info_info.lock protects what's in shmem_inode_info,
plus what hangs off it (the shmem_swp blocks).

We want that lock across the lookup_swap_cache() to be sure that what we
find is still what we want (otherwise another thread might bring it out
of swap and that swap be reused for something else) - the page lock is
good once you have a page to lock, but until then....  I guess could be
done by dropping the lock then retaking and rechecking after, but that
would go right against the grain of this patch.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
  2010-07-09  1:13   ` Hugh Dickins
@ 2010-07-09  2:52     ` Shaohua Li
  -1 siblings, 0 replies; 11+ messages in thread
From: Shaohua Li @ 2010-07-09  2:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: lkml, linux-mm, Andrew Morton, Andi Kleen, Zhang, Yanmin, Tim Chen

On Fri, Jul 09, 2010 at 09:13:55AM +0800, Hugh Dickins wrote:
> On Wed, 7 Jul 2010, Shaohua Li wrote:
> 
> > I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> > system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> > CPUs time are trying to get the lock. In the pagefault (no swap) case,
> > shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> > page so we could reduce one time of locking. This is what below patch does.
> 
> Right.  As usual, I'm rather unenthusiastic about a patch which has to
> duplicate code paths to satisfy an artificial testcase; but I can see
> the appeal.
> 
> We can ignore that you're making the swap path slower, that will be lost
> in its noise.  I did like the way the old code checked the max_blocks
> limit before it let you allocate the page: whereas you might have many
> threads simultaneously over-allocating before reaching that check; but
> I guess we can live with that.
> 
> > 
> > The result of the test case:
> > 2.6.35-rc3: ~20s
> > 2.6.35-rc3 + patch: ~12s
> > so this is 40% improvement.
> 
> Was that with or without Tim's shmem_sb_info max_blocks scalability
> changes (that I've still not studied)?  Or max_blocks 0 (unlimited)?
no Tim's patch. max_blocks 0.
 
> I notice your test case lets each thread fault in from its own
> disjoint part of the whole area.  Please also test with each thread
> touching each page in the whole area at the same time: which I think
> is just as likely a case, but not obvious to me how well it would
> work with your changes - what numbers does it show?
Tried this (I must use less memory (1G) because this is quite slow):
2.6.35-rc5: ~78s (quite stable in 6 run)
2.6.35-rc5 + patch: not stable. I collect 6 data: 75.5s, 20.9s, 76.1s, 14.6s
22.3s, 75.7s. So sometimes there are big improvements, sometimes not. But
not worse anyway.

> > One might argue if we could have better locking for shmem. But even shmem is lockless,
> > the pagefault will soon have pagecache lock heavily contented because shmem must add
> > new page to pagecache. So before we have better locking for pagecache, improving shmem
> > locking doesn't have too much improvement. I did a similar pagefault test against
> > a ramfs file, the test result is ~10.5s.
> > 
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index f65f840..c5f2939 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> ...
> > @@ -1258,7 +1258,19 @@ repeat:
> >  		if (error)
> >  			goto failed;
> >  		radix_tree_preload_end();
> > +		if (sgp != SGP_READ) {
> 
> Don't you need to check that prealloc_page is not already set there?
> There are several places in the swap path where it has to goto repeat.
Thanks for pointing out this. Updated patch.


I'm running a shmem pagefault test case (see attached file) under a 64 CPU
system. Profile shows shmem_inode_info->lock is heavily contented and 100%
CPUs time are trying to get the lock. In the pagefault (no swap) case,
shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
page so we could reduce one time of locking. This is what below patch does.

The result of the test case:
2.6.35-rc3: ~20s
2.6.35-rc3 + patch: ~12s
so this is 40% improvement.

One might argue if we could have better locking for shmem. But even shmem is lockless,
the pagefault will soon have pagecache lock heavily contented because shmem must add
new page to pagecache. So before we have better locking for pagecache, improving shmem
locking doesn't have too much improvement. I did a similar pagefault test against
a ramfs file, the test result is ~10.5s.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 mm/shmem.c |   69 +++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 49 insertions(+), 20 deletions(-)

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2010-07-10 09:15:05.000000000 +0800
+++ linux-2.6/mm/shmem.c	2010-07-10 09:24:34.000000000 +0800
@@ -1223,6 +1223,7 @@
 	struct shmem_sb_info *sbinfo;
 	struct page *filepage = *pagep;
 	struct page *swappage;
+	struct page *prealloc_page = NULL;
 	swp_entry_t *entry;
 	swp_entry_t swap;
 	gfp_t gfp;
@@ -1247,7 +1248,6 @@
 		filepage = find_lock_page(mapping, idx);
 	if (filepage && PageUptodate(filepage))
 		goto done;
-	error = 0;
 	gfp = mapping_gfp_mask(mapping);
 	if (!filepage) {
 		/*
@@ -1258,7 +1258,19 @@
 		if (error)
 			goto failed;
 		radix_tree_preload_end();
+		if (sgp != SGP_READ && !prealloc_page) {
+			/* don't care if this successes */
+			prealloc_page = shmem_alloc_page(gfp, info, idx);
+			if (prealloc_page) {
+				if (mem_cgroup_cache_charge(prealloc_page,
+				    current->mm, GFP_KERNEL)) {
+					page_cache_release(prealloc_page);
+					prealloc_page = NULL;
+				}
+			}
+		}
 	}
+	error = 0;
 
 	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
@@ -1407,28 +1419,37 @@
 		if (!filepage) {
 			int ret;
 
-			spin_unlock(&info->lock);
-			filepage = shmem_alloc_page(gfp, info, idx);
-			if (!filepage) {
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				error = -ENOMEM;
-				goto failed;
-			}
-			SetPageSwapBacked(filepage);
+			if (!prealloc_page) {
+				spin_unlock(&info->lock);
+				filepage = shmem_alloc_page(gfp, info, idx);
+				if (!filepage) {
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					error = -ENOMEM;
+					goto failed;
+				}
+				SetPageSwapBacked(filepage);
 
-			/* Precharge page while we can wait, compensate after */
-			error = mem_cgroup_cache_charge(filepage, current->mm,
-					GFP_KERNEL);
-			if (error) {
-				page_cache_release(filepage);
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				filepage = NULL;
-				goto failed;
+				/* Precharge page while we can wait, compensate
+				 * after
+				 */
+				error = mem_cgroup_cache_charge(filepage,
+					current->mm, GFP_KERNEL);
+				if (error) {
+					page_cache_release(filepage);
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					filepage = NULL;
+					goto failed;
+				}
+
+				spin_lock(&info->lock);
+			} else {
+				filepage = prealloc_page;
+				prealloc_page = NULL;
+				SetPageSwapBacked(filepage);
 			}
 
-			spin_lock(&info->lock);
 			entry = shmem_swp_alloc(info, idx, sgp);
 			if (IS_ERR(entry))
 				error = PTR_ERR(entry);
@@ -1469,6 +1490,10 @@
 	}
 done:
 	*pagep = filepage;
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return 0;
 
 failed:
@@ -1476,6 +1501,10 @@
 		unlock_page(filepage);
 		page_cache_release(filepage);
 	}
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return error;
 }
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]shmem: reduce one time of locking in pagefault
@ 2010-07-09  2:52     ` Shaohua Li
  0 siblings, 0 replies; 11+ messages in thread
From: Shaohua Li @ 2010-07-09  2:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: lkml, linux-mm, Andrew Morton, Andi Kleen, Zhang, Yanmin, Tim Chen

On Fri, Jul 09, 2010 at 09:13:55AM +0800, Hugh Dickins wrote:
> On Wed, 7 Jul 2010, Shaohua Li wrote:
> 
> > I'm running a shmem pagefault test case (see attached file) under a 64 CPU
> > system. Profile shows shmem_inode_info->lock is heavily contented and 100%
> > CPUs time are trying to get the lock. In the pagefault (no swap) case,
> > shmem_getpage gets the lock twice, the last one is avoidable if we prealloc a
> > page so we could reduce one time of locking. This is what below patch does.
> 
> Right.  As usual, I'm rather unenthusiastic about a patch which has to
> duplicate code paths to satisfy an artificial testcase; but I can see
> the appeal.
> 
> We can ignore that you're making the swap path slower, that will be lost
> in its noise.  I did like the way the old code checked the max_blocks
> limit before it let you allocate the page: whereas you might have many
> threads simultaneously over-allocating before reaching that check; but
> I guess we can live with that.
> 
> > 
> > The result of the test case:
> > 2.6.35-rc3: ~20s
> > 2.6.35-rc3 + patch: ~12s
> > so this is 40% improvement.
> 
> Was that with or without Tim's shmem_sb_info max_blocks scalability
> changes (that I've still not studied)?  Or max_blocks 0 (unlimited)?
No, without Tim's patch; max_blocks is 0.
 
> I notice your test case lets each thread fault in from its own
> disjoint part of the whole area.  Please also test with each thread
> touching each page in the whole area at the same time: which I think
> is just as likely a case, but not obvious to me how well it would
> work with your changes - what numbers does it show?
Tried this (I had to use less memory (1G) because this is quite slow):
2.6.35-rc5: ~78s (quite stable over 6 runs)
2.6.35-rc5 + patch: not stable. I collected 6 data points: 75.5s, 20.9s, 76.1s, 14.6s,
22.3s, 75.7s. So sometimes there are big improvements, sometimes not. But it is
never worse, anyway.

> > One might argue if we could have better locking for shmem. But even shmem is lockless,
> > the pagefault will soon have pagecache lock heavily contented because shmem must add
> > new page to pagecache. So before we have better locking for pagecache, improving shmem
> > locking doesn't have too much improvement. I did a similar pagefault test against
> > a ramfs file, the test result is ~10.5s.
> > 
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index f65f840..c5f2939 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> ...
> > @@ -1258,7 +1258,19 @@ repeat:
> >  		if (error)
> >  			goto failed;
> >  		radix_tree_preload_end();
> > +		if (sgp != SGP_READ) {
> 
> Don't you need to check that prealloc_page is not already set there?
> There are several places in the swap path where it has to goto repeat.
Thanks for pointing this out. Updated patch below.


I'm running a shmem pagefault test case (see attached file) on a 64-CPU
system. Profiling shows shmem_inode_info->lock is heavily contended, with 100% of
CPU time spent trying to acquire the lock. In the pagefault (no swap) case,
shmem_getpage takes the lock twice; the second acquisition is avoidable if we
preallocate a page, saving one round of locking. This is what the patch below does.

The result of the test case:
2.6.35-rc3: ~20s
2.6.35-rc3 + patch: ~12s
so this is 40% improvement.

One might argue that we could devise better locking for shmem. But even if shmem were
lockless, the pagefault path would soon see the pagecache lock heavily contended, because
shmem must add the new page to the pagecache. So until we have better locking for the
pagecache, improving shmem's locking doesn't buy much. I ran a similar pagefault test
against a ramfs file; that test's result is ~10.5s.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 mm/shmem.c |   69 +++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 49 insertions(+), 20 deletions(-)

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2010-07-10 09:15:05.000000000 +0800
+++ linux-2.6/mm/shmem.c	2010-07-10 09:24:34.000000000 +0800
@@ -1223,6 +1223,7 @@
 	struct shmem_sb_info *sbinfo;
 	struct page *filepage = *pagep;
 	struct page *swappage;
+	struct page *prealloc_page = NULL;
 	swp_entry_t *entry;
 	swp_entry_t swap;
 	gfp_t gfp;
@@ -1247,7 +1248,6 @@
 		filepage = find_lock_page(mapping, idx);
 	if (filepage && PageUptodate(filepage))
 		goto done;
-	error = 0;
 	gfp = mapping_gfp_mask(mapping);
 	if (!filepage) {
 		/*
@@ -1258,7 +1258,19 @@
 		if (error)
 			goto failed;
 		radix_tree_preload_end();
+		if (sgp != SGP_READ && !prealloc_page) {
+			/* don't care if this successes */
+			prealloc_page = shmem_alloc_page(gfp, info, idx);
+			if (prealloc_page) {
+				if (mem_cgroup_cache_charge(prealloc_page,
+				    current->mm, GFP_KERNEL)) {
+					page_cache_release(prealloc_page);
+					prealloc_page = NULL;
+				}
+			}
+		}
 	}
+	error = 0;
 
 	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
@@ -1407,28 +1419,37 @@
 		if (!filepage) {
 			int ret;
 
-			spin_unlock(&info->lock);
-			filepage = shmem_alloc_page(gfp, info, idx);
-			if (!filepage) {
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				error = -ENOMEM;
-				goto failed;
-			}
-			SetPageSwapBacked(filepage);
+			if (!prealloc_page) {
+				spin_unlock(&info->lock);
+				filepage = shmem_alloc_page(gfp, info, idx);
+				if (!filepage) {
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					error = -ENOMEM;
+					goto failed;
+				}
+				SetPageSwapBacked(filepage);
 
-			/* Precharge page while we can wait, compensate after */
-			error = mem_cgroup_cache_charge(filepage, current->mm,
-					GFP_KERNEL);
-			if (error) {
-				page_cache_release(filepage);
-				shmem_unacct_blocks(info->flags, 1);
-				shmem_free_blocks(inode, 1);
-				filepage = NULL;
-				goto failed;
+				/* Precharge page while we can wait, compensate
+				 * after
+				 */
+				error = mem_cgroup_cache_charge(filepage,
+					current->mm, GFP_KERNEL);
+				if (error) {
+					page_cache_release(filepage);
+					shmem_unacct_blocks(info->flags, 1);
+					shmem_free_blocks(inode, 1);
+					filepage = NULL;
+					goto failed;
+				}
+
+				spin_lock(&info->lock);
+			} else {
+				filepage = prealloc_page;
+				prealloc_page = NULL;
+				SetPageSwapBacked(filepage);
 			}
 
-			spin_lock(&info->lock);
 			entry = shmem_swp_alloc(info, idx, sgp);
 			if (IS_ERR(entry))
 				error = PTR_ERR(entry);
@@ -1469,6 +1490,10 @@
 	}
 done:
 	*pagep = filepage;
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return 0;
 
 failed:
@@ -1476,6 +1501,10 @@
 		unlock_page(filepage);
 		page_cache_release(filepage);
 	}
+	if (prealloc_page) {
+		mem_cgroup_uncharge_cache_page(prealloc_page);
+		page_cache_release(prealloc_page);
+	}
 	return error;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-07-09  2:52 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-07-07  1:15 [PATCH]shmem: reduce one time of locking in pagefault Shaohua Li
2010-07-07  1:32 ` Andrew Morton
2010-07-07  1:32   ` Andrew Morton
2010-07-07  1:39   ` Shaohua Li
2010-07-07  1:39     ` Shaohua Li
2010-07-09  1:28     ` Hugh Dickins
2010-07-09  1:28       ` Hugh Dickins
2010-07-09  1:13 ` Hugh Dickins
2010-07-09  1:13   ` Hugh Dickins
2010-07-09  2:52   ` Shaohua Li
2010-07-09  2:52     ` Shaohua Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.