All of lore.kernel.org
 help / color / mirror / Atom feed
* [TESTCASE] Clean pages clogging the VM
@ 2010-08-09 13:30 Matthew Wilcox
  2010-08-17 19:50   ` Matthew Wilcox
  0 siblings, 1 reply; 19+ messages in thread
From: Matthew Wilcox @ 2010-08-09 13:30 UTC (permalink / raw)
  To: linux-mm


This testcase shows some odd behaviour from the Linux VM.

It creates a 1TB sparse file, mmaps it, and randomly reads locations 
in it.  Due to the file being entirely sparse, the VM allocates new pages
and zeroes them.  Initially, it runs very fast, taking on the order of
2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
and starts doing huge amounts of work trying to figure out which of
these clean pages to throw away.  In my testing with a 6GB machine 
and 2.9GHz CPU, one in every 15,000 page faults takes over a second, 
and one in every 40,000 page faults takes over seven seconds!

This test-case demonstrates a problem that occurs with a read-mostly 
mmap of a file on very fast media.  I wouldn't like to see a solution
that special-cases zeroed pages.  I think userspace has done its part
to tell the kernel what it's doing by calling madvise(MADV_RANDOM).
This ought to be enough to hint to the kernel that it should be eagerly
throwing away pages in this VMA.


/*
 * Copyright (c) 2010, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define _GNU_SOURCE	/* for cpu_set_t, CPU_ZERO/CPU_SET, pthread_setaffinity_np */

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

/*
 * rdtscll(val): read the x86 time-stamp counter into 'val'.
 * RDTSC returns the 64-bit cycle count split across EDX:EAX; the two
 * 32-bit halves are recombined below.  NOTE(review): assumes unsigned
 * long is 64 bits (LP64 build) -- confirm before using on 32-bit.
 */
#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)


/* Limits and defaults for the command-line tunables declared below. */
#define MAX_FILE_SIZE	((off_t)1024 * 1024 * 1024 * 1024)	/* 1TB sparse file (-a default) */
#define	MAX_FILE_IOS	16384	/* hard cap on -n; sizes offset_buf[] in mmap_test() */
#define	MAX_LATENCY	10000000		// usecs
#define NUM_IOS		1024	/* -n default */
#define IO_SIZE		4096	/* -s default */
#define BUFFER_SIZE	(1024 * 1024)	/* -b default */

pthread_t tid;		/* the single worker thread created in main() */
double 	cpu_clock;	/* measured TSC ticks per second (see calibration in main) */
long long unsigned cpu_start, cpu_stop;	/* TSC samples bracketing the calibration sleep */

void *mmap_test(void *arg);
void die ();

/* printf format string for the help text; the lone %s is the program name. */
static const char usage_cmds[] =
"usage: %s [options]\n"
"cmd line options:\n"
"    -f	file_name	Read from File named 'file_name'\n"
"    -a	file_size	File of 'file_size' Bytes/thread\n"
"    -b	buffer_size	Write/Read into/from buffer of 'buffer_size' Bytes/thread\n"
"    -n	num_file_ios	Process 'num_file_ios' IOs\n"
"    -s	io_size		IO Size = 'io_size' Bytes\n"
"    -l max_latency     Show latency stats based on usecs of max_latency\n"
;

/* Print the help text (usage_cmds, with 'program' substituted) to stderr. */
void usage(const char *program)
{
	fprintf(stderr, usage_cmds, program);
}

/*
 * Run-time tunables; each is overridable by the command-line option
 * named in its trailing comment.  Written only during option parsing
 * in main(), before the worker thread starts.
 */
off_t file_size = MAX_FILE_SIZE;	// -a
long long unsigned int buffer_size = BUFFER_SIZE;	// -b
char *filename = "sparse-file";			// -f
int	num_file_ios = NUM_IOS;		// -n
int	max_latency = MAX_LATENCY;	// -l
int	io_size = IO_SIZE;		// -s
long long unsigned int   latency_limit;	/* max_latency converted to TSC cycles (set in main) */

/*
 * Parse options, calibrate the TSC, then run the mmap fault-latency
 * test in a single worker thread pinned to CPU 0.
 *
 * Fixes over the original:
 *  - the optstring had "h:" (making -h swallow the next option as its
 *    argument) and a dead "p:" entry with no matching case;
 *  - /proc/cpuinfo parse failures did "return 0.0" from int main();
 *    the value is informational only, so failures are now non-fatal;
 *  - buffer_size (unsigned) was printed with %lld;
 *  - the "Measuring..." prompt is flushed before the 5 second sleep.
 */
int main(int argc, char **argv)
{
	pthread_attr_t	attr;
	cpu_set_t	mask;
	FILE *proc;
	char buf[256];
	double mhz = 0.0;

	while (1) {
		int option = getopt(argc, argv, "a:b:f:hl:n:s:");
		if (option == -1)
			break;
		switch (option) {
		case 'a':
			file_size = (off_t)strtoull(optarg, NULL, 0);
			printf("a: file_size:%lld Bytes :%lld MB\n",
			       (long long)file_size,
			       (long long)(file_size / (1024 * 1024)));
			break;
		case 'b':
			buffer_size = strtoull(optarg, NULL, 0);
			printf("b: buffer_size:%llu Bytes\n", buffer_size);
			break;
		case 'f':
			filename = optarg;
			printf("f: filename:%s\n", filename);
			break;
		case 'h':
			printf("h: options\n");
			goto help;
		case 'l':
			max_latency = strtoul(optarg, NULL, 0);
			printf("l: latency stats based on max latency:%d\n", max_latency);
			break;
		case 'n':
			num_file_ios = strtoul(optarg, NULL, 0);
			printf("n: num_file_ios:%d\n", num_file_ios);
			if (num_file_ios > MAX_FILE_IOS) {
				printf("-n %d Entered > MAX_FILE_IOS:%d\n", num_file_ios, MAX_FILE_IOS);
				exit(1);
			}
			break;
		case 's':
			io_size = strtoul(optarg, NULL, 0);
			printf("s: io_size:%d Bytes\n", io_size);
			break;
		default:
		help:
			usage(argv[0]);
			printf("default:\n");
			exit(1);
		}
	}

	/*
	 * Report the frequency /proc/cpuinfo claims.  Informational only:
	 * the clock actually used for cycle conversion is measured against
	 * the TSC below, so any failure here is deliberately non-fatal.
	 */
	proc = fopen("/proc/cpuinfo", "r");
	if (proc) {
		while (fgets(buf, sizeof buf, proc)) {
			double cpu;

			if (sscanf(buf, "cpu MHz : %lf", &cpu) != 1)
				continue;
			if (mhz == 0.0) {
				mhz = cpu;
				continue;
			}
			if (mhz != cpu) {
				fprintf(stderr,
					"Conflicting CPU frequency values: %lf != %lf\n",
					mhz, cpu);
				break;
			}
		}
		fclose(proc);
		printf("CPU Clock Freq from /proc/cpuinfo:%.4f\n", mhz);
	}

	/* Measure the CPU core frequency (TSC ticks/sec) over 5 seconds. */
	printf("Measuring CPU Frequency......:");
	fflush(stdout);		/* prompt has no newline; show it before the sleep */
	rdtscll(cpu_start);
	usleep(5000000);
	rdtscll(cpu_stop);
	cpu_clock = (double)(cpu_stop - cpu_start) / 5.0;
	printf("%.3f\n", cpu_clock);
	latency_limit = (long long unsigned int)(cpu_clock * max_latency / 1000000);
	printf("latency_limit:%llu cycles or %d usecs\n", latency_limit, max_latency);

	pthread_attr_init(&attr);
	pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
	pthread_attr_setstacksize(&attr, (size_t)(1024 * 1024));

	if (pthread_create(&tid, &attr, mmap_test, (void *)(long)0) != 0)
		die("Thread create failed!");

	/* Pin the worker to CPU 0 so all rdtsc deltas come from one TSC. */
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	if (pthread_setaffinity_np(tid, sizeof(mask), &mask)) {
		printf("WARNING: could not set CPU Affinity, exit...\n");
		exit(1);
	}

	pthread_join(tid, NULL);
	sleep(1);

	return 0;
}


/*
 * Print 'string' to stderr, prefixed by a blank line and the program
 * tag "mmap_test: ", then terminate the process with exit status 1.
 * Never returns.
 */
void die(char *string)
{
	fputs("\nmmap_test: ", stderr);
	fputs(string, stderr);
	fputc('\n', stderr);
	exit(1);
}

/*
 * Open 'fname' (creating it if necessary), grow it to at least 'size'
 * bytes, and map it MAP_SHARED with read/write access.  The open file
 * descriptor is returned through *filed and the mapping address is
 * returned.  Any failure prints a diagnostic and exits the process.
 *
 * Fix: the original ignored the return values of fstat() and
 * ftruncate(); a failed ftruncate() would later SIGBUS on access
 * beyond EOF, far from the real cause.
 */
void *mmapfile(char *fname, off_t size, int *filed)
{
	int fd;
	void *file_addr;
	struct stat statbuf;

	fd = open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	*filed = fd;
	if (fd < 0) {
		fprintf(stderr, "unable to open %s to get an FD:%s\n", fname, strerror(errno));
		exit(1);
	}

	if (fstat(fd, &statbuf) < 0) {
		fprintf(stderr, "fstat of %s failed: %s\n", fname, strerror(errno));
		exit(1);
	}
	if (statbuf.st_size < size && ftruncate(fd, size) < 0) {
		fprintf(stderr, "ftruncate of %s failed: %s\n", fname, strerror(errno));
		exit(1);
	}

	file_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (file_addr == MAP_FAILED) {
		fprintf(stderr, "datafile mmap failed: %s\n", strerror(errno));
		exit(1);
	}

	/* Advisory only; a failure here just loses the random-access hint. */
	madvise(file_addr, size, MADV_RANDOM);
	return file_addr;
}

/*
 * Fill offset_buf[0..num_file_ios-1] with pseudo-random offsets into
 * the data file, each rounded down to an io_size boundary.  The seed
 * is derived from the current time, divided by the thread number so
 * concurrent threads get different sequences.
 *
 * Fix: the original seeded srandom() but drew numbers from rand();
 * per C/POSIX those are separate generators, so the seed had no
 * portable effect.  Seed srand(), matching the rand() calls below.
 */
void create_offsets(off_t *offset_buf, int threadnum)
{
	int i;
	time_t curr_time = time(NULL);	/* was int: truncated time_t */

	srand(curr_time / (threadnum + 1));

	for (i = 0; i < num_file_ios; i++) {
		/* % RAND_MAX keeps random1 strictly < 1.0, so the offset
		 * stays strictly inside the file_size-byte mapping. */
		double random1 = ((double)(rand() % (RAND_MAX)) / RAND_MAX);
		offset_buf[i] = file_size * random1;
		offset_buf[i] = offset_buf[i] / io_size * io_size;	/* align down */
	}
}

/*
 * Worker thread body.  Maps the (sparse) data file, then for each of
 * num_file_ios random file offsets touches one byte in every io_size
 * chunk of a buffer_size window, printing the TSC cycle count of each
 * access -- each touch of a not-yet-resident page is a page fault, so
 * the printed numbers are per-fault latencies in cycles.
 *
 * Fixes: the posix_memalign() result was unchecked (NULL deref on
 * failure), and the unsigned cycle delta was printed with %lld.
 */
void *mmap_test(void *arg)
{
	int threadnum = (long) arg;
	int fd;
	char *file_ptr, *file_addr;
	char *buf_ptr, *buf_addr = NULL;
	int i, j, ios;
	off_t offset_buf[MAX_FILE_IOS];
	unsigned long long latency_start, latency_stop;

	if (posix_memalign((void **)&buf_addr, 4096, buffer_size) != 0) {
		fprintf(stderr, "\nmmap_test: buffer allocation failed!\n");
		exit(1);
	}

	file_addr = mmapfile(filename, file_size, &fd);

	ios = buffer_size / io_size;

	create_offsets(offset_buf, threadnum);

	for (j = 0; j < num_file_ios; j++) {
		buf_ptr = buf_addr;
		file_ptr = file_addr + offset_buf[j];

		for (i = 0; i < ios; i++) {
			rdtscll(latency_start);
			*buf_ptr = *file_ptr;	/* one-byte read faults the page in */
			rdtscll(latency_stop);
			printf("%llu\n", latency_stop - latency_start);
			buf_ptr += io_size;
			file_ptr += io_size;
		}
	}

	close(fd);
	munmap(file_addr, file_size);
	free(buf_addr);

	pthread_exit(NULL);
	return 0;
}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-09 13:30 [TESTCASE] Clean pages clogging the VM Matthew Wilcox
@ 2010-08-17 19:50   ` Matthew Wilcox
  0 siblings, 0 replies; 19+ messages in thread
From: Matthew Wilcox @ 2010-08-17 19:50 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel


No comment on this?  Was it just that I posted it during the VM summit?

On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> 
> This testcase shows some odd behaviour from the Linux VM.
> 
> It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> in it.  Due to the file being entirely sparse, the VM allocates new pages
> and zeroes them.  Initially, it runs very fast, taking on the order of
> 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> and starts doing huge amounts of work trying to figure out which of
> these clean pages to throw away.  In my testing with a 6GB machine 
> and 2.9GHz CPU, one in every 15,000 page faults takes over a second, 
> and one in every 40,000 page faults take over seven seconds!
> 
> This test-case demonstrates a problem that occurs with a read-mostly 
> mmap of a file on very fast media.  I wouldn't like to see a solution
> that special-cases zeroed pages.  I think userspace has done its part
> to tell the kernel what's it's doing by calling madvise(MADV_RANDOM).
> This ought to be enough to hint to the kernel that it should be eagerly
> throwing away pages in this VMA.
> 
> 
> /*
>  * Copyright (c) 2010, Intel Corporation
>  * All rights reserved.
>  *
>  * Redistribution and use in source and binary forms, with or without
>  * modification, are permitted provided that the following conditions are met:
>  *
>  *  * Redistributions of source code must retain the above copyright notice,
>  *    this list of conditions and the following disclaimer.
>  *  * Redistributions in binary form must reproduce the above copyright notice,
>  *    this list of conditions and the following disclaimer in the documentation
>  *    and/or other materials provided with the distribution.
>  *  * Neither the name of Intel Corporation nor the names of its contributors
>  *    may be used to endorse or promote products derived from this software
>  *    without specific prior written permission.
>  *
>  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
>  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
>  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
>  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
>  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
>  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
>  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
>  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>  * POSSIBILITY OF SUCH DAMAGE.
>  */
> 
> #include <assert.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <math.h>
> #include <pthread.h>
> #include <signal.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <unistd.h>
> 
> #define rdtscll(val) do { \
>      unsigned int __a,__d; \
>      asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
>      (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
> } while(0)
> 
> 
> #define MAX_FILE_SIZE	((off_t)1024 * 1024 * 1024 * 1024)
> #define	MAX_FILE_IOS	16384
> #define	MAX_LATENCY	10000000		// usecs
> 
> #define NUM_IOS		1024
> #define IO_SIZE		4096
> #define BUFFER_SIZE	(1024 * 1024)
> 
> pthread_t tid;
> double 	cpu_clock;
> long long unsigned cpu_start, cpu_stop;
> 
> void *mmap_test(void *arg);
> void die ();
> 
> static const char usage_cmds[] =
> "usage: %s [options]\n"
> "cmd line options:\n"
> "    -f	file_name	Read from File named 'file_name'\n"
> "    -a	file_size	File of 'file_size' Bytes/thread\n"
> "    -b	buffer_size	Write/Read into/from buffer of 'buffer_size' Bytes/thread\n"
> "    -n	num_file_ios	Process 'num_file_ios' IOs\n"
> "    -s	io_size		IO Size = 'io_size' Bytes\n"
> "    -l max_latency     Show latency stats based on usecs of max_latency\n"
> ;
> 
> void usage(const char *program)
> {
> 	fprintf(stderr, usage_cmds, program);
> }
> 
> off_t file_size = MAX_FILE_SIZE;	// -a
> long long unsigned int buffer_size = BUFFER_SIZE;	// -b
> char *filename = "sparse-file";			// -f
> int	num_file_ios = NUM_IOS;		// -n
> int	max_latency = MAX_LATENCY;	// -l
> int	io_size = IO_SIZE;		// -s
> long long unsigned int   latency_limit;
> 
> int main(int argc, char **argv)
> {
> 	pthread_attr_t 	attr;
> 	cpu_set_t             mask;
> 	FILE *proc;
> 	char buf[256];
> 	double mhz = 0.0;
> 
> 	while (1) {
> 	    int option = getopt(argc, argv, "a:b:f:h:l:n:p:s:");
> 		if (option == -1) {
> 		    break;
> 		}
> 	    switch (option) {
> 		case 'a':
> 		    file_size = strtoul(optarg, NULL, 0);
> 		    printf("a: file_size:%ld Bytes :%ld MB\n", file_size, file_size/(1024*1024));
> 		    break;
> 		case 'b':
> 		    buffer_size = strtoul(optarg, NULL, 0);
> 		    printf("b: buffer_size:%lld Bytes\n", buffer_size);
> 		    break;
> 		case 'f':
> 	    	    filename = optarg;
> 		    printf("f: filename:%s\n", filename);
> 		    break;
> 		case 'h':
> 		    printf("h: options\n");
> 		    goto help;
> 		case 'l':
> 		    max_latency = strtoul(optarg, NULL, 0);
> 		    printf("l: latency stats based on max latency:%d\n", max_latency);
> 		    break;
> 		case 'n':
> 		    num_file_ios = strtoul(optarg, NULL, 0);
> 		    printf("n: num_file_ios:%d\n", num_file_ios);
> 		    if (num_file_ios > MAX_FILE_IOS) {
> 			printf("-n %d Entered > MAX_FILE_IOS:%d\n", num_file_ios, MAX_FILE_IOS);
> 			exit(1);
> 		    }
> 		    break;
> 		case 's':
> 		    io_size = strtoul(optarg, NULL, 0);
> 		    printf("s: io_size:%d Bytes\n", io_size);
> 		    break;
> 		default:
> 		help:
> 		    usage(argv[0]);
> 		    printf("default:\n");
> 		    exit(1);
> 	    }
> 	}
> 
> 	proc = fopen("/proc/cpuinfo", "r");
> 	if (!proc)
> 		return 0.0;
> 
> 	while (fgets(buf, sizeof buf, proc)) {
> 		double cpu;
> 
> 		if (sscanf(buf, "cpu MHz : %lf", &cpu) != 1)
> 			continue;
> 		if (mhz == 0.0) {
> 			mhz = cpu;
> 			continue;
> 		}
> 		if (mhz != cpu) {
> 			fprintf(stderr,
> 				"Conflicting CPU frequency values: %lf != %lf\n",
> 				mhz, cpu);
> 			return 0.0;
> 		}
> 	}
> 	fclose(proc);
> 	printf("CPU Clock Freq from /proc/cpuinfo:%.4f\n", mhz);
> //
> // Measure CPU Core Frequnecy over 5 second period
> //
> 	printf("Measuring CPU Frequency......:");
> 	rdtscll(cpu_start);
> 	usleep(5000000);
> 	rdtscll(cpu_stop);
> 	cpu_clock = (double)((double)(cpu_stop-cpu_start))/(double)5.0;
> 	printf("%.3f\n", cpu_clock);
> 	latency_limit = (long long unsigned int) (cpu_clock*max_latency/1000000);
> 	printf("latency_limit:%llu cycles or %d usecs\n", latency_limit, max_latency);
> 
> 	pthread_attr_init (&attr);
> 	pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
> 	pthread_attr_setstacksize (&attr, (size_t) (1024*1024));
> 
> 	if (pthread_create(&tid, &attr, mmap_test, (void *)(long) 0) != 0) {
> 		die("Thread create failed!");
> 	}
> 
> 	CPU_ZERO(&mask);
> 	CPU_SET(0, &mask);
> 	if (pthread_setaffinity_np(tid, sizeof(mask), &mask) ) {
> 	 	printf("WARNING: could not set CPU Affinity, exit...\n");
> 	 	exit(1);
> 	}
> 
>         pthread_join(tid, NULL);
>         sleep(1);
> 
> 	return 0;
> }
> 
> 
> void die(char *string)
> {
> 	fprintf(stderr, "\nmmap_test: %s\n", string);
> 	exit(1);
> }
> 
> void *mmapfile(char *fname, off_t size, int *filed)
> {
> 	int fd;
> 	void *file_addr;
> 	struct stat statbuf;
> 
> 	fd = open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
> 	*filed = fd;
> 	if (fd < 0) {
>     		fprintf(stderr, "unable to open %s to get an FD:%s\n", fname, strerror(errno));
> 		exit(1);
> 	}
> 
> 	fstat(fd, &statbuf);
> 	if (statbuf.st_size < size)
> 		ftruncate(fd, size);
> 
> 	file_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> 	if (file_addr == MAP_FAILED) {
>     		fprintf(stderr, "datafile mmap failed: %s\n", strerror(errno));
> 		exit(1);
> 	}
> 
> 	madvise(file_addr, size, MADV_RANDOM);
> 	return file_addr;
> }
> 
> void create_offsets(off_t *offset_buf, int threadnum)
> {
> 	int i, curr_time;
> 
> 	curr_time = time(NULL);
>         srandom(curr_time / (threadnum + 1));
> 
> 	for (i = 0; i < num_file_ios; i++) {
> 		double random1 = ((double)(rand()%(RAND_MAX)) / RAND_MAX);
> 		offset_buf[i] = file_size * random1;
> 		offset_buf[i] = offset_buf[i] / io_size * io_size;
> 	}
> }
> 
> void *mmap_test(void *arg)
> {
> 	int threadnum = (long) arg;
> 	int fd;
> 	char *file_ptr, *file_addr;
> 	char *buf_ptr, *buf_addr = NULL;
> 	int i, j, ios;
> 	off_t offset_buf[MAX_FILE_IOS];
> 	unsigned long long latency_start, latency_stop;
> 
> 	posix_memalign((void *)&buf_addr, 4096, buffer_size);
> 
> 	file_addr = mmapfile(filename, file_size, &fd);
> 
> 	ios = buffer_size/io_size;
> 
> 	create_offsets(offset_buf, threadnum);
> 
> 	for (j = 0; j < num_file_ios; j++) {
> 		buf_ptr = buf_addr;
> 		file_ptr = file_addr + offset_buf[j];
>  
> 		for (i = 0; i < ios; i++) {
> 			rdtscll(latency_start);
> 			*buf_ptr = *(char *)file_ptr;
> 			rdtscll(latency_stop);
> 			printf("%lld\n", latency_stop - latency_start);
> 			buf_ptr += io_size;
> 			file_ptr += io_size;
> 		}
> 	}
> 
> 	close(fd);
> 	munmap(file_addr, file_size);
> 	free(buf_addr);
> 
> 	pthread_exit(NULL);
> 	return 0;
> }

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-17 19:50   ` Matthew Wilcox
  0 siblings, 0 replies; 19+ messages in thread
From: Matthew Wilcox @ 2010-08-17 19:50 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel


No comment on this?  Was it just that I posted it during the VM summit?

On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> 
> This testcase shows some odd behaviour from the Linux VM.
> 
> It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> in it.  Due to the file being entirely sparse, the VM allocates new pages
> and zeroes them.  Initially, it runs very fast, taking on the order of
> 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> and starts doing huge amounts of work trying to figure out which of
> these clean pages to throw away.  In my testing with a 6GB machine 
> and 2.9GHz CPU, one in every 15,000 page faults takes over a second, 
> and one in every 40,000 page faults take over seven seconds!
> 
> This test-case demonstrates a problem that occurs with a read-mostly 
> mmap of a file on very fast media.  I wouldn't like to see a solution
> that special-cases zeroed pages.  I think userspace has done its part
> to tell the kernel what's it's doing by calling madvise(MADV_RANDOM).
> This ought to be enough to hint to the kernel that it should be eagerly
> throwing away pages in this VMA.
> 
> 
> /*
>  * Copyright (c) 2010, Intel Corporation
>  * All rights reserved.
>  *
>  * Redistribution and use in source and binary forms, with or without
>  * modification, are permitted provided that the following conditions are met:
>  *
>  *  * Redistributions of source code must retain the above copyright notice,
>  *    this list of conditions and the following disclaimer.
>  *  * Redistributions in binary form must reproduce the above copyright notice,
>  *    this list of conditions and the following disclaimer in the documentation
>  *    and/or other materials provided with the distribution.
>  *  * Neither the name of Intel Corporation nor the names of its contributors
>  *    may be used to endorse or promote products derived from this software
>  *    without specific prior written permission.
>  *
>  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
>  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
>  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
>  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
>  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
>  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
>  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
>  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>  * POSSIBILITY OF SUCH DAMAGE.
>  */
> 
> #include <assert.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <math.h>
> #include <pthread.h>
> #include <signal.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <unistd.h>
> 
> #define rdtscll(val) do { \
>      unsigned int __a,__d; \
>      asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
>      (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
> } while(0)
> 
> 
> #define MAX_FILE_SIZE	((off_t)1024 * 1024 * 1024 * 1024)
> #define	MAX_FILE_IOS	16384
> #define	MAX_LATENCY	10000000		// usecs
> 
> #define NUM_IOS		1024
> #define IO_SIZE		4096
> #define BUFFER_SIZE	(1024 * 1024)
> 
> pthread_t tid;
> double 	cpu_clock;
> long long unsigned cpu_start, cpu_stop;
> 
> void *mmap_test(void *arg);
> void die ();
> 
> static const char usage_cmds[] =
> "usage: %s [options]\n"
> "cmd line options:\n"
> "    -f	file_name	Read from File named 'file_name'\n"
> "    -a	file_size	File of 'file_size' Bytes/thread\n"
> "    -b	buffer_size	Write/Read into/from buffer of 'buffer_size' Bytes/thread\n"
> "    -n	num_file_ios	Process 'num_file_ios' IOs\n"
> "    -s	io_size		IO Size = 'io_size' Bytes\n"
> "    -l max_latency     Show latency stats based on usecs of max_latency\n"
> ;
> 
> void usage(const char *program)
> {
> 	fprintf(stderr, usage_cmds, program);
> }
> 
> off_t file_size = MAX_FILE_SIZE;	// -a
> long long unsigned int buffer_size = BUFFER_SIZE;	// -b
> char *filename = "sparse-file";			// -f
> int	num_file_ios = NUM_IOS;		// -n
> int	max_latency = MAX_LATENCY;	// -l
> int	io_size = IO_SIZE;		// -s
> long long unsigned int   latency_limit;
> 
> int main(int argc, char **argv)
> {
> 	pthread_attr_t 	attr;
> 	cpu_set_t             mask;
> 	FILE *proc;
> 	char buf[256];
> 	double mhz = 0.0;
> 
> 	while (1) {
> 	    int option = getopt(argc, argv, "a:b:f:h:l:n:p:s:");
> 		if (option == -1) {
> 		    break;
> 		}
> 	    switch (option) {
> 		case 'a':
> 		    file_size = strtoul(optarg, NULL, 0);
> 		    printf("a: file_size:%ld Bytes :%ld MB\n", file_size, file_size/(1024*1024));
> 		    break;
> 		case 'b':
> 		    buffer_size = strtoul(optarg, NULL, 0);
> 		    printf("b: buffer_size:%lld Bytes\n", buffer_size);
> 		    break;
> 		case 'f':
> 	    	    filename = optarg;
> 		    printf("f: filename:%s\n", filename);
> 		    break;
> 		case 'h':
> 		    printf("h: options\n");
> 		    goto help;
> 		case 'l':
> 		    max_latency = strtoul(optarg, NULL, 0);
> 		    printf("l: latency stats based on max latency:%d\n", max_latency);
> 		    break;
> 		case 'n':
> 		    num_file_ios = strtoul(optarg, NULL, 0);
> 		    printf("n: num_file_ios:%d\n", num_file_ios);
> 		    if (num_file_ios > MAX_FILE_IOS) {
> 			printf("-n %d Entered > MAX_FILE_IOS:%d\n", num_file_ios, MAX_FILE_IOS);
> 			exit(1);
> 		    }
> 		    break;
> 		case 's':
> 		    io_size = strtoul(optarg, NULL, 0);
> 		    printf("s: io_size:%d Bytes\n", io_size);
> 		    break;
> 		default:
> 		help:
> 		    usage(argv[0]);
> 		    printf("default:\n");
> 		    exit(1);
> 	    }
> 	}
> 
> 	proc = fopen("/proc/cpuinfo", "r");
> 	if (!proc)
> 		return 0.0;
> 
> 	while (fgets(buf, sizeof buf, proc)) {
> 		double cpu;
> 
> 		if (sscanf(buf, "cpu MHz : %lf", &cpu) != 1)
> 			continue;
> 		if (mhz == 0.0) {
> 			mhz = cpu;
> 			continue;
> 		}
> 		if (mhz != cpu) {
> 			fprintf(stderr,
> 				"Conflicting CPU frequency values: %lf != %lf\n",
> 				mhz, cpu);
> 			return 0.0;
> 		}
> 	}
> 	fclose(proc);
> 	printf("CPU Clock Freq from /proc/cpuinfo:%.4f\n", mhz);
> //
> // Measure CPU Core Frequnecy over 5 second period
> //
> 	printf("Measuring CPU Frequency......:");
> 	rdtscll(cpu_start);
> 	usleep(5000000);
> 	rdtscll(cpu_stop);
> 	cpu_clock = (double)((double)(cpu_stop-cpu_start))/(double)5.0;
> 	printf("%.3f\n", cpu_clock);
> 	latency_limit = (long long unsigned int) (cpu_clock*max_latency/1000000);
> 	printf("latency_limit:%llu cycles or %d usecs\n", latency_limit, max_latency);
> 
> 	pthread_attr_init (&attr);
> 	pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
> 	pthread_attr_setstacksize (&attr, (size_t) (1024*1024));
> 
> 	if (pthread_create(&tid, &attr, mmap_test, (void *)(long) 0) != 0) {
> 		die("Thread create failed!");
> 	}
> 
> 	CPU_ZERO(&mask);
> 	CPU_SET(0, &mask);
> 	if (pthread_setaffinity_np(tid, sizeof(mask), &mask) ) {
> 	 	printf("WARNING: could not set CPU Affinity, exit...\n");
> 	 	exit(1);
> 	}
> 
>         pthread_join(tid, NULL);
>         sleep(1);
> 
> 	return 0;
> }
> 
> 
> void die(char *string)
> {
> 	fprintf(stderr, "\nmmap_test: %s\n", string);
> 	exit(1);
> }
> 
> void *mmapfile(char *fname, off_t size, int *filed)
> {
> 	int fd;
> 	void *file_addr;
> 	struct stat statbuf;
> 
> 	fd = open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
> 	*filed = fd;
> 	if (fd < 0) {
>     		fprintf(stderr, "unable to open %s to get an FD:%s\n", fname, strerror(errno));
> 		exit(1);
> 	}
> 
> 	fstat(fd, &statbuf);
> 	if (statbuf.st_size < size)
> 		ftruncate(fd, size);
> 
> 	file_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> 	if (file_addr == MAP_FAILED) {
>     		fprintf(stderr, "datafile mmap failed: %s\n", strerror(errno));
> 		exit(1);
> 	}
> 
> 	madvise(file_addr, size, MADV_RANDOM);
> 	return file_addr;
> }
> 
> void create_offsets(off_t *offset_buf, int threadnum)
> {
> 	int i, curr_time;
> 
> 	curr_time = time(NULL);
>         srandom(curr_time / (threadnum + 1));
> 
> 	for (i = 0; i < num_file_ios; i++) {
> 		double random1 = ((double)(rand()%(RAND_MAX)) / RAND_MAX);
> 		offset_buf[i] = file_size * random1;
> 		offset_buf[i] = offset_buf[i] / io_size * io_size;
> 	}
> }
> 
> void *mmap_test(void *arg)
> {
> 	int threadnum = (long) arg;
> 	int fd;
> 	char *file_ptr, *file_addr;
> 	char *buf_ptr, *buf_addr = NULL;
> 	int i, j, ios;
> 	off_t offset_buf[MAX_FILE_IOS];
> 	unsigned long long latency_start, latency_stop;
> 
> 	posix_memalign((void *)&buf_addr, 4096, buffer_size);
> 
> 	file_addr = mmapfile(filename, file_size, &fd);
> 
> 	ios = buffer_size/io_size;
> 
> 	create_offsets(offset_buf, threadnum);
> 
> 	for (j = 0; j < num_file_ios; j++) {
> 		buf_ptr = buf_addr;
> 		file_ptr = file_addr + offset_buf[j];
>  
> 		for (i = 0; i < ios; i++) {
> 			rdtscll(latency_start);
> 			*buf_ptr = *(char *)file_ptr;
> 			rdtscll(latency_stop);
> 			printf("%lld\n", latency_stop - latency_start);
> 			buf_ptr += io_size;
> 			file_ptr += io_size;
> 		}
> 	}
> 
> 	close(fd);
> 	munmap(file_addr, file_size);
> 	free(buf_addr);
> 
> 	pthread_exit(NULL);
> 	return 0;
> }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-17 19:50   ` Matthew Wilcox
@ 2010-08-18 14:13     ` Johannes Weiner
  -1 siblings, 0 replies; 19+ messages in thread
From: Johannes Weiner @ 2010-08-18 14:13 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-mm, linux-kernel

Hi Matthew,

On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> 
> No comment on this?  Was it just that I posted it during the VM summit?

I have not forgotten about it.  I just have a hard time reproducing
those extreme stalls you observed.

Running that test on a 2.5GHz machine with 2G of memory gives me
stalls of up to half a second.  The patchset I am experimenting with
gets me down to peaks of 70ms, but it needs further work.

Mapped file pages get two rounds on the LRU list, so once the VM
starts scanning, it has to go through all of them twice and can only
reclaim them on the second encounter.

At that point, since we scan without making progress, we start waiting
for IO, which is not happening in this case, so we sit there until a
timeout expires.

This stupid-waiting can be improved, and I am working on that.  But
since I can not reproduce your observations, I don't know if this is
the (sole) source of the problem.  Can I send you patches?

> On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > 
> > This testcase shows some odd behaviour from the Linux VM.
> > 
> > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > and zeroes them.  Initially, it runs very fast, taking on the order of
> > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > and starts doing huge amounts of work trying to figure out which of
> > these clean pages to throw away.

This is similar to one of my test cases for:

	6457474 vmscan: detect mapped file pages used only once
	31c0569 vmscan: drop page_mapping_inuse()
	dfc8d63 vmscan: factor out page reference checks

because the situation was even worse before (see the series
description in dfc8d63).  Maybe asking the obvious, but the kernel you
tested on did include those commits, right?

And just to be sure, I sent you a test-patch to disable the used-once
detection on IRC the other day.  Did you have time to run it yet?
Here it is again:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c7e57c..c757bba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
+		return PAGEREF_ACTIVATE;
 		if (PageAnon(page))
 			return PAGEREF_ACTIVATE;
 		/*


> > In my testing with a 6GB machine and 2.9GHz CPU, one in every
> > 15,000 page faults takes over a second, and one in every 40,000
> > page faults take over seven seconds!
> > 
> > This test-case demonstrates a problem that occurs with a read-mostly 
> > mmap of a file on very fast media.  I wouldn't like to see a solution
> > that special-cases zeroed pages.  I think userspace has done its part
> > to tell the kernel what's it's doing by calling madvise(MADV_RANDOM).
> > This ought to be enough to hint to the kernel that it should be eagerly
> > throwing away pages in this VMA.

We can probably do something like the following, but I am not sure
this is a good fix, either.  How many applications are using
madvise()?

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -495,7 +495,7 @@ int page_referenced_one(struct page *pag
 		 * mapping is already gone, the unmap path will have
 		 * set PG_referenced or activated the page.
 		 */
-		if (likely(!VM_SequentialReadHint(vma)))
+		if (likely(!(vma->vm_flags & (VM_SEQ_READ|VM_RAND_READ))))
 			referenced++;
 	}
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-18 14:13     ` Johannes Weiner
  0 siblings, 0 replies; 19+ messages in thread
From: Johannes Weiner @ 2010-08-18 14:13 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-mm, linux-kernel

Hi Matthew,

On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> 
> No comment on this?  Was it just that I posted it during the VM summit?

I have not forgotten about it.  I just have a hard time reproducing
those extreme stalls you observed.

Running that test on a 2.5GHz machine with 2G of memory gives me
stalls of up to half a second.  The patchset I am experimenting with
gets me down to peaks of 70ms, but it needs further work.

Mapped file pages get two rounds on the LRU list, so once the VM
starts scanning, it has to go through all of them twice and can only
reclaim them on the second encounter.

At that point, since we scan without making progress, we start waiting
for IO, which is not happening in this case, so we sit there until a
timeout expires.

This stupid-waiting can be improved, and I am working on that.  But
since I can not reproduce your observations, I don't know if this is
the (sole) source of the problem.  Can I send you patches?

> On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > 
> > This testcase shows some odd behaviour from the Linux VM.
> > 
> > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > and zeroes them.  Initially, it runs very fast, taking on the order of
> > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > and starts doing huge amounts of work trying to figure out which of
> > these clean pages to throw away.

This is similar to one of my test cases for:

	6457474 vmscan: detect mapped file pages used only once
	31c0569 vmscan: drop page_mapping_inuse()
	dfc8d63 vmscan: factor out page reference checks

because the situation was even worse before (see the series
description in dfc8d63).  Maybe asking the obvious, but the kernel you
tested on did include those commits, right?

And just to be sure, I sent you a test-patch to disable the used-once
detection on IRC the other day.  Did you have time to run it yet?
Here it is again:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c7e57c..c757bba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
+		return PAGEREF_ACTIVATE;
 		if (PageAnon(page))
 			return PAGEREF_ACTIVATE;
 		/*


> > In my testing with a 6GB machine and 2.9GHz CPU, one in every
> > 15,000 page faults takes over a second, and one in every 40,000
> > page faults take over seven seconds!
> > 
> > This test-case demonstrates a problem that occurs with a read-mostly 
> > mmap of a file on very fast media.  I wouldn't like to see a solution
> > that special-cases zeroed pages.  I think userspace has done its part
> > to tell the kernel what it's doing by calling madvise(MADV_RANDOM).
> > This ought to be enough to hint to the kernel that it should be eagerly
> > throwing away pages in this VMA.

We can probably do something like the following, but I am not sure
this is a good fix, either.  How many applications are using
madvise()?

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -495,7 +495,7 @@ int page_referenced_one(struct page *pag
 		 * mapping is already gone, the unmap path will have
 		 * set PG_referenced or activated the page.
 		 */
-		if (likely(!VM_SequentialReadHint(vma)))
+		if (likely(!(vma->vm_flags & (VM_SEQ_READ|VM_RAND_READ))))
 			referenced++;
 	}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
       [not found]     ` <20100818160613.GE9431@localhost>
@ 2010-08-18 16:07         ` Wu Fengguang
  0 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-18 16:07 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua

On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > Hi Matthew,
> > 
> > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > 
> > > No comment on this?  Was it just that I posted it during the VM summit?
> > 
> > I have not forgotten about it.  I just have a hard time reproducing
> > those extreme stalls you observed.
> > 
> > Running that test on a 2.5GHz machine with 2G of memory gives me
> > stalls of up to half a second.  The patchset I am experimenting with
> > gets me down to peaks of 70ms, but it needs further work.
> > 
> > Mapped file pages get two rounds on the LRU list, so once the VM
> > starts scanning, it has to go through all of them twice and can only
> > reclaim them on the second encounter.
> > 
> > At that point, since we scan without making progress, we start waiting
> > for IO, which is not happening in this case, so we sit there until a
> > timeout expires.
> 
> Right, this could lead to some 1s stall. Shaohua and I also noticed
> this when investigating the responsiveness issues. And we are wondering
> if it makes sense to do congestion_wait() only when the bdi is really
> congested? There is no IO underway anyway in this case.
> 
> > This stupid-waiting can be improved, and I am working on that.  But
> 
> Yeah, stupid waiting :)
> 
> > since I can not reproduce your observations, I don't know if this is
> > the (sole) source of the problem.  Can I send you patches?
> 
> Sure.
> 
> > > On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > > > 
> > > > This testcase shows some odd behaviour from the Linux VM.
> > > > 
> > > > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > > > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > > > and zeroes them.  Initially, it runs very fast, taking on the order of
> > > > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > > > and starts doing huge amounts of work trying to figure out which of
> > > > these clean pages to throw away.
> > 
> > This is similar to one of my test cases for:
> > 
> > 	6457474 vmscan: detect mapped file pages used only once
> > 	31c0569 vmscan: drop page_mapping_inuse()
> > 	dfc8d63 vmscan: factor out page reference checks
> > 
> > because the situation was even worse before (see the series
> > description in dfc8d63).  Maybe asking the obvious, but the kernel you
> > tested on did include those commits, right?
> > 
> > And just to be sure, I sent you a test-patch to disable the used-once
> > detection on IRC the other day.  Did you have time to run it yet?
> > Here it is again:
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9c7e57c..c757bba 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
> >  		return PAGEREF_RECLAIM;
> >  
> >  	if (referenced_ptes) {
> > +		return PAGEREF_ACTIVATE;
> 
> How come page activation helps?
> 
> >  		if (PageAnon(page))
> >  			return PAGEREF_ACTIVATE;
> >  		/*
> > 
> > 
> > > > In my testing with a 6GB machine and 2.9GHz CPU, one in every
> > > > 15,000 page faults takes over a second, and one in every 40,000
> > > > page faults takes over seven seconds!
> > > > 
> > > > This test-case demonstrates a problem that occurs with a read-mostly 
> > > > mmap of a file on very fast media.  I wouldn't like to see a solution
> > > > that special-cases zeroed pages.  I think userspace has done its part
> > > > to tell the kernel what it's doing by calling madvise(MADV_RANDOM).
> > > > This ought to be enough to hint to the kernel that it should be eagerly
> > > > throwing away pages in this VMA.
> > 
> > We can probably do something like the following, but I am not sure
> > this is a good fix, either.  How many applications are using
> > madvise()?
> 
> Heh, it sounds crazy to rip random read pages, though it does help to
> produce a FAST test case.
> 
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -495,7 +495,7 @@ int page_referenced_one(struct page *pag
> >  		 * mapping is already gone, the unmap path will have
> >  		 * set PG_referenced or activated the page.
> >  		 */
> > -		if (likely(!VM_SequentialReadHint(vma)))
> > +		if (likely(!(vma->vm_flags & (VM_SEQ_READ|VM_RAND_READ))))
> >  			referenced++;
> >  	}
> 
> Thanks,
> Fengguang
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-18 16:07         ` Wu Fengguang
  0 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-18 16:07 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua

On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > Hi Matthew,
> > 
> > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > 
> > > No comment on this?  Was it just that I posted it during the VM summit?
> > 
> > I have not forgotten about it.  I just have a hard time reproducing
> > those extreme stalls you observed.
> > 
> > Running that test on a 2.5GHz machine with 2G of memory gives me
> > stalls of up to half a second.  The patchset I am experimenting with
> > gets me down to peaks of 70ms, but it needs further work.
> > 
> > Mapped file pages get two rounds on the LRU list, so once the VM
> > starts scanning, it has to go through all of them twice and can only
> > reclaim them on the second encounter.
> > 
> > At that point, since we scan without making progress, we start waiting
> > for IO, which is not happening in this case, so we sit there until a
> > timeout expires.
> 
> Right, this could lead to some 1s stall. Shaohua and I also noticed
> this when investigating the responsiveness issues. And we are wondering
> if it makes sense to do congestion_wait() only when the bdi is really
> congested? There is no IO underway anyway in this case.
> 
> > This stupid-waiting can be improved, and I am working on that.  But
> 
> Yeah, stupid waiting :)
> 
> > since I can not reproduce your observations, I don't know if this is
> > the (sole) source of the problem.  Can I send you patches?
> 
> Sure.
> 
> > > On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > > > 
> > > > This testcase shows some odd behaviour from the Linux VM.
> > > > 
> > > > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > > > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > > > and zeroes them.  Initially, it runs very fast, taking on the order of
> > > > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > > > and starts doing huge amounts of work trying to figure out which of
> > > > these clean pages to throw away.
> > 
> > This is similar to one of my test cases for:
> > 
> > 	6457474 vmscan: detect mapped file pages used only once
> > 	31c0569 vmscan: drop page_mapping_inuse()
> > 	dfc8d63 vmscan: factor out page reference checks
> > 
> > because the situation was even worse before (see the series
> > description in dfc8d63).  Maybe asking the obvious, but the kernel you
> > tested on did include those commits, right?
> > 
> > And just to be sure, I sent you a test-patch to disable the used-once
> > detection on IRC the other day.  Did you have time to run it yet?
> > Here it is again:
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9c7e57c..c757bba 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
> >  		return PAGEREF_RECLAIM;
> >  
> >  	if (referenced_ptes) {
> > +		return PAGEREF_ACTIVATE;
> 
> How come page activation helps?
> 
> >  		if (PageAnon(page))
> >  			return PAGEREF_ACTIVATE;
> >  		/*
> > 
> > 
> > > > In my testing with a 6GB machine and 2.9GHz CPU, one in every
> > > > 15,000 page faults takes over a second, and one in every 40,000
> > > > page faults takes over seven seconds!
> > > > 
> > > > This test-case demonstrates a problem that occurs with a read-mostly 
> > > > mmap of a file on very fast media.  I wouldn't like to see a solution
> > > > that special-cases zeroed pages.  I think userspace has done its part
> > > > to tell the kernel what it's doing by calling madvise(MADV_RANDOM).
> > > > This ought to be enough to hint to the kernel that it should be eagerly
> > > > throwing away pages in this VMA.
> > 
> > We can probably do something like the following, but I am not sure
> > this is a good fix, either.  How many applications are using
> > madvise()?
> 
> Heh, it sounds crazy to rip random read pages, though it does help to
> produce a FAST test case.
> 
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -495,7 +495,7 @@ int page_referenced_one(struct page *pag
> >  		 * mapping is already gone, the unmap path will have
> >  		 * set PG_referenced or activated the page.
> >  		 */
> > -		if (likely(!VM_SequentialReadHint(vma)))
> > +		if (likely(!(vma->vm_flags & (VM_SEQ_READ|VM_RAND_READ))))
> >  			referenced++;
> >  	}
> 
> Thanks,
> Fengguang
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-18 14:13     ` Johannes Weiner
@ 2010-08-18 21:26       ` Wu Fengguang
  -1 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-18 21:26 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua, Rik van Riel

> Mapped file pages get two rounds on the LRU list, so once the VM
> starts scanning, it has to go through all of them twice and can only
> reclaim them on the second encounter.

This can be fixed gracefully based on Rik's refault-distance patch :)
With the distance info we can safely drop the use-once mapped file pages.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-18 21:26       ` Wu Fengguang
  0 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-18 21:26 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua, Rik van Riel

> Mapped file pages get two rounds on the LRU list, so once the VM
> starts scanning, it has to go through all of them twice and can only
> reclaim them on the second encounter.

This can be fixed gracefully based on Rik's refault-distance patch :)
With the distance info we can safely drop the use-once mapped file pages.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-18 16:07         ` Wu Fengguang
@ 2010-08-19  1:42           ` Shaohua Li
  -1 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2010-08-19  1:42 UTC (permalink / raw)
  To: Wu, Fengguang; +Cc: Johannes Weiner, Matthew Wilcox, linux-mm, linux-kernel

On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu, Fengguang wrote:
> On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > Hi Matthew,
> > > 
> > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > 
> > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > 
> > > I have not forgotten about it.  I just have a hard time reproducing
> > > those extreme stalls you observed.
> > > 
> > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > stalls of up to half a second.  The patchset I am experimenting with
> > > gets me down to peaks of 70ms, but it needs further work.
> > > 
> > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > starts scanning, it has to go through all of them twice and can only
> > > reclaim them on the second encounter.
> > > 
> > > At that point, since we scan without making progress, we start waiting
> > > for IO, which is not happening in this case, so we sit there until a
> > > timeout expires.
> > 
> > Right, this could lead to some 1s stall. Shaohua and I also noticed
> > this when investigating the responsiveness issues. And we are wondering
> > if it makes sense to do congestion_wait() only when the bdi is really
> > congested? There is no IO underway anyway in this case.
> > 
> > > This stupid-waiting can be improved, and I am working on that.  But
> > 
> > Yeah, stupid waiting :)
How about this one?


Subject: mm: check device is really congested before sleep in direct page reclaim

congestion_wait() blindly sleeps without checking whether the device is really
congested.  In a workload without any writes, it can cause direct page reclaim
to sleep for 100ms without helping page reclaim at all.
There might be other places that call congestion_wait() and need to check
whether the device is really congested, but I can't audit them all, so this
just changes the direct page reclaim code path.  The new
congestion_wait_check() will make sure at least one device is congested
before going to sleep.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 include/linux/backing-dev.h |    1 +
 mm/backing-dev.c            |   14 ++++++++++++--
 mm/vmscan.c                 |    2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c	2010-08-18 16:41:04.000000000 +0800
+++ linux/mm/backing-dev.c	2010-08-19 08:59:14.000000000 +0800
@@ -725,13 +725,16 @@ static wait_queue_head_t congestion_wqh[
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
 
+static atomic_t nr_congested_bdi[2];
+
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_congested_bdi[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -743,7 +746,8 @@ void set_bdi_congested(struct backing_de
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_congested_bdi[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -769,3 +773,9 @@ long congestion_wait(int sync, long time
 }
 EXPORT_SYMBOL(congestion_wait);
 
+long congestion_wait_check(int sync, long timeout)
+{
+	if (atomic_read(&nr_congested_bdi[sync]) == 0)
+		return 0;
+	return congestion_wait(sync, timeout);
+}
Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h	2010-08-18 16:41:04.000000000 +0800
+++ linux/include/linux/backing-dev.h	2010-08-18 16:41:23.000000000 +0800
@@ -285,6 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
+long congestion_wait_check(int sync, long timeout);
 
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2010-08-18 16:41:04.000000000 +0800
+++ linux/mm/vmscan.c	2010-08-18 16:41:23.000000000 +0800
@@ -1910,7 +1910,7 @@ static unsigned long do_try_to_free_page
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
 		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+			congestion_wait_check(BLK_RW_ASYNC, HZ/10);
 	}
 
 out:

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-19  1:42           ` Shaohua Li
  0 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2010-08-19  1:42 UTC (permalink / raw)
  To: Wu, Fengguang; +Cc: Johannes Weiner, Matthew Wilcox, linux-mm, linux-kernel

On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu, Fengguang wrote:
> On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > Hi Matthew,
> > > 
> > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > 
> > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > 
> > > I have not forgotten about it.  I just have a hard time reproducing
> > > those extreme stalls you observed.
> > > 
> > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > stalls of up to half a second.  The patchset I am experimenting with
> > > gets me down to peaks of 70ms, but it needs further work.
> > > 
> > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > starts scanning, it has to go through all of them twice and can only
> > > reclaim them on the second encounter.
> > > 
> > > At that point, since we scan without making progress, we start waiting
> > > for IO, which is not happening in this case, so we sit there until a
> > > timeout expires.
> > 
> > Right, this could lead to some 1s stall. Shaohua and I also noticed
> > this when investigating the responsiveness issues. And we are wondering
> > if it makes sense to do congestion_wait() only when the bdi is really
> > congested? There is no IO underway anyway in this case.
> > 
> > > This stupid-waiting can be improved, and I am working on that.  But
> > 
> > Yeah, stupid waiting :)
How about this one?


Subject: mm: check device is really congested before sleep in direct page reclaim

congestion_wait() blindly sleeps without checking whether the device is really
congested.  In a workload without any writes, it can cause direct page reclaim
to sleep for 100ms without helping page reclaim at all.
There might be other places that call congestion_wait() and need to check
whether the device is really congested, but I can't audit them all, so this
just changes the direct page reclaim code path.  The new
congestion_wait_check() will make sure at least one device is congested
before going to sleep.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 include/linux/backing-dev.h |    1 +
 mm/backing-dev.c            |   14 ++++++++++++--
 mm/vmscan.c                 |    2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

Index: linux/mm/backing-dev.c
===================================================================
--- linux.orig/mm/backing-dev.c	2010-08-18 16:41:04.000000000 +0800
+++ linux/mm/backing-dev.c	2010-08-19 08:59:14.000000000 +0800
@@ -725,13 +725,16 @@ static wait_queue_head_t congestion_wqh[
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
 
+static atomic_t nr_congested_bdi[2];
+
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_congested_bdi[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -743,7 +746,8 @@ void set_bdi_congested(struct backing_de
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_congested_bdi[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -769,3 +773,9 @@ long congestion_wait(int sync, long time
 }
 EXPORT_SYMBOL(congestion_wait);
 
+long congestion_wait_check(int sync, long timeout)
+{
+	if (atomic_read(&nr_congested_bdi[sync]) == 0)
+		return 0;
+	return congestion_wait(sync, timeout);
+}
Index: linux/include/linux/backing-dev.h
===================================================================
--- linux.orig/include/linux/backing-dev.h	2010-08-18 16:41:04.000000000 +0800
+++ linux/include/linux/backing-dev.h	2010-08-18 16:41:23.000000000 +0800
@@ -285,6 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
+long congestion_wait_check(int sync, long timeout);
 
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2010-08-18 16:41:04.000000000 +0800
+++ linux/mm/vmscan.c	2010-08-18 16:41:23.000000000 +0800
@@ -1910,7 +1910,7 @@ static unsigned long do_try_to_free_page
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
 		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+			congestion_wait_check(BLK_RW_ASYNC, HZ/10);
 	}
 
 out:

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-18 14:13     ` Johannes Weiner
@ 2010-08-19  9:18       ` KOSAKI Motohiro
  -1 siblings, 0 replies; 19+ messages in thread
From: KOSAKI Motohiro @ 2010-08-19  9:18 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: kosaki.motohiro, Matthew Wilcox, linux-mm, linux-kernel

> Hi Matthew,
> 
> On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > 
> > No comment on this?  Was it just that I posted it during the VM summit?
> 
> I have not forgotten about it.  I just have a hard time reproducing
> those extreme stalls you observed.

me too.
I never forgot this one, but...

I'll try this again next week.

Thanks.




^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-19  9:18       ` KOSAKI Motohiro
  0 siblings, 0 replies; 19+ messages in thread
From: KOSAKI Motohiro @ 2010-08-19  9:18 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: kosaki.motohiro, Matthew Wilcox, linux-mm, linux-kernel

> Hi Matthew,
> 
> On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > 
> > No comment on this?  Was it just that I posted it during the VM summit?
> 
> I have not forgotten about it.  I just have a hard time reproducing
> those extreme stalls you observed.

me too.
I never forgot this one, but...

I'll try this again next week.

Thanks.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-18 16:07         ` Wu Fengguang
@ 2010-08-19 11:51           ` Johannes Weiner
  -1 siblings, 0 replies; 19+ messages in thread
From: Johannes Weiner @ 2010-08-19 11:51 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua

On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu Fengguang wrote:
> On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > Hi Matthew,
> > > 
> > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > 
> > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > 
> > > I have not forgotten about it.  I just have a hard time reproducing
> > > those extreme stalls you observed.
> > > 
> > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > stalls of up to half a second.  The patchset I am experimenting with
> > > gets me down to peaks of 70ms, but it needs further work.
> > > 
> > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > starts scanning, it has to go through all of them twice and can only
> > > reclaim them on the second encounter.
> > > 
> > > At that point, since we scan without making progress, we start waiting
> > > for IO, which is not happening in this case, so we sit there until a
> > > timeout expires.
> > 
> > Right, this could lead to some 1s stall. Shaohua and I also noticed
> > this when investigating the responsiveness issues. And we are wondering
> > if it makes sense to do congestion_wait() only when the bdi is really
> > congested? There is no IO underway anyway in this case.

I am currently trying to get rid of all the congestion_wait() in the VM.
They are used for different purposes, so they need different replacement
mechanisms.

I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
think that congestion is not a good predicate in the first place.  Why
would the VM care about IO _congestion_?  It needs a bunch of pages to
complete IO, whether the writing device is congested is not really
useful information at this point, I think.

> > > since I can not reproduce your observations, I don't know if this is
> > > the (sole) source of the problem.  Can I send you patches?
> > 
> > Sure.

Cool!

> > > > On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > > > > 
> > > > > This testcase shows some odd behaviour from the Linux VM.
> > > > > 
> > > > > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > > > > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > > > > and zeroes them.  Initially, it runs very fast, taking on the order of
> > > > > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > > > > and starts doing huge amounts of work trying to figure out which of
> > > > > these clean pages to throw away.
> > > 
> > > This is similar to one of my test cases for:
> > > 
> > > 	6457474 vmscan: detect mapped file pages used only once
> > > 	31c0569 vmscan: drop page_mapping_inuse()
> > > 	dfc8d63 vmscan: factor out page reference checks
> > > 
> > > because the situation was even worse before (see the series
> > > description in dfc8d63).  Maybe asking the obvious, but the kernel you
> > > tested on did include those commits, right?
> > > 
> > > And just to be sure, I sent you a test-patch to disable the used-once
> > > detection on IRC the other day.  Did you have time to run it yet?
> > > Here it is again:
> > > 
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 9c7e57c..c757bba 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
> > >  		return PAGEREF_RECLAIM;
> > >  
> > >  	if (referenced_ptes) {
> > > +		return PAGEREF_ACTIVATE;
> > 
> > How come page activation helps?

This is effectively disabling used-once detection and going back to the old
VM behaviour.  I don't think it helps, but this code is recent and directly
related to the test-case.  Maybe I/we missed something, it can't hurt to
make sure, right?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-19 11:51           ` Johannes Weiner
  0 siblings, 0 replies; 19+ messages in thread
From: Johannes Weiner @ 2010-08-19 11:51 UTC (permalink / raw)
  To: Wu Fengguang; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li Shaohua

On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu Fengguang wrote:
> On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > Hi Matthew,
> > > 
> > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > 
> > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > 
> > > I have not forgotten about it.  I just have a hard time reproducing
> > > those extreme stalls you observed.
> > > 
> > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > stalls of up to half a second.  The patchset I am experimenting with
> > > gets me down to peaks of 70ms, but it needs further work.
> > > 
> > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > starts scanning, it has to go through all of them twice and can only
> > > reclaim them on the second encounter.
> > > 
> > > At that point, since we scan without making progress, we start waiting
> > > for IO, which is not happening in this case, so we sit there until a
> > > timeout expires.
> > 
> > Right, this could lead to some 1s stall. Shaohua and I also noticed
> > this when investigating the responsiveness issues. And we are wondering
> > if it makes sense to do congestion_wait() only when the bdi is really
> > congested? There is no IO underway anyway in this case.

I am currently trying to get rid of all the congestion_wait() in the VM.
They are used for different purposes, so they need different replacement
mechanisms.

I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
think that congestion is not a good predicate in the first place.  Why
would the VM care about IO _congestion_?  It needs a bunch of pages to
complete IO, whether the writing device is congested is not really
useful information at this point, I think.

> > > since I can not reproduce your observations, I don't know if this is
> > > the (sole) source of the problem.  Can I send you patches?
> > 
> > Sure.

Cool!

> > > > On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
> > > > > 
> > > > > This testcase shows some odd behaviour from the Linux VM.
> > > > > 
> > > > > It creates a 1TB sparse file, mmaps it, and randomly reads locations 
> > > > > in it.  Due to the file being entirely sparse, the VM allocates new pages
> > > > > and zeroes them.  Initially, it runs very fast, taking on the order of
> > > > > 2.7 to 4us per page fault.  Eventually, the VM runs out of free pages,
> > > > > and starts doing huge amounts of work trying to figure out which of
> > > > > these clean pages to throw away.
> > > 
> > > This is similar to one of my test cases for:
> > > 
> > > 	6457474 vmscan: detect mapped file pages used only once
> > > 	31c0569 vmscan: drop page_mapping_inuse()
> > > 	dfc8d63 vmscan: factor out page reference checks
> > > 
> > > because the situation was even worse before (see the series
> > > description in dfc8d63).  Maybe asking the obvious, but the kernel you
> > > tested on did include those commits, right?
> > > 
> > > And just to be sure, I sent you a test-patch to disable the used-once
> > > detection on IRC the other day.  Did you have time to run it yet?
> > > Here it is again:
> > > 
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 9c7e57c..c757bba 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -584,6 +584,7 @@ static enum page_references page_check_references(struct page *page,
> > >  		return PAGEREF_RECLAIM;
> > >  
> > >  	if (referenced_ptes) {
> > > +		return PAGEREF_ACTIVATE;
> > 
> > How come page activation helps?

This is effectively disabling used-once detection and going back to the old
VM behaviour.  I don't think it helps, but this code is recent and directly
related to the test-case.  Maybe I/we missed something; it can't hurt to
make sure, right?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-19 11:51           ` Johannes Weiner
@ 2010-08-19 21:09             ` Wu Fengguang
  -1 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-19 21:09 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li, Shaohua

On Thu, Aug 19, 2010 at 07:51:06PM +0800, Johannes Weiner wrote:
> I am currently trying to get rid of all the congestion_wait() in the VM.
> They are used for different purposes, so they need different replacement
> mechanisms.
> 
> I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
> think that congestion is not a good predicate in the first place.  Why
> would the VM care about IO _congestion_?  It needs a bunch of pages to
> complete IO, whether the writing device is congested is not really
> useful information at this point, I think.

I have the same feeling that the congestion_wait() calls are not
pertinent ones.  I'm glad to see people working on that exploring
all possible replacement schemes.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-19 21:09             ` Wu Fengguang
  0 siblings, 0 replies; 19+ messages in thread
From: Wu Fengguang @ 2010-08-19 21:09 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Matthew Wilcox, linux-mm, linux-kernel, Li, Shaohua

On Thu, Aug 19, 2010 at 07:51:06PM +0800, Johannes Weiner wrote:
> I am currently trying to get rid of all the congestion_wait() in the VM.
> They are used for different purposes, so they need different replacement
> mechanisms.
> 
> I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
> think that congestion is not a good predicate in the first place.  Why
> would the VM care about IO _congestion_?  It needs a bunch of pages to
> complete IO, whether the writing device is congested is not really
> useful information at this point, I think.

I have the same feeling that the congestion_wait() calls are not
pertinent ones.  I'm glad to see people working on that exploring
all possible replacement schemes.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
  2010-08-19 11:51           ` Johannes Weiner
@ 2010-08-20  5:05             ` Shaohua Li
  -1 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2010-08-20  5:05 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Wu, Fengguang, Matthew Wilcox, linux-mm, linux-kernel

On Thu, Aug 19, 2010 at 07:51:06PM +0800, Johannes Weiner wrote:
> On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu Fengguang wrote:
> > On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > > Hi Matthew,
> > > > 
> > > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > > 
> > > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > > 
> > > > I have not forgotten about it.  I just have a hard time reproducing
> > > > those extreme stalls you observed.
> > > > 
> > > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > > stalls of up to half a second.  The patchset I am experimenting with
> > > > gets me down to peaks of 70ms, but it needs further work.
> > > > 
> > > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > > starts scanning, it has to go through all of them twice and can only
> > > > reclaim them on the second encounter.
> > > > 
> > > > At that point, since we scan without making progress, we start waiting
> > > > for IO, which is not happening in this case, so we sit there until a
> > > > timeout expires.
> > > 
> > > Right, this could lead to some 1s stall. Shaohua and I also noticed
> > > this when investigating the responsiveness issues. And we are wondering
> > > if it makes sense to do congestion_wait() only when the bdi is really
> > > congested? There is no IO underway anyway in this case.
> 
> I am currently trying to get rid of all the congestion_wait() in the VM.
> They are used for different purposes, so they need different replacement
> mechanisms.
> 
> I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
> think that congestion is not a good predicate in the first place.  Why
> would the VM care about IO _congestion_?  It needs a bunch of pages to
> complete IO, whether the writing device is congested is not really
> useful information at this point, I think.
> 
> > > > since I can not reproduce your observations, I don't know if this is
> > > > the (sole) source of the problem.  Can I send you patches?
> > > 
> > > Sure.
> 
> Cool!
congestion_wait() isn't the sole source in my test.
With congestion_wait() removed, the max latency is ~50ms,
while if I make mmaped pages reclaimable in one round (by making
page_check_references return PAGEREF_RECLAIM_CLEAN for mmaped pages) in the test, the max latency is ~150us.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [TESTCASE] Clean pages clogging the VM
@ 2010-08-20  5:05             ` Shaohua Li
  0 siblings, 0 replies; 19+ messages in thread
From: Shaohua Li @ 2010-08-20  5:05 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Wu, Fengguang, Matthew Wilcox, linux-mm, linux-kernel

On Thu, Aug 19, 2010 at 07:51:06PM +0800, Johannes Weiner wrote:
> On Thu, Aug 19, 2010 at 12:07:31AM +0800, Wu Fengguang wrote:
> > On Thu, Aug 19, 2010 at 12:06:13AM +0800, Wu Fengguang wrote:
> > > On Wed, Aug 18, 2010 at 04:13:08PM +0200, Johannes Weiner wrote:
> > > > Hi Matthew,
> > > > 
> > > > On Tue, Aug 17, 2010 at 03:50:01PM -0400, Matthew Wilcox wrote:
> > > > > 
> > > > > No comment on this?  Was it just that I posted it during the VM summit?
> > > > 
> > > > I have not forgotten about it.  I just have a hard time reproducing
> > > > those extreme stalls you observed.
> > > > 
> > > > Running that test on a 2.5GHz machine with 2G of memory gives me
> > > > stalls of up to half a second.  The patchset I am experimenting with
> > > > gets me down to peaks of 70ms, but it needs further work.
> > > > 
> > > > Mapped file pages get two rounds on the LRU list, so once the VM
> > > > starts scanning, it has to go through all of them twice and can only
> > > > reclaim them on the second encounter.
> > > > 
> > > > At that point, since we scan without making progress, we start waiting
> > > > for IO, which is not happening in this case, so we sit there until a
> > > > timeout expires.
> > > 
> > > Right, this could lead to some 1s stall. Shaohua and me also noticed
> > > this when investigating the responsiveness issues. And we are wondering
> > > if it makes sense to do congestion_wait() only when the bdi is really
> > > congested? There are no IO underway anyway in this case.
> 
> I am currently trying to get rid of all the congestion_wait() in the VM.
> They are used for different purposes, so they need different replacement
> mechanisms.
> 
> I saw Shaohua's patch to make congestion_wait() cleverer.  But I really
> think that congestion is not a good predicate in the first place.  Why
> would the VM care about IO _congestion_?  It needs a bunch of pages to
> complete IO, whether the writing device is congested is not really
> useful information at this point, I think.
> 
> > > > since I can not reproduce your observations, I don't know if this is
> > > > the (sole) source of the problem.  Can I send you patches?
> > > 
> > > Sure.
> 
> Cool!
congestion_wait() isn't the sole source in my test.
With congestion_wait() removed, the max latency is ~50ms,
while if I make mmaped pages reclaimable in one round (by making
page_check_references return PAGEREF_RECLAIM_CLEAN for mmaped pages) in the test, the max latency is ~150us.

Thanks,
Shaohua

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2010-08-20  5:05 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-09 13:30 [TESTCASE] Clean pages clogging the VM Matthew Wilcox
2010-08-17 19:50 ` Matthew Wilcox
2010-08-17 19:50   ` Matthew Wilcox
2010-08-18 14:13   ` Johannes Weiner
2010-08-18 14:13     ` Johannes Weiner
     [not found]     ` <20100818160613.GE9431@localhost>
2010-08-18 16:07       ` Wu Fengguang
2010-08-18 16:07         ` Wu Fengguang
2010-08-19  1:42         ` Shaohua Li
2010-08-19  1:42           ` Shaohua Li
2010-08-19 11:51         ` Johannes Weiner
2010-08-19 11:51           ` Johannes Weiner
2010-08-19 21:09           ` Wu Fengguang
2010-08-19 21:09             ` Wu Fengguang
2010-08-20  5:05           ` Shaohua Li
2010-08-20  5:05             ` Shaohua Li
2010-08-18 21:26     ` Wu Fengguang
2010-08-18 21:26       ` Wu Fengguang
2010-08-19  9:18     ` KOSAKI Motohiro
2010-08-19  9:18       ` KOSAKI Motohiro

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.