* [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-12 21:48 Graham, Simon
2007-02-12 22:40 ` John Levon
2007-02-12 23:41 ` Iustin Pop
0 siblings, 2 replies; 6+ messages in thread
From: Graham, Simon @ 2007-02-12 21:48 UTC (permalink / raw)
To: xen-devel
[-- Attachment #1: Type: text/plain, Size: 846 bytes --]
Attached is a patch to unstable that stops save/restore/dump from hosing
Dom0 when dealing with large domains - I'm actually resubmitting the
dump patch I previously submitted in addition as it hasn't been
incorporated yet; this is based on using fadvise64(DONTNEED) to throw
the page cache away once it has been written to disk -- with this in
place, memory usage does go up somewhat but then immediately drops again
when the action is done and this change, in conjunction with setting the
vm.dirty_ratio sysctl parameter seems to gives very good results.
Simon
-----------------------------------
Reduce impact of saving/restoring/dumping large domains on Dom0 memory
usage by means of fadvise64() to tell the OS to discard the cache pages
used for the save/dump file.
Signed-off-by: Simon Graham <Simon.Graham@stratus.com>
[-- Attachment #2: xen-dom0-cache.patch --]
[-- Type: application/octet-stream, Size: 8320 bytes --]
Index: trunk/tools/libxc/xc_linux_save.c
===================================================================
--- trunk/tools/libxc/xc_linux_save.c (revision 10373)
+++ trunk/tools/libxc/xc_linux_save.c (working copy)
@@ -16,6 +16,10 @@
#include "xg_private.h"
#include "xg_save_restore.h"
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
@@ -26,7 +30,6 @@
#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
-
/* max mfn of the whole machine */
static unsigned long max_mfn;
@@ -171,7 +174,39 @@
(new->tv_usec - old->tv_usec);
}
+static int noncached_write(int fd, int live, void *buffer, int len)
+{
+ static int write_count = 0;
+ int rc = write(fd,buffer,len);
+
+ if (!live) {
+ write_count += len;
+
+ if (write_count >= MAX_PAGECACHE_USAGE*PAGE_SIZE) {
+ int serrno = errno;
+
+ /* Time to discard cache -- do up to last page boundary
+ amount written so far */
+
+ off_t cur = lseek(fd, 0, SEEK_CUR);
+
+ if (cur != (off_t)-1) {
+ cur &= ~(PAGE_SIZE-1);
+
+ if (fadvise64(fd, 0, cur, POSIX_FADV_DONTNEED) < 0) {
+ DPRINTF("Failed to discard cache: %s", strerror(errno));
+ }
+ }
+
+ write_count = 0;
+
+ errno = serrno;
+ }
+ }
+ return rc;
+}
+
#ifdef ADAPTIVE_SAVE
@@ -204,7 +239,7 @@
}
-static int ratewrite(int io_fd, void *buf, int n)
+static int ratewrite(int io_fd, int live, void *buf, int n)
{
static int budget = 0;
static int burst_time_us = -1;
@@ -214,7 +249,7 @@
long long delta;
if (START_MBIT_RATE == 0)
- return write(io_fd, buf, n);
+ return noncached_write(io_fd, live, buf, n);
budget -= n;
if (budget < 0) {
@@ -250,13 +285,13 @@
}
}
}
- return write(io_fd, buf, n);
+ return noncached_write(io_fd, live, buf, n);
}
#else /* ! ADAPTIVE SAVE */
#define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
#define initialize_mbit_rate()
#endif
@@ -1069,7 +1104,7 @@
if(race && !live)
goto out;
- if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
+ if (ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE) {
ERROR("Error when writing to state file (4)"
" (errno %d)", errno);
goto out;
@@ -1078,7 +1113,7 @@
} else {
/* We have a normal page: just write it directly. */
- if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
+ if (ratewrite(io_fd, live, spage, PAGE_SIZE) != PAGE_SIZE) {
ERROR("Error when writing to state file (5)"
" (errno %d)", errno);
goto out;
@@ -1248,6 +1283,15 @@
DPRINTF("Warning - couldn't disable shadow mode");
}
}
+ else {
+ // flush last write and discard cache for file
+ if (fsync(io_fd) < 0) {
+ DPRINTF("Failed to flush file: %s", strerror(errno));
+ }
+ if (fadvise64(io_fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+ DPRINTF("Failed to discard cache: %s", strerror(errno));
+ }
+ }
if (live_shinfo)
munmap(live_shinfo, PAGE_SIZE);
Index: trunk/tools/libxc/Makefile
===================================================================
--- trunk/tools/libxc/Makefile (revision 10373)
+++ trunk/tools/libxc/Makefile (working copy)
@@ -67,6 +67,7 @@
CFLAGS += -Werror -Wmissing-prototypes
CFLAGS += -fno-strict-aliasing
CFLAGS += $(INCLUDES) -I.
+CFLAGS += -D_GNU_SOURCE
# Define this to make it possible to run valgrind on code linked with these
# libraries.
Index: trunk/tools/libxc/xc_private.h
===================================================================
--- trunk/tools/libxc/xc_private.h (revision 10373)
+++ trunk/tools/libxc/xc_private.h (working copy)
@@ -41,6 +41,13 @@
#define INFO 1
#define PROGRESS 0
+/*
+** Define max dirty page cache to permit during save/restore -- need to balance
+** keeping cache usage down with CPU impact of invalidating too often.
+** (Currently 16MB)
+*/
+#define MAX_PAGECACHE_USAGE (4*1024)
+
#if INFO
#define IPRINTF(_f, _a...) printf(_f , ## _a)
#else
Index: trunk/tools/libxc/xc_core.c
===================================================================
--- trunk/tools/libxc/xc_core.c (revision 10373)
+++ trunk/tools/libxc/xc_core.c (working copy)
@@ -2,6 +2,10 @@
#include <stdlib.h>
#include <unistd.h>
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
/* number of pages to write at a time */
#define DUMP_INCREMENT (4 * 1024)
#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
@@ -129,12 +133,27 @@
int fd;
};
+/* Flush file to disk and discard page cache */
+static int discard_file_cache(int fd, int flush)
+{
+ if (flush && fsync(fd) < 0) {
+ PERROR("Failed to flush file: %s", strerror(errno));
+ return -errno;
+ }
+ if (fadvise64(fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+ PERROR("Failed to discard cache: %s", strerror(errno));
+ return -errno;
+ }
+
+ return 0;
+}
+
/* Callback routine for writing to a local dump file. */
static int local_file_dump(void *args, char *buffer, unsigned int length)
{
struct dump_args *da = args;
int bytes, offset;
-
+
for ( offset = 0; offset < length; offset += bytes )
{
bytes = write(da->fd, &buffer[offset], length-offset);
@@ -145,6 +164,12 @@
}
}
+ if (length >= DUMP_INCREMENT*PAGE_SIZE) {
+ // Now dumping pages -- make sure we discard clean pages from
+ // the cache after each write
+ discard_file_cache(da->fd, 0 /* no flush */);
+ }
+
return 0;
}
@@ -165,6 +190,9 @@
sts = xc_domain_dumpcore_via_callback(
xc_handle, domid, &da, &local_file_dump);
+ /* flush and discard any remaining portion of the file from cache */
+ discard_file_cache(da.fd, 1/* flush first*/);
+
close(da.fd);
return sts;
Index: trunk/tools/libxc/xc_linux_restore.c
===================================================================
--- trunk/tools/libxc/xc_linux_restore.c (revision 10373)
+++ trunk/tools/libxc/xc_linux_restore.c (working copy)
@@ -12,6 +12,10 @@
#include "xg_private.h"
#include "xg_save_restore.h"
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
/* max mfn of the current host machine */
static unsigned long max_mfn;
@@ -143,7 +147,7 @@
unsigned int console_evtchn, unsigned long *console_mfn)
{
DECLARE_DOMCTL;
- int rc = 1, i, n, pae_extended_cr3 = 0;
+ int rc = 1, i, n, m, pae_extended_cr3 = 0;
unsigned long mfn, pfn;
unsigned int prev_pc, this_pc;
int verify = 0;
@@ -330,7 +334,7 @@
*/
prev_pc = 0;
- n = 0;
+ n = m = 0;
while (1) {
int j, nr_mfns = 0;
@@ -529,6 +533,20 @@
munmap(region_base, j*PAGE_SIZE);
n+= j; /* crude stats */
+
+ /* discard cache for portion of file read so far up to last page boundary
+ every 16MB (4k pages) or so */
+ m += j;
+ if (m > MAX_PAGECACHE_USAGE) {
+ off_t cur = lseek(io_fd, 0, SEEK_CUR);
+ if (cur != (off_t)-1) {
+ cur &= ~(PAGE_SIZE-1);
+ if (fadvise64(io_fd, 0, cur, POSIX_FADV_DONTNEED) < 0) {
+ DPRINTF("Failed to discard cache: %s\n", strerror(errno));
+ }
+ }
+ m = 0;
+ }
}
/*
@@ -863,6 +881,11 @@
free(p2m);
free(pfn_type);
+ /* discard cache for save file */
+ if (fadvise64(io_fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+ DPRINTF("Failed to discard cache: %s\n", strerror(errno));
+ }
+
DPRINTF("Restore exit with rc=%d\n", rc);
return rc;
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
@ 2007-02-12 22:40 ` John Levon
2007-02-12 23:41 ` Iustin Pop
1 sibling, 0 replies; 6+ messages in thread
From: John Levon @ 2007-02-12 22:40 UTC (permalink / raw)
To: Graham, Simon; +Cc: xen-devel
On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:
> Attached is a patch to unstable that stops save/restore/dump from hosing
> Dom0 when dealing with large domains - I'm actually resubmitting the
There's no fadvise at all on Solaris, so could you make this conditional
please?
thanks
john
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
2007-02-12 22:40 ` John Levon
@ 2007-02-12 23:41 ` Iustin Pop
1 sibling, 0 replies; 6+ messages in thread
From: Iustin Pop @ 2007-02-12 23:41 UTC (permalink / raw)
To: Graham, Simon; +Cc: xen-devel
On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:
> Attached is a patch to unstable that stops save/restore/dump from hosing
> Dom0 when dealing with large domains - I'm actually resubmitting the
> dump patch I previously submitted in addition as it hasn't been
> incorporated yet; this is based on using fadvise64(DONTNEED) to throw
> the page cache away once it has been written to disk -- with this in
> place, memory usage does go up somewhat but then immediately drops again
> when the action is done and this change, in conjunction with setting the
> vm.dirty_ratio sysctl parameter seems to gives very good results.
Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
to some posix standard, according to the man page, whereas the fadvise64
seems to be some glibc internal definition.
Just a thought.
Iustin
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
2007-02-13 13:01 Graham, Simon
@ 2007-02-13 18:49 ` Alan
0 siblings, 0 replies; 6+ messages in thread
From: Alan @ 2007-02-13 18:49 UTC (permalink / raw)
To: Graham, Simon; +Cc: Iustin Pop, xen-devel
On Tue, 13 Feb 2007 08:01:39 -0500
"Graham, Simon" <Simon.Graham@stratus.com> wrote:
> > Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
> > to some posix standard, according to the man page, whereas the
> > fadvise64
> > seems to be some glibc internal definition.
> >
>
> It seemed to me (and I am prepared to be wrong) that posix_fadvise64 is
> a Linux only thing and the real official name is fadvise64 (although man
> fadvise64 on Linux claims that it's either fadvise64_64 or
> sys_fadvise64). Given that Solaris doesn't have fadvise, I'll make this
> conditional on being built on linux and use posix_fadvise64 directly.
Linux man(2) documents system calls. Glibc provides POSIX interfaces for
the OS
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-13 13:01 Graham, Simon
2007-02-13 18:49 ` Alan
0 siblings, 1 reply; 6+ messages in thread
From: Graham, Simon @ 2007-02-13 13:01 UTC (permalink / raw)
To: Iustin Pop; +Cc: xen-devel
> Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
> to some posix standard, according to the man page, whereas the
> fadvise64
> seems to be some glibc internal definition.
>
It seemed to me (and I am prepared to be wrong) that posix_fadvise64 is
a Linux only thing and the real official name is fadvise64 (although man
fadvise64 on Linux claims that it's either fadvise64_64 or
sys_fadvise64). Given that Solaris doesn't have fadvise, I'll make this
conditional on being built on linux and use posix_fadvise64 directly.
Simon
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-13 12:59 Graham, Simon
0 siblings, 0 replies; 6+ messages in thread
From: Graham, Simon @ 2007-02-13 12:59 UTC (permalink / raw)
To: John Levon; +Cc: xen-devel
Sure -- sorry about that; I definitely google'd and thought I'd seen
that it does exist.
Simon
> -----Original Message-----
> From: John Levon [mailto:levon@movementarian.org]
> Sent: Monday, February 12, 2007 5:40 PM
> To: Graham, Simon
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] [PATCH][TOOLS] Reducing impact of domain
> save/restore/dump on Dom0
>
> On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:
>
> > Attached is a patch to unstable that stops save/restore/dump from
> hosing
> > Dom0 when dealing with large domains - I'm actually resubmitting the
>
> There's no fadvise at all on Solaris, so could you make this
> conditional
> please?
>
> thanks
> john
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2007-02-13 18:49 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
2007-02-12 22:40 ` John Levon
2007-02-12 23:41 ` Iustin Pop
2007-02-13 12:59 Graham, Simon
2007-02-13 13:01 Graham, Simon
2007-02-13 18:49 ` Alan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.