All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-12 21:48 Graham, Simon
  2007-02-12 22:40 ` John Levon
  2007-02-12 23:41 ` Iustin Pop
  0 siblings, 2 replies; 6+ messages in thread
From: Graham, Simon @ 2007-02-12 21:48 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 846 bytes --]

Attached is a patch to unstable that stops save/restore/dump from hosing
Dom0 when dealing with large domains - I'm actually resubmitting the
dump patch I previously submitted in addition as it hasn't been
incorporated yet; this is based on using fadvise64(DONTNEED) to throw
the page cache away once it has been written to disk -- with this in
place, memory usage does go up somewhat but then immediately drops again
when the action is done and this change, in conjunction with setting the
vm.dirty_ratio sysctl parameter seems to gives very good results.

Simon

-----------------------------------
Reduce impact of saving/restoring/dumping large domains on Dom0 memory
usage by means of fadvise64() to tell the OS to discard the cache pages
used for the save/dump file.

Signed-off-by: Simon Graham <Simon.Graham@stratus.com>

[-- Attachment #2: xen-dom0-cache.patch --]
[-- Type: application/octet-stream, Size: 8320 bytes --]

Index: trunk/tools/libxc/xc_linux_save.c
===================================================================
--- trunk/tools/libxc/xc_linux_save.c	(revision 10373)
+++ trunk/tools/libxc/xc_linux_save.c	(working copy)
@@ -16,6 +16,10 @@
 #include "xg_private.h"
 #include "xg_save_restore.h"
 
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
 /*
 ** Default values for important tuning parameters. Can override by passing
 ** non-zero replacement values to xc_linux_save().
@@ -26,7 +30,6 @@
 #define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
 #define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
 
-
 /* max mfn of the whole machine */
 static unsigned long max_mfn;
 
@@ -171,7 +174,39 @@
         (new->tv_usec - old->tv_usec);
 }
 
+static int noncached_write(int fd, int live, void *buffer, int len) 
+{
+    static int write_count = 0;
 
+    int rc = write(fd,buffer,len);
+
+    if (!live) {
+        write_count += len;
+
+        if (write_count >= MAX_PAGECACHE_USAGE*PAGE_SIZE) {
+            int serrno = errno;
+
+            /* Time to discard cache -- do up to last page boundary
+               amount written so far */
+
+            off_t cur = lseek(fd, 0, SEEK_CUR);
+
+            if (cur != (off_t)-1) {
+                cur &= ~(PAGE_SIZE-1);
+
+                if (fadvise64(fd, 0, cur, POSIX_FADV_DONTNEED) < 0) {
+                    DPRINTF("Failed to discard cache: %s", strerror(errno));
+                }
+            }
+
+            write_count = 0;
+
+            errno = serrno;
+        }
+    }
+    return rc;
+}
+
 #ifdef ADAPTIVE_SAVE
 
 
@@ -204,7 +239,7 @@
 }
 
 
-static int ratewrite(int io_fd, void *buf, int n)
+static int ratewrite(int io_fd, int live, void *buf, int n)
 {
     static int budget = 0;
     static int burst_time_us = -1;
@@ -214,7 +249,7 @@
     long long delta;
 
     if (START_MBIT_RATE == 0)
-        return write(io_fd, buf, n);
+        return noncached_write(io_fd, live, buf, n);
 
     budget -= n;
     if (budget < 0) {
@@ -250,13 +285,13 @@
             }
         }
     }
-    return write(io_fd, buf, n);
+    return noncached_write(io_fd, live, buf, n);
 }
 
 #else /* ! ADAPTIVE SAVE */
 
 #define RATE_IS_MAX() (0)
-#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
 #define initialize_mbit_rate()
 
 #endif
@@ -1069,7 +1104,7 @@
                     if(race && !live) 
                         goto out; 
 
-                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
+                    if (ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE) {
                         ERROR("Error when writing to state file (4)"
                               " (errno %d)", errno);
                         goto out;
@@ -1078,7 +1113,7 @@
                 }  else {
 
                     /* We have a normal page: just write it directly. */
-                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
+                    if (ratewrite(io_fd, live, spage, PAGE_SIZE) != PAGE_SIZE) {
                         ERROR("Error when writing to state file (5)"
                               " (errno %d)", errno);
                         goto out;
@@ -1248,6 +1283,15 @@
             DPRINTF("Warning - couldn't disable shadow mode");
         }
     }
+    else {
+        // flush last write and discard cache for file
+        if (fsync(io_fd) < 0) {
+            DPRINTF("Failed to flush file: %s", strerror(errno));
+        }
+        if (fadvise64(io_fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+            DPRINTF("Failed to discard cache: %s", strerror(errno));
+        }
+    }            
 
     if (live_shinfo)
         munmap(live_shinfo, PAGE_SIZE);
Index: trunk/tools/libxc/Makefile
===================================================================
--- trunk/tools/libxc/Makefile	(revision 10373)
+++ trunk/tools/libxc/Makefile	(working copy)
@@ -67,6 +67,7 @@
 CFLAGS   += -Werror -Wmissing-prototypes
 CFLAGS   += -fno-strict-aliasing
 CFLAGS   += $(INCLUDES) -I.
+CFLAGS   += -D_GNU_SOURCE
 
 # Define this to make it possible to run valgrind on code linked with these
 # libraries.
Index: trunk/tools/libxc/xc_private.h
===================================================================
--- trunk/tools/libxc/xc_private.h	(revision 10373)
+++ trunk/tools/libxc/xc_private.h	(working copy)
@@ -41,6 +41,13 @@
 #define INFO     1
 #define PROGRESS 0
 
+/*
+** Define max dirty page cache to permit during save/restore -- need to balance 
+** keeping cache usage down with CPU impact of invalidating too often.
+** (Currently 16MB)
+*/
+#define MAX_PAGECACHE_USAGE (4*1024)
+
 #if INFO
 #define IPRINTF(_f, _a...) printf(_f , ## _a)
 #else
Index: trunk/tools/libxc/xc_core.c
===================================================================
--- trunk/tools/libxc/xc_core.c	(revision 10373)
+++ trunk/tools/libxc/xc_core.c	(working copy)
@@ -2,6 +2,10 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
 /* number of pages to write at a time */
 #define DUMP_INCREMENT (4 * 1024)
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
@@ -129,12 +133,27 @@
     int     fd;
 };
 
+/* Flush file to disk and discard page cache */
+static int discard_file_cache(int fd, int flush) 
+{ 
+    if (flush && fsync(fd) < 0) {
+        PERROR("Failed to flush file: %s", strerror(errno));
+        return -errno;
+    }
+    if (fadvise64(fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+        PERROR("Failed to discard cache: %s", strerror(errno));
+        return -errno;
+    }
+
+    return 0;
+}
+
 /* Callback routine for writing to a local dump file. */
 static int local_file_dump(void *args, char *buffer, unsigned int length)
 {
     struct dump_args *da = args;
     int bytes, offset;
-
+    
     for ( offset = 0; offset < length; offset += bytes )
     {
         bytes = write(da->fd, &buffer[offset], length-offset);
@@ -145,6 +164,12 @@
         }
     }
 
+    if (length >= DUMP_INCREMENT*PAGE_SIZE) {
+        // Now dumping pages -- make sure we discard clean pages from
+        // the cache after each write
+        discard_file_cache(da->fd, 0 /* no flush */);
+    }
+
     return 0;
 }
 
@@ -165,6 +190,9 @@
     sts = xc_domain_dumpcore_via_callback(
         xc_handle, domid, &da, &local_file_dump);
 
+    /* flush and discard any remaining portion of the file from cache */
+    discard_file_cache(da.fd, 1/* flush first*/);
+
     close(da.fd);
 
     return sts;
Index: trunk/tools/libxc/xc_linux_restore.c
===================================================================
--- trunk/tools/libxc/xc_linux_restore.c	(revision 10373)
+++ trunk/tools/libxc/xc_linux_restore.c	(working copy)
@@ -12,6 +12,10 @@
 #include "xg_private.h"
 #include "xg_save_restore.h"
 
+#ifdef __linux__
+#define fadvise64 posix_fadvise64
+#endif
+
 /* max mfn of the current host machine */
 static unsigned long max_mfn;
 
@@ -143,7 +147,7 @@
                      unsigned int console_evtchn, unsigned long *console_mfn)
 {
     DECLARE_DOMCTL;
-    int rc = 1, i, n, pae_extended_cr3 = 0;
+    int rc = 1, i, n, m, pae_extended_cr3 = 0;
     unsigned long mfn, pfn;
     unsigned int prev_pc, this_pc;
     int verify = 0;
@@ -330,7 +334,7 @@
      */
     prev_pc = 0;
 
-    n = 0;
+    n = m = 0;
     while (1) {
 
         int j, nr_mfns = 0; 
@@ -529,6 +533,20 @@
 
         munmap(region_base, j*PAGE_SIZE);
         n+= j; /* crude stats */
+
+	/* discard cache for portion of file read so far up to last page boundary
+	   every 16MB (4k pages) or so */
+	m += j;
+	if (m > MAX_PAGECACHE_USAGE) {
+	    off_t cur = lseek(io_fd, 0, SEEK_CUR);
+	    if (cur != (off_t)-1) {
+		cur &= ~(PAGE_SIZE-1);
+		if (fadvise64(io_fd, 0, cur, POSIX_FADV_DONTNEED) < 0) {
+		    DPRINTF("Failed to discard cache: %s\n", strerror(errno));
+		}
+	    }
+	    m = 0;
+	}
     }
 
     /*
@@ -863,6 +881,11 @@
     free(p2m);
     free(pfn_type);
 
+    /* discard cache for save file  */
+    if (fadvise64(io_fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
+	DPRINTF("Failed to discard cache: %s\n", strerror(errno));
+    }
+
     DPRINTF("Restore exit with rc=%d\n", rc);
     
     return rc;

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
  2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
@ 2007-02-12 22:40 ` John Levon
  2007-02-12 23:41 ` Iustin Pop
  1 sibling, 0 replies; 6+ messages in thread
From: John Levon @ 2007-02-12 22:40 UTC (permalink / raw)
  To: Graham, Simon; +Cc: xen-devel

On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:

> Attached is a patch to unstable that stops save/restore/dump from hosing
> Dom0 when dealing with large domains - I'm actually resubmitting the

There's no fadvise at all on Solaris, so could you make this conditional
please?

thanks
john

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
  2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
  2007-02-12 22:40 ` John Levon
@ 2007-02-12 23:41 ` Iustin Pop
  1 sibling, 0 replies; 6+ messages in thread
From: Iustin Pop @ 2007-02-12 23:41 UTC (permalink / raw)
  To: Graham, Simon; +Cc: xen-devel

On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:
> Attached is a patch to unstable that stops save/restore/dump from hosing
> Dom0 when dealing with large domains - I'm actually resubmitting the
> dump patch I previously submitted in addition as it hasn't been
> incorporated yet; this is based on using fadvise64(DONTNEED) to throw
> the page cache away once it has been written to disk -- with this in
> place, memory usage does go up somewhat but then immediately drops again
> when the action is done and this change, in conjunction with setting the
> vm.dirty_ratio sysctl parameter seems to gives very good results.

Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
to some posix standard, according to the man page, whereas the fadvise64
seems to be some glibc internal definition.

Just a thought.

Iustin

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
  2007-02-13 13:01 Graham, Simon
@ 2007-02-13 18:49 ` Alan
  0 siblings, 0 replies; 6+ messages in thread
From: Alan @ 2007-02-13 18:49 UTC (permalink / raw)
  To: Graham, Simon; +Cc: Iustin Pop, xen-devel

On Tue, 13 Feb 2007 08:01:39 -0500
"Graham, Simon" <Simon.Graham@stratus.com> wrote:

> > Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
> > to some posix standard, according to the man page, whereas the
> > fadvise64
> > seems to be some glibc internal definition.
> >
> 
> It seemed to me (and I am prepared to be wrong) that posix_fadvise64 is
> a Linux only thing and the real official name is fadvise64 (although man
> fadvise64 on Linux claims that it's either fadvise64_64 or
> sys_fadvise64). Given that Solaris doesn't have fadvise, I'll make this
> conditional on being built on linux and use posix_fadvise64 directly.

Linux man(2) documents system calls. Glibc provides POSIX interfaces for
the OS

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-13 13:01 Graham, Simon
  2007-02-13 18:49 ` Alan
  0 siblings, 1 reply; 6+ messages in thread
From: Graham, Simon @ 2007-02-13 13:01 UTC (permalink / raw)
  To: Iustin Pop; +Cc: xen-devel

> Question - why fadvise64 and not posix_fadvise? posix_fadvise complies
> to some posix standard, according to the man page, whereas the
> fadvise64
> seems to be some glibc internal definition.
>

It seemed to me (and I am prepared to be wrong) that posix_fadvise64 is
a Linux only thing and the real official name is fadvise64 (although man
fadvise64 on Linux claims that it's either fadvise64_64 or
sys_fadvise64). Given that Solaris doesn't have fadvise, I'll make this
conditional on being built on linux and use posix_fadvise64 directly.

Simon
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0
@ 2007-02-13 12:59 Graham, Simon
  0 siblings, 0 replies; 6+ messages in thread
From: Graham, Simon @ 2007-02-13 12:59 UTC (permalink / raw)
  To: John Levon; +Cc: xen-devel

Sure -- sorry about that; I definitely google'd and thought I'd seen
that it does exist.

Simon

> -----Original Message-----
> From: John Levon [mailto:levon@movementarian.org]
> Sent: Monday, February 12, 2007 5:40 PM
> To: Graham, Simon
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] [PATCH][TOOLS] Reducing impact of domain
> save/restore/dump on Dom0
> 
> On Mon, Feb 12, 2007 at 04:48:09PM -0500, Graham, Simon wrote:
> 
> > Attached is a patch to unstable that stops save/restore/dump from
> hosing
> > Dom0 when dealing with large domains - I'm actually resubmitting the
> 
> There's no fadvise at all on Solaris, so could you make this
> conditional
> please?
> 
> thanks
> john

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2007-02-13 18:49 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-12 21:48 [PATCH][TOOLS] Reducing impact of domain save/restore/dump on Dom0 Graham, Simon
2007-02-12 22:40 ` John Levon
2007-02-12 23:41 ` Iustin Pop
2007-02-13 12:59 Graham, Simon
2007-02-13 13:01 Graham, Simon
2007-02-13 18:49 ` Alan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.