All of lore.kernel.org
 help / color / mirror / Atom feed
From: Orit Wasserman <owasserm@redhat.com>
To: Isaku Yamahata <yamahata@valinux.co.jp>, satoshi.itoh@aist.go.jp
Cc: kvm@vger.kernel.org, qemu-devel@nongnu.org, t.hirofuchi@aist.go.jp
Subject: Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration
Date: Thu, 29 Dec 2011 17:51:36 +0200	[thread overview]
Message-ID: <4EFC8C88.70701@redhat.com> (raw)
In-Reply-To: <f5061e756dfd4ebf8456170a2c146a81be01c685.1325055140.git.yamahata@valinux.co.jp>

Hi,
A general comment: this patch is a bit too long, which makes it hard to review.
Can you split it please?

On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> This patch implements postcopy livemigration.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> ---
>  Makefile.target           |    4 +
>  arch_init.c               |   26 +-
>  cpu-all.h                 |    7 +
>  exec.c                    |   20 +-
>  migration-exec.c          |    8 +
>  migration-fd.c            |   30 +
>  migration-postcopy-stub.c |   77 ++
>  migration-postcopy.c      | 1891 +++++++++++++++++++++++++++++++++++++++++++++
>  migration-tcp.c           |   37 +-
>  migration-unix.c          |   32 +-
>  migration.c               |   31 +
>  migration.h               |   30 +
>  qemu-common.h             |    1 +
>  qemu-options.hx           |    5 +-
>  umem.c                    |  379 +++++++++
>  umem.h                    |  105 +++
>  vl.c                      |   14 +-
>  17 files changed, 2677 insertions(+), 20 deletions(-)
>  create mode 100644 migration-postcopy-stub.c
>  create mode 100644 migration-postcopy.c
>  create mode 100644 umem.c
>  create mode 100644 umem.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index 3261383..d94c53f 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h
>  CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y)
>  CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y)
>  CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y)
> +CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y)
>  
>  include ../config-host.mak
>  include config-devices.mak
> @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
>  obj-y += memory.o
>  LIBS+=-lz
>  
> +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o
> +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o
> +
>  QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
>  QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
> diff --git a/arch_init.c b/arch_init.c
> index bc53092..8b3130d 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
>      return 1;
>  }
>  
> +static bool outgoing_postcopy = false;
> +
> +void ram_save_set_params(const MigrationParams *params, void *opaque)
> +{
> +    outgoing_postcopy = params->postcopy;
> +}
> +
>  static RAMBlock *last_block_sent = NULL;
>  
>  int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
> @@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>      uint64_t expected_time = 0;
>      int ret;
>  
> +    if (stage == 1) {
> +        last_block_sent = NULL;
> +
> +        bytes_transferred = 0;
> +        last_block = NULL;
> +        last_offset = 0;

This hunk changes the order of these lines and adds a new empty line — is that intentional?

> +    }
> +    if (outgoing_postcopy) {
> +        return postcopy_outgoing_ram_save_live(mon, f, stage, opaque);
> +    }
> +

I would just do:

unregister_savevm_live and then register_savevm_live(...,postcopy_outgoing_ram_save_live,...)
when starting outgoing postcopy migration.

>      if (stage < 0) {
>          cpu_physical_memory_set_dirty_tracking(0);
>          return 0;
> @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>      }
>  
>      if (stage == 1) {
> -        bytes_transferred = 0;
> -        last_block_sent = NULL;
> -        last_block = NULL;
> -        last_offset = 0;
>          sort_ram_list();
>  
>          /* Make sure all dirty bits are set */
> @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
>      int flags;
>      int error;
>  
> +    if (incoming_postcopy) {
> +        return postcopy_incoming_ram_load(f, opaque, version_id);
> +    }
> +
Why not call register_savevm_live(..., postcopy_incoming_ram_load, ...) when starting the guest with postcopy_incoming?

>      if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) {
>          return -EINVAL;
>      }
> diff --git a/cpu-all.h b/cpu-all.h
> index 0244f7a..2e9d8a7 100644
> --- a/cpu-all.h
> +++ b/cpu-all.h
> @@ -475,6 +475,9 @@ extern ram_addr_t ram_size;
>  /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
>  #define RAM_PREALLOC_MASK   (1 << 0)
>  
> +/* RAM is allocated via umem for postcopy incoming mode */
> +#define RAM_POSTCOPY_UMEM_MASK  (1 << 1)
> +
>  typedef struct RAMBlock {
>      uint8_t *host;
>      ram_addr_t offset;
> @@ -485,6 +488,10 @@ typedef struct RAMBlock {
>  #if defined(__linux__) && !defined(TARGET_S390X)
>      int fd;
>  #endif
> +
> +#ifdef CONFIG_POSTCOPY
> +    UMem *umem;    /* for incoming postcopy mode */
> +#endif
>  } RAMBlock;
>  
>  typedef struct RAMList {
> diff --git a/exec.c b/exec.c
> index c8c6692..90b0491 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -35,6 +35,7 @@
>  #include "qemu-timer.h"
>  #include "memory.h"
>  #include "exec-memory.h"
> +#include "migration.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>          new_block->host = host;
>          new_block->flags |= RAM_PREALLOC_MASK;
>      } else {
> +#ifdef CONFIG_POSTCOPY
> +        if (incoming_postcopy) {
> +            postcopy_incoming_ram_alloc(name, size,
> +                                        &new_block->host, &new_block->umem);
> +            new_block->flags |= RAM_POSTCOPY_UMEM_MASK;
> +        } else
> +#endif
>          if (mem_path) {
>  #if defined (__linux__) && !defined(TARGET_S390X)
>              new_block->host = file_ram_alloc(new_block, size, mem_path);
> @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr)
>              QLIST_REMOVE(block, next);
>              if (block->flags & RAM_PREALLOC_MASK) {
>                  ;
> -            } else if (mem_path) {
> +            }
> +#ifdef CONFIG_POSTCOPY
> +            else if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> +                postcopy_incoming_ram_free(block->umem);
> +            }
> +#endif
> +            else if (mem_path) {
>  #if defined (__linux__) && !defined(TARGET_S390X)
>                  if (block->fd) {
>                      munmap(block->host, block->length);
> @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>              } else {
>                  flags = MAP_FIXED;
>                  munmap(vaddr, length);
> +                if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> +                    postcopy_incoming_qemu_pages_unmapped(addr, length);
> +                    block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> +                }
>                  if (mem_path) {
>  #if defined(__linux__) && !defined(TARGET_S390X)
>                      if (block->fd) {
> diff --git a/migration-exec.c b/migration-exec.c
> index e14552e..2bd0c3b 100644
> --- a/migration-exec.c
> +++ b/migration-exec.c
> @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const char *command)
>  {
>      FILE *f;
>  
> +    if (s->params.postcopy) {
> +        return -ENOSYS;
> +    }
> +
>      f = popen(command, "w");
>      if (f == NULL) {
>          DPRINTF("Unable to popen exec target\n");
> @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command)
>  {
>      QEMUFile *f;
>  
> +    if (incoming_postcopy) {
> +        return -ENOSYS;
> +    }
> +
>      DPRINTF("Attempting to start an incoming migration\n");
>      f = qemu_popen_cmd(command, "r");
>      if(f == NULL) {
> diff --git a/migration-fd.c b/migration-fd.c
> index 6211124..5a62ab9 100644
> --- a/migration-fd.c
> +++ b/migration-fd.c
> @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const char *fdname)
>      s->write = fd_write;
>      s->close = fd_close;
>  
> +    if (s->params.postcopy) {
> +        int flags = fcntl(s->fd, F_GETFL);
> +        if ((flags & O_ACCMODE) != O_RDWR) {
> +            goto err_after_open;
> +        }
> +
> +        s->fd_read = dup(s->fd);
> +        if (s->fd_read == -1) {
> +            goto err_after_open;
> +        }
> +        s->file_read = qemu_fdopen(s->fd_read, "r");
> +        if (s->file_read == NULL) {
> +            close(s->fd_read);
> +            goto err_after_open;
> +        }
> +    }
> +
>      migrate_fd_connect(s);
>      return 0;
>  
> @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque)
>  
>      process_incoming_migration(f);
>      qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_ready();
> +    }
> +    return;
>  }
>  
>  int fd_start_incoming_migration(const char *infd)
> @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd)
>      DPRINTF("Attempting to start an incoming migration via fd\n");
>  
>      fd = strtol(infd, NULL, 0);
> +    if (incoming_postcopy) {
> +        int flags = fcntl(fd, F_GETFL);
> +        if ((flags & O_ACCMODE) != O_RDWR) {
> +            return -EINVAL;
> +        }
> +    }
>      f = qemu_fdopen(fd, "rb");
>      if(f == NULL) {
>          DPRINTF("Unable to apply qemu wrapper to file descriptor\n");
> diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c
> new file mode 100644
> index 0000000..0b78de7
> --- /dev/null
> +++ b/migration-postcopy-stub.c
> @@ -0,0 +1,77 @@
> +/*
> + * migration-postcopy-stub.c: postcopy livemigration
> + *                            stub functions for non-supported hosts
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "migration.h"
> +
> +int postcopy_outgoing_create_read_socket(MigrationState *s)
> +{
> +    return -ENOSYS;
> +}
> +
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque)
> +{
> +    return -ENOSYS;
> +}
> +
> +void *postcopy_outgoing_begin(MigrationState *ms)
> +{
> +    return NULL;
> +}
> +
> +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> +                                          void *postcopy)
> +{
> +    return -ENOSYS;
> +}
> +
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> +{
> +    return -ENOSYS;
> +}
> +
> +void postcopy_incoming_prepare(void)
> +{
> +}
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    return -ENOSYS;
> +}
> +
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> +{
> +}
> +
> +void postcopy_incoming_qemu_ready(void)
> +{
> +}
> +
> +void postcopy_incoming_qemu_cleanup(void)
> +{
> +}
> +
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
> +{
> +}
> diff --git a/migration-postcopy.c b/migration-postcopy.c
> new file mode 100644
> index 0000000..ed0d574
> --- /dev/null
> +++ b/migration-postcopy.c
> @@ -0,0 +1,1891 @@
> +/*
> + * migration-postcopy.c: postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "bitmap.h"
> +#include "sysemu.h"
> +#include "hw/hw.h"
> +#include "arch_init.h"
> +#include "migration.h"
> +#include "umem.h"
> +
> +#include "memory.h"
> +#define WANT_EXEC_OBSOLETE
> +#include "exec-obsolete.h"
> +
> +//#define DEBUG_POSTCOPY
> +#ifdef DEBUG_POSTCOPY
> +#include <sys/syscall.h>
> +#define DPRINTF(fmt, ...)                                               \
> +    do {                                                                \
> +        printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid),     \
> +               __func__, __LINE__, ## __VA_ARGS__);                     \
> +    } while (0)
> +#else
> +#define DPRINTF(fmt, ...)       do { } while (0)
> +#endif
> +
> +#define ALIGN_UP(size, align)   (((size) + (align) - 1) & ~((align) - 1))
> +
> +static void fd_close(int *fd)
> +{
> +    if (*fd >= 0) {
> +        close(*fd);
> +        *fd = -1;
> +    }
> +}
> +
> +/***************************************************************************
> + * QEMUFile for non blocking pipe
> + */
> +
> +/* read only */
> +struct QEMUFilePipe {
> +    int fd;
> +    QEMUFile *file;
> +};

Why not use QEMUFileSocket?

> +typedef struct QEMUFilePipe QEMUFilePipe;
> +
> +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFilePipe *s = opaque;
> +    ssize_t len = 0;
> +
> +    while (size > 0) {
> +        ssize_t ret = read(s->fd, buf, size);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            if (len == 0) {
> +                len = -errno;
> +            }
> +            break;
> +        }
> +
> +        if (ret == 0) {
> +            /* the write end of the pipe is closed */
> +            break;
> +        }
> +        len += ret;
> +        buf += ret;
> +        size -= ret;
> +    }
> +
> +    return len;
> +}
> +
> +static int pipe_close(void *opaque)
> +{
> +    QEMUFilePipe *s = opaque;
> +    g_free(s);
> +    return 0;
> +}
> +
> +static QEMUFile *qemu_fopen_pipe(int fd)
> +{
> +    QEMUFilePipe *s = g_malloc0(sizeof(*s));
> +
> +    s->fd = fd;
> +    fcntl_setfl(fd, O_NONBLOCK);
> +    s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close,
> +                             NULL, NULL, NULL);
> +    return s->file;
> +}
> +
> +/* write only */
> +struct QEMUFileNonblock {
> +    int fd;
> +    QEMUFile *file;
> +
> +    /* for pipe-write nonblocking mode */
> +#define BUF_SIZE_INC    (32 * 1024)     /* = IO_BUF_SIZE */
> +    uint8_t *buffer;
> +    size_t buffer_size;
> +    size_t buffer_capacity;
> +    bool freeze_output;
> +};
> +typedef struct QEMUFileNonblock QEMUFileNonblock;
> +

Couldn't you use QEMUFileBuffered?

> +static void nonblock_flush_buffer(QEMUFileNonblock *s)
> +{
> +    size_t offset = 0;
> +    ssize_t ret;
> +
> +    while (offset < s->buffer_size) {
> +        ret = write(s->fd, s->buffer + offset, s->buffer_size - offset);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EAGAIN) {
> +                s->freeze_output = true;
> +            } else {
> +                qemu_file_set_error(s->file, errno);
> +            }
> +            break;
> +        }
> +
> +        if (ret == 0) {
> +            DPRINTF("ret == 0\n");
> +            break;
> +        }
> +
> +        offset += ret;
> +    }
> +
> +    if (offset > 0) {
> +        assert(s->buffer_size >= offset);
> +        memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
> +        s->buffer_size -= offset;
> +    }
> +    if (s->buffer_size > 0) {
> +        s->freeze_output = true;
> +    }
> +}
> +
> +static int nonblock_put_buffer(void *opaque,
> +                               const uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileNonblock *s = opaque;
> +    int error;
> +    ssize_t len = 0;
> +
> +    error = qemu_file_get_error(s->file);
> +    if (error) {
> +        return error;
> +    }
> +
> +    nonblock_flush_buffer(s);
> +    error = qemu_file_get_error(s->file);
> +    if (error) {
> +        return error;
> +    }
> +
> +    while (!s->freeze_output && size > 0) {
> +        ssize_t ret;
> +        assert(s->buffer_size == 0);
> +
> +        ret = write(s->fd, buf, size);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EAGAIN) {
> +                s->freeze_output = true;
> +            } else {
> +                qemu_file_set_error(s->file, errno);
> +            }
> +            break;
> +        }
> +
> +        len += ret;
> +        buf += ret;
> +        size -= ret;
> +    }
> +
> +    if (size > 0) {
> +        int inc = size - (s->buffer_capacity - s->buffer_size);
> +        if (inc > 0) {
> +            s->buffer_capacity +=
> +                DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC;
> +            s->buffer = g_realloc(s->buffer, s->buffer_capacity);
> +        }
> +        memcpy(s->buffer + s->buffer_size, buf, size);
> +        s->buffer_size += size;
> +
> +        len += size;
> +    }
> +
> +    return len;
> +}
> +
> +static int nonblock_pending_size(QEMUFileNonblock *s)
> +{
> +    return qemu_pending_size(s->file) + s->buffer_size;
> +}
> +
> +static void nonblock_fflush(QEMUFileNonblock *s)
> +{
> +    s->freeze_output = false;
> +    nonblock_flush_buffer(s);
> +    if (!s->freeze_output) {
> +        qemu_fflush(s->file);
> +    }
> +}
> +
> +static void nonblock_wait_for_flush(QEMUFileNonblock *s)
> +{
> +    while (nonblock_pending_size(s) > 0) {
> +        fd_set fds;
> +        FD_ZERO(&fds);
> +        FD_SET(s->fd, &fds);
> +        select(s->fd + 1, NULL, &fds, NULL, NULL);
> +
> +        nonblock_fflush(s);
> +    }
> +}
> +
> +static int nonblock_close(void *opaque)
> +{
> +    QEMUFileNonblock *s = opaque;
> +    nonblock_wait_for_flush(s);
> +    g_free(s->buffer);
> +    g_free(s);
> +    return 0;
> +}
> +
> +static QEMUFileNonblock *qemu_fopen_nonblock(int fd)
> +{
> +    QEMUFileNonblock *s = g_malloc0(sizeof(*s));
> +
> +    s->fd = fd;
> +    fcntl_setfl(fd, O_NONBLOCK);
> +    s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close,
> +                             NULL, NULL, NULL);
> +    return s;
> +}
> +
> +/***************************************************************************
> + * umem daemon on destination <-> qemu on source protocol
> + */
> +
> +#define QEMU_UMEM_REQ_INIT              0x00
> +#define QEMU_UMEM_REQ_ON_DEMAND         0x01
> +#define QEMU_UMEM_REQ_ON_DEMAND_CONT    0x02
> +#define QEMU_UMEM_REQ_BACKGROUND        0x03
> +#define QEMU_UMEM_REQ_BACKGROUND_CONT   0x04
> +#define QEMU_UMEM_REQ_REMOVE            0x05
> +#define QEMU_UMEM_REQ_EOC               0x06
> +
> +struct qemu_umem_req {
> +    int8_t cmd;
> +    uint8_t len;
> +    char *idstr;        /* ON_DEMAND, BACKGROUND, REMOVE */
> +    uint32_t nr;        /* ON_DEMAND, ON_DEMAND_CONT,
> +                           BACKGROUND, BACKGROUND_CONT, REMOVE */
> +
> +    /* in target page size as qemu migration protocol */
> +    uint64_t *pgoffs;   /* ON_DEMAND, ON_DEMAND_CONT,
> +                           BACKGROUND, BACKGROUND_CONT, REMOVE */
> +};
> +
> +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr)
> +{
> +    qemu_put_byte(f, strlen(idstr));
> +    qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr));
> +}
> +
> +static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr,
> +                                              const uint64_t *pgoffs)
> +{
> +    uint32_t i;
> +
> +    qemu_put_be32(f, nr);
> +    for (i = 0; i < nr; i++) {
> +        qemu_put_be64(f, pgoffs[i]);
> +    }
> +}
> +
> +static void postcopy_incoming_send_req_one(QEMUFile *f,
> +                                           const struct qemu_umem_req *req)
> +{
> +    DPRINTF("cmd %d\n", req->cmd);
> +    qemu_put_byte(f, req->cmd);
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +    case QEMU_UMEM_REQ_REMOVE:
> +        postcopy_incoming_send_req_idstr(f, req->idstr);
> +        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +}
> +
> +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024.
> + * So one message size must be <= IO_BUF_SIZE
> + * cmd: 1
> + * id len: 1
> + * id: 256
> + * nr: 2
> + */
> +#define MAX_PAGE_NR     ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t))
> +static void postcopy_incoming_send_req(QEMUFile *f,
> +                                       const struct qemu_umem_req *req)
> +{
> +    uint32_t nr = req->nr;
> +    struct qemu_umem_req tmp = *req;
> +
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        postcopy_incoming_send_req_one(f, &tmp);
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +        tmp.nr = MIN(nr, MAX_PAGE_NR);
> +        postcopy_incoming_send_req_one(f, &tmp);
> +
> +        nr -= tmp.nr;
> +        tmp.pgoffs += tmp.nr;
> +        if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) {
> +            tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> +        }else {
> +            tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT;
> +        }
> +        /* fall through */
> +    case QEMU_UMEM_REQ_REMOVE:
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        while (nr > 0) {
> +            tmp.nr = MIN(nr, MAX_PAGE_NR);
> +            postcopy_incoming_send_req_one(f, &tmp);
> +
> +            nr -= tmp.nr;
> +            tmp.pgoffs += tmp.nr;
> +        }
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +}
> +
> +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f,
> +                                            struct qemu_umem_req *req,
> +                                            size_t *offset)
> +{
> +    int ret;
> +
> +    req->len = qemu_peek_byte(f, *offset);
> +    *offset += 1;
> +    if (req->len == 0) {
> +        return -EAGAIN;
> +    }
> +    req->idstr = g_malloc((int)req->len + 1);
> +    ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset);
> +    *offset += ret;
> +    if (ret != req->len) {
> +        g_free(req->idstr);
> +        req->idstr = NULL;
> +        return -EAGAIN;
> +    }
> +    req->idstr[req->len] = 0;
> +    return 0;
> +}
> +
> +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f,
> +                                             struct qemu_umem_req *req,
> +                                             size_t *offset)
> +{
> +    int ret;
> +    uint32_t be32;
> +    uint32_t i;
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset);
> +    *offset += sizeof(be32);
> +    if (ret != sizeof(be32)) {
> +        return -EAGAIN;
> +    }
> +
> +    req->nr = be32_to_cpu(be32);
> +    req->pgoffs = g_new(uint64_t, req->nr);
> +    for (i = 0; i < req->nr; i++) {
> +        uint64_t be64;
> +        ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset);
> +        *offset += sizeof(be64);
> +        if (ret != sizeof(be64)) {
> +            g_free(req->pgoffs);
> +            req->pgoffs = NULL;
> +            return -EAGAIN;
> +        }
> +        req->pgoffs[i] = be64_to_cpu(be64);
> +    }
> +    return 0;
> +}
> +
> +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req)
> +{
> +    int size;
> +    int ret;
> +    size_t offset = 0;
> +
> +    size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset);
> +    if (size <= 0) {
> +        return -EAGAIN;
> +    }
> +    offset += 1;
> +
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +    case QEMU_UMEM_REQ_REMOVE:
> +        ret = postcopy_outgoing_recv_req_idstr(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +    qemu_file_skip(f, offset);
> +    DPRINTF("cmd %d\n", req->cmd);
> +    return 0;
> +}
> +
> +static void postcopy_outgoing_free_req(struct qemu_umem_req *req)
> +{
> +    g_free(req->idstr);
> +    g_free(req->pgoffs);
> +}
> +
> +/***************************************************************************
> + * outgoing part
> + */
> +
> +#define QEMU_SAVE_LIVE_STAGE_START      0x01    /* = QEMU_VM_SECTION_START */
> +#define QEMU_SAVE_LIVE_STAGE_PART       0x02    /* = QEMU_VM_SECTION_PART */
> +#define QEMU_SAVE_LIVE_STAGE_END        0x03    /* = QEMU_VM_SECTION_END */
> +
> +enum POState {
> +    PO_STATE_ERROR_RECEIVE,
> +    PO_STATE_ACTIVE,
> +    PO_STATE_EOC_RECEIVED,
> +    PO_STATE_ALL_PAGES_SENT,
> +    PO_STATE_COMPLETED,
> +};
> +typedef enum POState POState;
> +
> +struct PostcopyOutgoingState {
> +    POState state;
> +    QEMUFile *mig_read;
> +    int fd_read;
> +    RAMBlock *last_block_read;
> +
> +    QEMUFile *mig_buffered_write;
> +    MigrationState *ms;
> +
> +    /* For nobg mode. Check if all pages are sent */
> +    RAMBlock *block;
> +    ram_addr_t addr;
> +};
> +typedef struct PostcopyOutgoingState PostcopyOutgoingState;
> +
> +int postcopy_outgoing_create_read_socket(MigrationState *s)
> +{
> +    if (!s->params.postcopy) {
> +        return 0;
> +    }
> +
> +    s->fd_read = dup(s->fd);
> +    if (s->fd_read == -1) {
> +        int ret = -errno;
> +        perror("dup");
> +        return ret;
> +    }
> +    s->file_read = qemu_fopen_socket(s->fd_read);
> +    if (s->file_read == NULL) {
> +        return -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque)
> +{
> +    int ret = 0;
> +    DPRINTF("stage %d\n", stage);
> +    if (stage == QEMU_SAVE_LIVE_STAGE_START) {
> +        sort_ram_list();
> +        ram_save_live_mem_size(f);
> +    }
> +    if (stage == QEMU_SAVE_LIVE_STAGE_PART) {
> +        ret = 1;
> +    }
> +    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +    return ret;
> +}
> +
> +static RAMBlock *postcopy_outgoing_find_block(const char *idstr)
> +{
> +    RAMBlock *block;
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (!strncmp(idstr, block->idstr, strlen(idstr))) {
> +            return block;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +/*
> + * return value
> + *   0: continue postcopy mode
> + * > 0: completed postcopy mode.
> + * < 0: error
> + */
> +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s,
> +                                        const struct qemu_umem_req *req,
> +                                        bool *written)
> +{
> +    int i;
> +    RAMBlock *block;
> +
> +    DPRINTF("cmd %d state %d\n", req->cmd, s->state);
> +    switch(req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_EOC:
> +        /* tell to finish migration. */
> +        if (s->state == PO_STATE_ALL_PAGES_SENT) {
> +            s->state = PO_STATE_COMPLETED;
> +            DPRINTF("-> PO_STATE_COMPLETED\n");
> +        } else {
> +            s->state = PO_STATE_EOC_RECEIVED;
> +            DPRINTF("-> PO_STATE_EOC_RECEIVED\n");
> +        }
> +        return 1;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +        DPRINTF("idstr: %s\n", req->idstr);
> +        block = postcopy_outgoing_find_block(req->idstr);
> +        if (block == NULL) {
> +            return -EINVAL;
> +        }
> +        s->last_block_read = block;
> +        /* fall through */
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        DPRINTF("nr %d\n", req->nr);
> +        for (i = 0; i < req->nr; i++) {
> +            DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]);
> +            int ret = ram_save_page(s->mig_buffered_write, s->last_block_read,
> +                                    req->pgoffs[i] << TARGET_PAGE_BITS);
> +            if (ret > 0) {
> +                *written = true;
> +            }
> +        }
> +        break;
> +    case QEMU_UMEM_REQ_REMOVE:
> +        block = postcopy_outgoing_find_block(req->idstr);
> +        if (block == NULL) {
> +            return -EINVAL;
> +        }
> +        for (i = 0; i < req->nr; i++) {
> +            ram_addr_t addr = block->offset +
> +                (req->pgoffs[i] << TARGET_PAGE_BITS);
> +            cpu_physical_memory_reset_dirty(addr,
> +                                            addr + TARGET_PAGE_SIZE,
> +                                            MIGRATION_DIRTY_FLAG);
> +        }
> +        break;
> +    default:
> +        return -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s)
> +{
> +    if (s->mig_read != NULL) {
> +        qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL);
> +        qemu_fclose(s->mig_read);
> +        s->mig_read = NULL;
> +        fd_close(&s->fd_read);
> +
> +        s->ms->file_read = NULL;
> +        s->ms->fd_read = -1;
> +    }
> +}
> +
> +static void postcopy_outgoing_completed(PostcopyOutgoingState *s)
> +{
> +    postcopy_outgoing_close_mig_read(s);
> +    s->ms->postcopy = NULL;
> +    g_free(s);
> +}
> +
> +static void postcopy_outgoing_recv_handler(void *opaque)
> +{
> +    PostcopyOutgoingState *s = opaque;
> +    bool written = false;
> +    int ret = 0;
> +
> +    assert(s->state == PO_STATE_ACTIVE ||
> +           s->state == PO_STATE_ALL_PAGES_SENT);
> +
> +    do {
> +        struct qemu_umem_req req = {.idstr = NULL,
> +                                    .pgoffs = NULL};
> +
> +        ret = postcopy_outgoing_recv_req(s->mig_read, &req);
> +        if (ret < 0) {
> +            if (ret == -EAGAIN) {
> +                ret = 0;
> +            }
> +            break;
> +        }
> +        if (s->state == PO_STATE_ACTIVE) {
> +            ret = postcopy_outgoing_handle_req(s, &req, &written);
> +        }
> +        postcopy_outgoing_free_req(&req);
> +    } while (ret == 0);
> +
> +    /*
> +     * flush buffered_file.
> +     * Although mig_write is rate-limited buffered file, those written pages
> +     * are requested on demand by the destination. So forcibly push
> +     * those pages ignoring rate limiting
> +     */
> +    if (written) {
> +        qemu_fflush(s->mig_buffered_write);
> +        /* qemu_buffered_file_drain(s->mig_buffered_write); */
> +    }
> +
> +    if (ret < 0) {
> +        switch (s->state) {
> +        case PO_STATE_ACTIVE:
> +            s->state = PO_STATE_ERROR_RECEIVE;
> +            DPRINTF("-> PO_STATE_ERROR_RECEIVE\n");
> +            break;
> +        case PO_STATE_ALL_PAGES_SENT:
> +            s->state = PO_STATE_COMPLETED;
> +            DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n");
> +            break;
> +        default:
> +            abort();
> +        }
> +    }
> +    if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) {
> +        postcopy_outgoing_close_mig_read(s);
> +    }
> +    if (s->state == PO_STATE_COMPLETED) {
> +        DPRINTF("PO_STATE_COMPLETED\n");
> +        MigrationState *ms = s->ms;
> +        postcopy_outgoing_completed(s);
> +        migrate_fd_completed(ms);
> +    }
> +}
> +
> +void *postcopy_outgoing_begin(MigrationState *ms)
> +{
> +    PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1);
> +    DPRINTF("outgoing begin\n");
> +    qemu_fflush(ms->file);
> +
> +    s->ms = ms;
> +    s->state = PO_STATE_ACTIVE;
> +    s->fd_read = ms->fd_read;
> +    s->mig_read = ms->file_read;
> +    s->mig_buffered_write = ms->file;
> +    s->block = NULL;
> +    s->addr = 0;
> +
> +    /* Make sure all dirty bits are set */
> +    ram_save_memory_set_dirty();
> +
> +    qemu_set_fd_handler(s->fd_read,
> +                        &postcopy_outgoing_recv_handler, NULL, s);
> +    return s;
> +}
> +
/*
 * Every RAM page has been pushed (or requested): send an EOS mark to the
 * destination and enter PO_STATE_ALL_PAGES_SENT.  Only legal while
 * PO_STATE_ACTIVE.
 */
static void postcopy_outgoing_ram_all_sent(QEMUFile *f,
                                           PostcopyOutgoingState *s)
{
    assert(s->state == PO_STATE_ACTIVE);

    s->state = PO_STATE_ALL_PAGES_SENT;
    /* tell incoming side that all pages are sent */
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    qemu_buffered_file_drain(f);    /* push it out ignoring rate limits */
    DPRINTF("sent RAM_SAVE_FLAG_EOS\n");
    migrate_fd_cleanup(s->ms);

    /* Later migrate_fd_complete() will be called which calls
     * migrate_fd_cleanup() again. So dummy file is created
     * for qemu monitor to keep working.
     */
    s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL,
                                 NULL, NULL);
}
> +
> +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s,
> +                                                RAMBlock *block,
> +                                                ram_addr_t addr)
> +{
> +    if (block == NULL) {
> +        block = QLIST_FIRST(&ram_list.blocks);
> +        addr = block->offset;
> +    }
> +
> +    for (; block != NULL;
> +         s->block = QLIST_NEXT(s->block, next), addr = block->offset) {
> +        for (; addr < block->offset + block->length;
> +             addr += TARGET_PAGE_SIZE) {
> +            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
> +                s->block = block;
> +                s->addr = addr;
> +                return 0;
> +            }
> +        }
> +    }
> +
> +    return 1;
> +}
> +
/*
 * Background (bulk) RAM push step for postcopy, invoked from the
 * migration iteration path under rate limiting.
 *
 * Returns 1 when migration completed, -1 on receive error, 0 otherwise.
 */
int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
                                          void *postcopy)
{
    PostcopyOutgoingState *s = postcopy;

    assert(s->state == PO_STATE_ACTIVE ||
           s->state == PO_STATE_EOC_RECEIVED ||
           s->state == PO_STATE_ERROR_RECEIVE);

    switch (s->state) {
    case PO_STATE_ACTIVE:
        /* nothing. processed below */
        break;
    case PO_STATE_EOC_RECEIVED:
        /* destination acknowledged end-of-communication: finish up */
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        s->state = PO_STATE_COMPLETED;
        postcopy_outgoing_completed(s);     /* frees s */
        DPRINTF("PO_STATE_COMPLETED\n");
        return 1;
    case PO_STATE_ERROR_RECEIVE:
        postcopy_outgoing_completed(s);     /* frees s */
        DPRINTF("PO_STATE_ERROR_RECEIVE\n");
        return -1;
    default:
        abort();
    }

    if (s->ms->params.nobg) {
        /* "no background" mode: pages are only sent on demand; here we
         * merely detect when everything has been pulled. */
        /* See if all pages are sent. */
        if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) {
            return 0;
        }
        /* ram_list can be reordered. (it doesn't seem so during migration,
           though) So the whole list needs to be checked again */
        if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) {
            return 0;
        }

        postcopy_outgoing_ram_all_sent(f, s);
        return 0;
    }

    DPRINTF("outgoing background state: %d\n", s->state);

    /* push pages until the rate limiter kicks in */
    while (qemu_file_rate_limit(f) == 0) {
        if (ram_save_block(f) == 0) { /* no more blocks */
            assert(s->state == PO_STATE_ACTIVE);
            postcopy_outgoing_ram_all_sent(f, s);
            return 0;
        }
    }

    return 0;
}
> +
> +/***************************************************************************
> + * incoming part
> + */
> +
> +/* flags for incoming mode to modify the behavior.
> +   This is for benchmark/debug purpose */
> +#define INCOMING_FLAGS_FAULT_REQUEST 0x01
> +
> +
> +static void postcopy_incoming_umemd(void);
> +
> +#define PIS_STATE_QUIT_RECEIVED         0x01
> +#define PIS_STATE_QUIT_QUEUED           0x02
> +#define PIS_STATE_QUIT_SENT             0x04
> +
> +#define PIS_STATE_QUIT_MASK             (PIS_STATE_QUIT_RECEIVED | \
> +                                         PIS_STATE_QUIT_QUEUED | \
> +                                         PIS_STATE_QUIT_SENT)
> +
/*
 * State owned by the destination qemu process proper (as opposed to the
 * forked umem daemon).  After fork() the two halves communicate only
 * over the to_umemd/from_umemd pipes.
 */
struct PostcopyIncomingState {
    /* dest qemu state */
    uint32_t    state;          /* PIS_STATE_* bit flags */

    UMemDev *dev;               /* umem character-device handle */
    int host_page_size;         /* getpagesize() result */
    int host_page_shift;        /* log2(host_page_size) */

    /* qemu side */
    int to_umemd_fd;            /* pipe write end: qemu -> umem daemon */
    QEMUFileNonblock *to_umemd;
#define MAX_FAULTED_PAGES       256
    struct umem_pages *faulted_pages;   /* batch of faulted page offsets */

    int from_umemd_fd;          /* pipe read end: umem daemon -> qemu */
    QEMUFile *from_umemd;
    int version_id;     /* save/load format version id */
};
typedef struct PostcopyIncomingState PostcopyIncomingState;
> +
> +
> +#define UMEM_STATE_EOS_RECEIVED         0x01    /* umem daemon <-> src qemu */
> +#define UMEM_STATE_EOC_SENT             0x02    /* umem daemon <-> src qemu */
> +#define UMEM_STATE_QUIT_RECEIVED        0x04    /* umem daemon <-> dst qemu */
> +#define UMEM_STATE_QUIT_QUEUED          0x08    /* umem daemon <-> dst qemu */
> +#define UMEM_STATE_QUIT_SENT            0x10    /* umem daemon <-> dst qemu */
> +
> +#define UMEM_STATE_QUIT_MASK            (UMEM_STATE_QUIT_QUEUED | \
> +                                         UMEM_STATE_QUIT_SENT | \
> +                                         UMEM_STATE_QUIT_RECEIVED)
> +#define UMEM_STATE_END_MASK             (UMEM_STATE_EOS_RECEIVED | \
> +                                         UMEM_STATE_EOC_SENT | \
> +                                         UMEM_STATE_QUIT_MASK)
> +
/*
 * State owned by the forked umem daemon process.  It reads pages from
 * the source (mig_read), sends page requests back (mig_write), and talks
 * to the destination qemu over a pipe pair (to_qemu/from_qemu).
 */
struct PostcopyIncomingUMemDaemon {
    /* umem daemon side */
    uint32_t state;             /* UMEM_STATE_* bit flags */

    int host_page_size;         /* getpagesize() of this host */
    int host_page_shift;        /* log2(host_page_size) */
    int nr_host_pages_per_target_page;  /* when TARGET_PAGE >= host page */
    int host_to_target_page_shift;
    int nr_target_pages_per_host_page;  /* when host page > TARGET_PAGE */
    int target_to_host_page_shift;
    int version_id;     /* save/load format version id */

    int to_qemu_fd;             /* pipe: daemon -> dest qemu */
    QEMUFileNonblock *to_qemu;
    int from_qemu_fd;           /* pipe: dest qemu -> daemon */
    QEMUFile *from_qemu;

    int mig_read_fd;
    QEMUFile *mig_read;         /* qemu on source -> umem daemon */

    int mig_write_fd;
    QEMUFileNonblock *mig_write;        /* umem daemon -> qemu on source */

    /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */
#define MAX_REQUESTS    (512 * (64 + 1))

    struct umem_page_request page_request;  /* scratch: faults from umem */
    struct umem_page_cached page_cached;    /* scratch: pages made resident */

#define MAX_PRESENT_REQUESTS    MAX_FAULTED_PAGES
    struct umem_pages *present_request;     /* batch toward dest qemu */

    uint64_t *target_pgoffs;    /* scratch buffer for request offsets */

    /* bitmap indexed by target page offset */
    unsigned long *phys_requested;

    /* bitmap indexed by target page offset */
    unsigned long *phys_received;

    RAMBlock *last_block_read;  /* qemu on source -> umem daemon */
    RAMBlock *last_block_write; /* umem daemon -> qemu on source */
};
typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon;
> +
/* Singleton: destination-qemu side of the incoming postcopy machinery. */
static PostcopyIncomingState state = {
    .state = 0,
    .dev = NULL,
    .to_umemd_fd = -1,
    .to_umemd = NULL,
    .from_umemd_fd = -1,
    .from_umemd = NULL,
};
> +
/* Singleton: umem-daemon side; remaining fields are filled in by the
 * child after postcopy_incoming_fork_umemd(). */
static PostcopyIncomingUMemDaemon umemd = {
    .state = 0,
    .to_qemu_fd = -1,
    .to_qemu = NULL,
    .from_qemu_fd = -1,
    .from_qemu = NULL,
    .mig_read_fd = -1,
    .mig_read = NULL,
    .mig_write_fd = -1,
    .mig_write = NULL,
};
> +
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> +{
> +    /* incoming_postcopy makes sense only when incoming migration mode */
> +    if (!incoming && incoming_postcopy) {
> +        return -EINVAL;
> +    }
> +
> +    if (!incoming_postcopy) {
> +        return 0;
> +    }
> +
> +    state.state = 0;
> +    state.dev = umem_dev_new();
> +    state.host_page_size = getpagesize();
> +    state.host_page_shift = ffs(state.host_page_size) - 1;
> +    state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
> +                                               ram_save_live() */
> +    return 0;
> +}
> +
> +void postcopy_incoming_ram_alloc(const char *name,
> +                                 size_t size, uint8_t **hostp, UMem **umemp)
> +{
> +    UMem *umem;
> +    size = ALIGN_UP(size, state.host_page_size);
> +    umem = umem_dev_create(state.dev, size, name);
> +
> +    *umemp = umem;
> +    *hostp = umem->umem;
> +}
> +
/* Release a umem-backed RAM region: unmap it, close its fds, free the
 * UMem object.  Order matters: unmap before close/destroy. */
void postcopy_incoming_ram_free(UMem *umem)
{
    umem_unmap(umem);
    umem_close(umem);
    umem_destroy(umem);
}
> +
> +void postcopy_incoming_prepare(void)
> +{
> +    RAMBlock *block;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (block->umem != NULL) {
> +            umem_mmap(block->umem);
> +        }
> +    }
> +}
> +
> +static int postcopy_incoming_ram_load_get64(QEMUFile *f,
> +                                             ram_addr_t *addr, int *flags)
> +{
> +    *addr = qemu_get_be64(f);
> +    *flags = *addr & ~TARGET_PAGE_MASK;
> +    *addr &= TARGET_PAGE_MASK;
> +    return qemu_file_get_error(f);
> +}
> +
/*
 * Load the (deliberately tiny) postcopy RAM section: either a bare EOS,
 * or MEM_SIZE + RAM block list followed by EOS.  Actual page contents
 * are NOT read here — they arrive later, on demand, via the umem daemon.
 * Returns 0 on success, negative errno on malformed input.
 */
int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
{
    ram_addr_t addr;
    int flags;
    int error;

    DPRINTF("incoming ram load\n");
    /*
     * RAM_SAVE_FLAGS_EOS or
     * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
     * see postcopy_outgoing_ram_save_live()
     */

    if (version_id != RAM_SAVE_VERSION_ID) {
        DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
                version_id, RAM_SAVE_VERSION_ID);
        return -EINVAL;
    }
    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
    /* NOTE(review): %lx assumes ram_addr_t is unsigned long — confirm on
     * builds where ram_addr_t is 64-bit but long is 32-bit */
    DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
    if (error) {
        DPRINTF("error %d\n", error);
        return error;
    }
    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
        /* bare EOS: nothing further in this section */
        DPRINTF("EOS\n");
        return 0;
    }

    if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
        DPRINTF("-EINVAL flags 0x%x\n", flags);
        return -EINVAL;
    }
    error = ram_load_mem_size(f, addr);     /* addr holds total mem size */
    if (error) {
        DPRINTF("addr 0x%lx error %d\n", addr, error);
        return error;
    }

    /* the section must terminate with EOS */
    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
    if (error) {
        DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
        return error;
    }
    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
        DPRINTF("done\n");
        return 0;
    }
    DPRINTF("-EINVAL\n");
    return -EINVAL;
}
> +
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> +{
> +    int fds[2];
> +    RAMBlock *block;
> +
> +    DPRINTF("fork\n");
> +
> +    /* socketpair(AF_UNIX)? */
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    state.from_umemd_fd = fds[0];
> +    umemd.to_qemu_fd = fds[1];
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    umemd.from_qemu_fd = fds[0];
> +    state.to_umemd_fd = fds[1];
> +
> +    pid_t child = fork();
> +    if (child < 0) {
> +        perror("fork");
> +        abort();
> +    }
> +
> +    if (child == 0) {
> +        int mig_write_fd;
> +
> +        fd_close(&state.to_umemd_fd);
> +        fd_close(&state.from_umemd_fd);
> +        umemd.host_page_size = state.host_page_size;
> +        umemd.host_page_shift = state.host_page_shift;
> +
> +        umemd.nr_host_pages_per_target_page =
> +            TARGET_PAGE_SIZE / umemd.host_page_size;
> +        umemd.nr_target_pages_per_host_page =
> +            umemd.host_page_size / TARGET_PAGE_SIZE;
> +
> +        umemd.target_to_host_page_shift =
> +            ffs(umemd.nr_host_pages_per_target_page) - 1;
> +        umemd.host_to_target_page_shift =
> +            ffs(umemd.nr_target_pages_per_host_page) - 1;
> +
> +        umemd.state = 0;
> +        umemd.version_id = state.version_id;
> +        umemd.mig_read_fd = mig_read_fd;
> +        umemd.mig_read = mig_read;
> +
> +        mig_write_fd = dup(mig_read_fd);
> +        if (mig_write_fd < 0) {
> +            perror("could not dup for writable socket \n");
> +            abort();
> +        }
> +        umemd.mig_write_fd = mig_write_fd;
> +        umemd.mig_write = qemu_fopen_nonblock(mig_write_fd);
> +
> +        postcopy_incoming_umemd(); /* noreturn */
> +    }
> +
> +    DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child);
> +    fd_close(&umemd.to_qemu_fd);
> +    fd_close(&umemd.from_qemu_fd);
> +    state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES));
> +    state.faulted_pages->nr = 0;
> +
> +    /* close all UMem.shmem_fd */
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        umem_close_shmem(block->umem);
> +    }
> +    umem_qemu_wait_for_daemon(state.from_umemd_fd);
> +}
> +
> +static void postcopy_incoming_qemu_recv_quit(void)
> +{
> +    RAMBlock *block;
> +    if (state.state & PIS_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (block->umem != NULL) {
> +            umem_destroy(block->umem);
> +            block->umem = NULL;
> +            block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> +        }
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
> +    state.state |= PIS_STATE_QUIT_RECEIVED;
> +    qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
> +    qemu_fclose(state.from_umemd);
> +    state.from_umemd = NULL;
> +    fd_close(&state.from_umemd_fd);
> +}
> +
/*
 * Write-ready handler: drain buffered data on the qemu -> umemd channel.
 * Unregisters itself once the buffer is empty; if a QUIT was queued,
 * the flushed channel is then closed for good and the fault batch freed.
 */
static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque)
{
    assert(state.to_umemd != NULL);

    nonblock_fflush(state.to_umemd);
    if (nonblock_pending_size(state.to_umemd) > 0) {
        /* still data queued: wait for the next write-ready event */
        return;
    }

    qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL);
    if (state.state & PIS_STATE_QUIT_QUEUED) {
        DPRINTF("|= PIS_STATE_QUIT_SENT\n");
        state.state |= PIS_STATE_QUIT_SENT;
        qemu_fclose(state.to_umemd->file);
        state.to_umemd = NULL;
        fd_close(&state.to_umemd_fd);
        g_free(state.faulted_pages);
        state.faulted_pages = NULL;
    }
}
> +
/* Kick a flush of the qemu -> umemd channel: register the write-ready
 * handler and attempt one flush immediately. */
static void postcopy_incoming_qemu_fflush_to_umemd(void)
{
    qemu_set_fd_handler(state.to_umemd->fd, NULL,
                        postcopy_incoming_qemu_fflush_to_umemd_handler, NULL);
    postcopy_incoming_qemu_fflush_to_umemd_handler(NULL);
}
> +
> +static void postcopy_incoming_qemu_queue_quit(void)
> +{
> +    if (state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
> +    umem_qemu_quit(state.to_umemd->file);
> +    state.state |= PIS_STATE_QUIT_QUEUED;
> +}
> +
> +static void postcopy_incoming_qemu_send_pages_present(void)
> +{
> +    if (state.faulted_pages->nr > 0) {
> +        umem_qemu_send_pages_present(state.to_umemd->file,
> +                                     state.faulted_pages);
> +        state.faulted_pages->nr = 0;
> +    }
> +}
> +
/*
 * Append faulted page offsets to the batch destined for the umem daemon.
 * Since pages->nr <= MAX_FAULTED_PAGES, a single flush always frees
 * enough room for the whole chunk.
 */
static void postcopy_incoming_qemu_faulted_pages(
    const struct umem_pages *pages)
{
    assert(pages->nr <= MAX_FAULTED_PAGES);
    assert(state.faulted_pages != NULL);

    if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) {
        /* would overflow the batch buffer: flush what we have first */
        postcopy_incoming_qemu_send_pages_present();
    }
    memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr],
           &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr);
    state.faulted_pages->nr += pages->nr;
}
> +
> +static void postcopy_incoming_qemu_cleanup_umem(void);
> +
> +static int postcopy_incoming_qemu_handle_req_one(void)
> +{
> +    int offset = 0;
> +    int ret;
> +    uint8_t cmd;
> +
> +    ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset);
> +    offset += sizeof(cmd);
> +    if (ret != sizeof(cmd)) {
> +        return -EAGAIN;
> +    }
> +    DPRINTF("cmd %c\n", cmd);
> +
> +    switch (cmd) {
> +    case UMEM_DAEMON_QUIT:
> +        postcopy_incoming_qemu_recv_quit();
> +        postcopy_incoming_qemu_queue_quit();
> +        postcopy_incoming_qemu_cleanup_umem();
> +        break;
> +    case UMEM_DAEMON_TRIGGER_PAGE_FAULT: {
> +        struct umem_pages *pages =
> +            umem_qemu_trigger_page_fault(state.from_umemd, &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_qemu_faulted_pages(pages);
> +            g_free(pages);
> +        }
> +        break;
> +    }
> +    case UMEM_DAEMON_ERROR:
> +        /* umem daemon hit troubles, so it warned us to stop vm execution */
> +        vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +
> +    if (state.from_umemd != NULL) {
> +        qemu_file_skip(state.from_umemd, offset);
> +    }
> +    return 0;
> +}
> +
> +static void postcopy_incoming_qemu_handle_req(void *opaque)
> +{
> +    do {
> +        int ret = postcopy_incoming_qemu_handle_req_one();
> +        if (ret == -EAGAIN) {
> +            break;
> +        }
> +    } while (state.from_umemd != NULL &&
> +             qemu_pending_size(state.from_umemd) > 0);
> +
> +    if (state.to_umemd != NULL) {
> +        if (state.faulted_pages->nr > 0) {
> +            postcopy_incoming_qemu_send_pages_present();
> +        }
> +        postcopy_incoming_qemu_fflush_to_umemd();
> +    }
> +}
> +
/*
 * Called once the device/CPU state is fully loaded: tell the umem daemon
 * it may start serving page faults, and wire up both pipe directions on
 * the qemu side.
 */
void postcopy_incoming_qemu_ready(void)
{
    umem_qemu_ready(state.to_umemd_fd);

    state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd);
    state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd);
    qemu_set_fd_handler(state.from_umemd_fd,
                        postcopy_incoming_qemu_handle_req, NULL, NULL);
}
> +
/* Ask the daemon to quit (if the channel is still up) and destroy the
 * umem device.  Safe to call on a partially set-up state. */
static void postcopy_incoming_qemu_cleanup_umem(void)
{
    /* when qemu will quit before completing postcopy, tell umem daemon
       to tear down umem device and exit. */
    if (state.to_umemd_fd >= 0) {
        postcopy_incoming_qemu_queue_quit();
        postcopy_incoming_qemu_fflush_to_umemd();
    }

    if (state.dev) {
        umem_dev_destroy(state.dev);
        state.dev = NULL;
    }
}
> +
/* Incoming-side shutdown: queue the quit and synchronously flush
 * anything still buffered toward the umem daemon. */
void postcopy_incoming_qemu_cleanup(void)
{
    postcopy_incoming_qemu_cleanup_umem();
    if (state.to_umemd != NULL) {
        nonblock_wait_for_flush(state.to_umemd);
    }
}
> +
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
> +{
> +    uint64_t nr = DIV_ROUND_UP(size, state.host_page_size);
> +    size_t len = umem_pages_size(nr);
> +    ram_addr_t end = addr + size;
> +    struct umem_pages *pages;
> +    int i;
> +
> +    if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    pages = g_malloc(len);
> +    pages->nr = nr;
> +    for (i = 0; addr < end; addr += state.host_page_size, i++) {
> +        pages->pgoffs[i] = addr >> state.host_page_shift;
> +    }
> +    umem_qemu_send_pages_unmapped(state.to_umemd->file, pages);
> +    g_free(pages);
> +    assert(state.to_umemd != NULL);
> +    postcopy_incoming_qemu_fflush_to_umemd();
> +}
> +
> +/**************************************************************************
> + * incoming umem daemon
> + */
> +
> +static void postcopy_incoming_umem_recv_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
> +    umemd.state |= UMEM_STATE_QUIT_RECEIVED;
> +    qemu_fclose(umemd.from_qemu);
> +    umemd.from_qemu = NULL;
> +    fd_close(&umemd.from_qemu_fd);
> +}
> +
> +static void postcopy_incoming_umem_queue_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
> +    umem_daemon_quit(umemd.to_qemu->file);
> +    umemd.state |= UMEM_STATE_QUIT_QUEUED;
> +}
> +
/*
 * Tell the source qemu that communication is over (QEMU_UMEM_REQ_EOC)
 * and close the daemon -> source write channel.  Idempotent via
 * UMEM_STATE_EOC_SENT.
 */
static void postcopy_incoming_umem_send_eoc_req(void)
{
    struct qemu_umem_req req;

    if (umemd.state & UMEM_STATE_EOC_SENT) {
        return;
    }

    DPRINTF("|= UMEM_STATE_EOC_SENT\n");
    req.cmd = QEMU_UMEM_REQ_EOC;
    postcopy_incoming_send_req(umemd.mig_write->file, &req);
    umemd.state |= UMEM_STATE_EOC_SENT;
    qemu_fclose(umemd.mig_write->file);
    umemd.mig_write = NULL;
    fd_close(&umemd.mig_write_fd);
}
> +
/*
 * Drain pending fault requests from 'block's umem device and forward
 * them to the source as an ON_DEMAND request, translating host-page
 * offsets into target-page offsets.  The phys_requested bitmap dedups
 * pages that were already asked for.
 */
static void postcopy_incoming_umem_send_page_req(RAMBlock *block)
{
    struct qemu_umem_req req;
    int bit;
    uint64_t target_pgoff;
    int i;

    umemd.page_request.nr = MAX_REQUESTS;
    umem_get_page_request(block->umem, &umemd.page_request);
    /* NOTE(review): pgoffs[0]/[1] may be unset slots when nr < 2 —
     * debug output only, but reads indeterminate memory */
    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
            block->idstr, umemd.page_request.nr,
            (uint64_t)umemd.page_request.pgoffs[0],
            (uint64_t)umemd.page_request.pgoffs[1]);

    /* CONT avoids resending the idstr when the block didn't change */
    if (umemd.last_block_write != block) {
        req.cmd = QEMU_UMEM_REQ_ON_DEMAND;
        req.idstr = block->idstr;
    } else {
        req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
    }

    req.nr = 0;
    req.pgoffs = umemd.target_pgoffs;
    if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
        /* target page covers >= 1 host page: one request per target page */
        for (i = 0; i < umemd.page_request.nr; i++) {
            target_pgoff =
                umemd.page_request.pgoffs[i] >> umemd.host_to_target_page_shift;
            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;

            if (!test_and_set_bit(bit, umemd.phys_requested)) {
                req.pgoffs[req.nr] = target_pgoff;
                req.nr++;
            }
        }
    } else {
        /* host page covers several target pages: request each of them */
        for (i = 0; i < umemd.page_request.nr; i++) {
            int j;
            target_pgoff =
                umemd.page_request.pgoffs[i] << umemd.host_to_target_page_shift;
            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;

            for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
                if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
                    req.pgoffs[req.nr] = target_pgoff + j;
                    req.nr++;
                }
            }
        }
    }

    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
            block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
    if (req.nr > 0 && umemd.mig_write != NULL) {
        postcopy_incoming_send_req(umemd.mig_write->file, &req);
        umemd.last_block_write = block;
    }
}
> +
> +static void postcopy_incoming_umem_send_pages_present(void)
> +{
> +    if (umemd.present_request->nr > 0) {
> +        umem_daemon_send_pages_present(umemd.to_qemu->file,
> +                                       umemd.present_request);
> +        umemd.present_request->nr = 0;
> +    }
> +}
> +
> +static void postcopy_incoming_umem_pages_present_one(
> +    uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t i;
> +    assert(nr <= MAX_PRESENT_REQUESTS);
> +
> +    if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) {
> +        postcopy_incoming_umem_send_pages_present();
> +    }
> +
> +    for (i = 0; i < nr; i++) {
> +        umemd.present_request->pgoffs[umemd.present_request->nr + i] =
> +            pgoffs[i] + ramblock_pgoffset;
> +    }
> +    umemd.present_request->nr += nr;
> +}
> +
> +static void postcopy_incoming_umem_pages_present(
> +    const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t left = page_cached->nr;
> +    uint32_t offset = 0;
> +
> +    while (left > 0) {
> +        uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS);
> +        postcopy_incoming_umem_pages_present_one(
> +            nr, &page_cached->pgoffs[offset], ramblock_pgoffset);
> +
> +        left -= nr;
> +        offset += nr;
> +    }
> +}
> +
/*
 * Read one page record from the migration stream into umem-backed RAM.
 * On EOS, acknowledge with EOC and close the stream.  Newly arrived
 * pages are marked cached in the umem device (waking faulting vcpus)
 * and, optionally, reported to qemu as present.
 * Returns 0 on success, negative errno on failure.
 */
static int postcopy_incoming_umem_ram_load(void)
{
    ram_addr_t offset;
    int flags;
    int error;
    void *shmem;
    int i;
    int bit;

    if (umemd.version_id != RAM_SAVE_VERSION_ID) {
        return -EINVAL;
    }

    offset = qemu_get_be64(umemd.mig_read);

    flags = offset & ~TARGET_PAGE_MASK;
    offset &= TARGET_PAGE_MASK;

    /* the block list was already consumed by postcopy_incoming_ram_load() */
    assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));

    if (flags & RAM_SAVE_FLAG_EOS) {
        /* source finished pushing pages: acknowledge and close stream */
        DPRINTF("RAM_SAVE_FLAG_EOS\n");
        postcopy_incoming_umem_send_eoc_req();

        qemu_fclose(umemd.mig_read);
        umemd.mig_read = NULL;
        fd_close(&umemd.mig_read_fd);
        umemd.state |= UMEM_STATE_EOS_RECEIVED;

        postcopy_incoming_umem_queue_quit();
        DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
        return 0;
    }

    shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags,
                                             &umemd.last_block_read);
    if (!shmem) {
        DPRINTF("shmem == NULL\n");
        return -EINVAL;
    }

    if (flags & RAM_SAVE_FLAG_COMPRESS) {
        /* page is a single repeated byte */
        uint8_t ch = qemu_get_byte(umemd.mig_read);
        memset(shmem, ch, TARGET_PAGE_SIZE);
    } else if (flags & RAM_SAVE_FLAG_PAGE) {
        qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE);
    }

    error = qemu_file_get_error(umemd.mig_read);
    if (error) {
        DPRINTF("error %d\n", error);
        return error;
    }

    umemd.page_cached.nr = 0;
    bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS;
    if (!test_and_set_bit(bit, umemd.phys_received)) {
        if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
            /* one target page covers one or more whole host pages */
            __u64 pgoff = offset >> umemd.host_page_shift;
            for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
                umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i;
                umemd.page_cached.nr++;
            }
        } else {
            /* a host page is only complete once ALL of its target pages
             * have been received */
            bool mark_cache = true;
            for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
                if (!test_bit(bit + i, umemd.phys_received)) {
                    mark_cache = false;
                    break;
                }
            }
            if (mark_cache) {
                umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift;
                umemd.page_cached.nr = 1;
            }
        }
    }

    if (umemd.page_cached.nr > 0) {
        umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached);

        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 &&
            (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) {
            uint64_t ramblock_pgoffset;

            ramblock_pgoffset =
                umemd.last_block_read->offset >> umemd.host_page_shift;
            postcopy_incoming_umem_pages_present(&umemd.page_cached,
                                                 ramblock_pgoffset);
        }
    }

    return 0;
}
> +
> +static bool postcopy_incoming_umem_check_umem_done(void)
> +{
> +    bool all_done = true;
> +    RAMBlock *block;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        if (umem != NULL && umem->nsets == umem->nbits) {
> +            umem_unmap_shmem(umem);
> +            umem_destroy(umem);
> +            block->umem = NULL;
> +        }
> +        if (block->umem != NULL) {
> +            all_done = false;
> +        }
> +    }
> +    return all_done;
> +}
> +
> +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages)
> +{
> +    int i;
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift;
> +        RAMBlock *block = qemu_get_ram_block(addr);
> +        addr -= block->offset;
> +        umem_remove_shmem(block->umem, addr, umemd.host_page_size);
> +    }
> +    return postcopy_incoming_umem_check_umem_done();
> +}
> +
/*
 * qemu reported these host pages as unmapped (e.g. ballooned out): mark
 * them received/requested so they are never fetched, release their shmem
 * backing, and tell the source to drop their dirty bits (REQ_REMOVE).
 * Returns true when every block is fully populated.
 *
 * NOTE(review): the RAMBlock is looked up from pgoffs[0] only — this
 * assumes all pages in one message belong to the same block; confirm
 * with the sender.
 * NOTE(review): req.pgoffs aliases pages->pgoffs with const cast away
 * and is overwritten in place below.
 */
static bool
postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages)
{
    RAMBlock *block;
    ram_addr_t addr;
    int i;

    struct qemu_umem_req req = {
        .cmd = QEMU_UMEM_REQ_REMOVE,
        .nr = 0,
        .pgoffs = (uint64_t*)pages->pgoffs,
    };

    addr = pages->pgoffs[0] << umemd.host_page_shift;
    block = qemu_get_ram_block(addr);

    for (i = 0; i < pages->nr; i++)  {
        int pgoff;

        addr = pages->pgoffs[i] << umemd.host_page_shift;
        pgoff = addr >> TARGET_PAGE_BITS;
        /* only ask the source to drop pages we neither received
         * nor requested yet */
        if (!test_bit(pgoff, umemd.phys_received) &&
            !test_bit(pgoff, umemd.phys_requested)) {
            req.pgoffs[req.nr] = pgoff;
            req.nr++;
        }
        set_bit(pgoff, umemd.phys_received);
        set_bit(pgoff, umemd.phys_requested);

        umem_remove_shmem(block->umem,
                          addr - block->offset, umemd.host_page_size);
    }
    if (req.nr > 0 && umemd.mig_write != NULL) {
        req.idstr = block->idstr;
        postcopy_incoming_send_req(umemd.mig_write->file, &req);
    }

    return postcopy_incoming_umem_check_umem_done();
}
> +
/* All pages are resident: wind down both the connection to the source
 * and the daemon <-> qemu channel. */
static void postcopy_incoming_umem_done(void)
{
    postcopy_incoming_umem_send_eoc_req();
    postcopy_incoming_umem_queue_quit();
}
> +
/*
 * Handle one command sent by the destination qemu over the pipe.
 * Returns 0 when a full command was consumed, -EAGAIN when more bytes
 * are needed (peek + explicit skip keeps partial input unconsumed).
 */
static int postcopy_incoming_umem_handle_qemu(void)
{
    int ret;
    int offset = 0;
    uint8_t cmd;

    ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset);
    offset += sizeof(cmd);
    if (ret != sizeof(cmd)) {
        return -EAGAIN;
    }
    DPRINTF("cmd %c\n", cmd);
    switch (cmd) {
    case UMEM_QEMU_QUIT:
        postcopy_incoming_umem_recv_quit();
        postcopy_incoming_umem_done();
        break;
    case UMEM_QEMU_PAGE_FAULTED: {
        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
                                                   &offset);
        if (pages == NULL) {
            return -EAGAIN;
        }
        if (postcopy_incoming_umem_page_faulted(pages)){
            postcopy_incoming_umem_done();
        }
        g_free(pages);
        break;
    }
    case UMEM_QEMU_PAGE_UNMAPPED: {
        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
                                                   &offset);
        if (pages == NULL) {
            return -EAGAIN;
        }
        if (postcopy_incoming_umem_page_unmapped(pages)){
            postcopy_incoming_umem_done();
        }
        g_free(pages);
        break;
    }
    default:
        abort();
        break;
    }
    /* the QUIT path may have closed from_qemu */
    if (umemd.from_qemu != NULL) {
        qemu_file_skip(umemd.from_qemu, offset);
    }
    return 0;
}
> +
> +static void set_fd(int fd, fd_set *fds, int *nfds)
> +{
> +    FD_SET(fd, fds);
> +    if (fd > *nfds) {
> +        *nfds = fd;
> +    }
> +}
> +
> +static int postcopy_incoming_umemd_main_loop(void)
> +{
> +    fd_set writefds;
> +    fd_set readfds;
> +    int nfds;
> +    RAMBlock *block;
> +    int ret;
> +
> +    int pending_size;
> +    bool get_page_request;
> +
> +    nfds = -1;
> +    FD_ZERO(&writefds);
> +    FD_ZERO(&readfds);
> +
> +    if (umemd.mig_write != NULL) {
> +        pending_size = nonblock_pending_size(umemd.mig_write);
> +        if (pending_size > 0) {
> +            set_fd(umemd.mig_write_fd, &writefds, &nfds);
> +        }
> +    } else {
> +        pending_size = 0;
> +    }
> +
> +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2)
> +    /* If page request to the migration source is accumulated,
> +       suspend getting page fault request. */
> +    get_page_request = (pending_size <= PENDING_SIZE_MAX);
> +
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL) {
> +                set_fd(block->umem->fd, &readfds, &nfds);
> +            }
> +        }
> +    }
> +
> +    if (umemd.mig_read_fd >= 0) {
> +        set_fd(umemd.mig_read_fd, &readfds, &nfds);
> +    }
> +
> +    if (umemd.to_qemu != NULL &&
> +        nonblock_pending_size(umemd.to_qemu) > 0) {
> +        set_fd(umemd.to_qemu_fd, &writefds, &nfds);
> +    }
> +    if (umemd.from_qemu_fd >= 0) {
> +        set_fd(umemd.from_qemu_fd, &readfds, &nfds);
> +    }
> +
> +    ret = select(nfds + 1, &readfds, &writefds, NULL, NULL);
> +    if (ret == -1) {
> +        if (errno == EINTR) {
> +            return 0;
> +        }
> +        return ret;
> +    }
> +
> +    if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) {
> +        nonblock_fflush(umemd.to_qemu);
> +    }
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) {
> +                postcopy_incoming_umem_send_page_req(block);
> +            }
> +        }
> +    }
> +    if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_ram_load();
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        } while (umemd.mig_read != NULL &&
> +                 qemu_pending_size(umemd.mig_read) > 0);
> +    }
> +    if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_handle_qemu();
> +            if (ret == -EAGAIN) {
> +                break;
> +            }
> +        } while (umemd.from_qemu != NULL &&
> +                 qemu_pending_size(umemd.from_qemu) > 0);
> +    }
> +
> +    if (umemd.mig_write != NULL) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu != NULL) {
> +        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_umem_send_pages_present();
> +        }
> +        nonblock_fflush(umemd.to_qemu);
> +        if ((umemd.state & UMEM_STATE_QUIT_QUEUED) &&
> +            nonblock_pending_size(umemd.to_qemu) == 0) {
> +            DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
> +            qemu_fclose(umemd.to_qemu->file);
> +            umemd.to_qemu = NULL;
> +            fd_close(&umemd.to_qemu_fd);
> +            umemd.state |= UMEM_STATE_QUIT_SENT;
> +        }
> +    }
> +
> +    return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK;
> +}
> +
> +static void postcopy_incoming_umemd(void)
> +{
> +    ram_addr_t last_ram_offset;
> +    int nbits;
> +    RAMBlock *block;
> +    int ret;
> +
> +    qemu_daemon(1, 1);
> +    signal(SIGPIPE, SIG_IGN);
> +    DPRINTF("daemon pid: %d\n", getpid());
> +
> +    umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS);
> +    umemd.page_cached.pgoffs =
> +        g_new(__u64, MAX_REQUESTS *
> +              (TARGET_PAGE_SIZE >= umemd.host_page_size ?
> +               1: umemd.nr_host_pages_per_target_page));
> +    umemd.target_pgoffs =
> +        g_new(uint64_t, MAX_REQUESTS *
> +              MAX(umemd.nr_host_pages_per_target_page,
> +                  umemd.nr_target_pages_per_host_page));
> +    umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS));
> +    umemd.present_request->nr = 0;
> +
> +    last_ram_offset = qemu_last_ram_offset();
> +    nbits = last_ram_offset >> TARGET_PAGE_BITS;
> +    umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.last_block_read = NULL;
> +    umemd.last_block_write = NULL;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        umem->umem = NULL;      /* umem mapping area has VM_DONT_COPY flag,
> +                                   so we lost those mappings by fork */
> +        block->host = umem_map_shmem(umem);
> +        umem_close_shmem(umem);
> +    }
> +    umem_daemon_ready(umemd.to_qemu_fd);
> +    umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd);
> +
> +    /* wait for qemu to disown migration_fd */
> +    umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
> +    umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd);
> +
> +    DPRINTF("entering umemd main loop\n");
> +    for (;;) {
> +        ret = postcopy_incoming_umemd_main_loop();
> +        if (ret != 0) {
> +            break;
> +        }
> +    }
> +    DPRINTF("exiting umemd main loop\n");
> +
> +    /* This daemon forked from qemu and the parent qemu is still running.
> +     * Cleanups of linked libraries like SDL should not be triggered,
> +     * otherwise the parent qemu may use resources which was already freed.
> +     */
> +    fflush(stdout);
> +    fflush(stderr);
> +    _exit(ret < 0? EXIT_FAILURE: 0);
> +}
> diff --git a/migration-tcp.c b/migration-tcp.c
> index cf6a9b8..aa35050 100644
> --- a/migration-tcp.c
> +++ b/migration-tcp.c
> @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque)
>      } while (ret == -1 && (socket_error()) == EINTR);
>  
>      if (ret < 0) {
> -        migrate_fd_error(s);
> -        return;
> +        goto error_out;
>      }
>  
>      qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
>  
> -    if (val == 0)
> +    if (val == 0) {
> +        ret = postcopy_outgoing_create_read_socket(s);
> +        if (ret < 0) {
> +            goto error_out;
> +        }
>          migrate_fd_connect(s);
> -    else {
> +    } else {
>          DPRINTF("error connecting %d\n", val);
> -        migrate_fd_error(s);
> +        goto error_out;
>      }
> +    return;
> +
> +error_out:
> +    migrate_fd_error(s);
>  }
>  
>  int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
> @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
>  
>      if (ret < 0) {
>          DPRINTF("connect failed\n");
> -        migrate_fd_error(s);
> -        return ret;
> +        goto error_out;
> +    }
> +
> +    ret = postcopy_outgoing_create_read_socket(s);
> +    if (ret < 0) {
> +        goto error_out;
>      }
>      migrate_fd_connect(s);
>      return 0;
> +
> +error_out:
> +    migrate_fd_error(s);
> +    return ret;
>  }
>  
>  static void tcp_accept_incoming_migration(void *opaque)
> @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque)
>      }
>  
>      process_incoming_migration(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(c, f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        /* now socket is disowned.
> +           So tell umem server that it's safe to use it */
> +        postcopy_incoming_qemu_ready();
> +    }
>  out:
>      close(c);
>  out2:
> diff --git a/migration-unix.c b/migration-unix.c
> index dfcf203..3707505 100644
> --- a/migration-unix.c
> +++ b/migration-unix.c
> @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque)
>  
>      qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
>  
> -    if (val == 0)
> +    if (val == 0) {
> +        ret = postcopy_outgoing_create_read_socket(s);
> +        if (ret < 0) {
> +            goto error_out;
> +        }
>          migrate_fd_connect(s);
> -    else {
> +    } else {
>          DPRINTF("error connecting %d\n", val);
> -        migrate_fd_error(s);
> +        goto error_out;
>      }
> +    return;
> +
> +error_out:
> +    migrate_fd_error(s);
>  }
>  
>  int unix_start_outgoing_migration(MigrationState *s, const char *path)
> @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, const char *path)
>  
>      if (ret < 0) {
>          DPRINTF("connect failed\n");
> -        migrate_fd_error(s);
> -        return ret;
> +        goto error_out;
> +    }
> +
> +    ret = postcopy_outgoing_create_read_socket(s);
> +    if (ret < 0) {
> +        goto error_out;
>      }
>      migrate_fd_connect(s);
>      return 0;
> +
> +error_out:
> +    migrate_fd_error(s);
> +    return ret;
>  }
>  
>  static void unix_accept_incoming_migration(void *opaque)
> @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque)
>      }
>  
>      process_incoming_migration(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(c, f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_ready();
> +    }
>  out:
>      close(c);
>  out2:
> diff --git a/migration.c b/migration.c
> index 0149ab3..51efe44 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -39,6 +39,11 @@ enum {
>      MIG_STATE_COMPLETED,
>  };
>  
> +enum {
> +    MIG_SUBSTATE_PRECOPY,
> +    MIG_SUBSTATE_POSTCOPY,
> +};
> +
>  #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
>  
>  static NotifierList migration_state_notifiers =
> @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void *opaque)
>          return;
>      }
>  
> +    if (s->substate == MIG_SUBSTATE_POSTCOPY) {
> +        /* PRINTF("postcopy background\n"); */
> +        ret = postcopy_outgoing_ram_save_background(s->mon, s->file,
> +                                                    s->postcopy);
> +        if (ret > 0) {
> +            migrate_fd_completed(s);
> +        } else if (ret < 0) {
> +            migrate_fd_error(s);
> +        }
> +        return;
> +    }
> +
>      DPRINTF("iterate\n");
>      ret = qemu_savevm_state_iterate(s->mon, s->file);
>      if (ret < 0) {
> @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque)
>          DPRINTF("done iterating\n");
>          vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
>  
> +        if (s->params.postcopy) {
> +            if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
> +                migrate_fd_error(s);
> +                if (old_vm_running) {
> +                    vm_start();
> +                }
> +                return;
> +            }
> +            s->substate = MIG_SUBSTATE_POSTCOPY;
> +            s->postcopy = postcopy_outgoing_begin(s);
> +            return;
> +        }
> +
>          if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
>              migrate_fd_error(s);
>          } else {
> @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s)
>      int ret;
>  
>      s->state = MIG_STATE_ACTIVE;
> +    s->substate = MIG_SUBSTATE_PRECOPY;
>      s->file = qemu_fopen_ops_buffered(s,
>                                        s->bandwidth_limit,
>                                        migrate_fd_put_buffer,
> diff --git a/migration.h b/migration.h
> index 90ae362..2809e99 100644
> --- a/migration.h
> +++ b/migration.h
> @@ -40,6 +40,12 @@ struct MigrationState
>      int (*write)(MigrationState *s, const void *buff, size_t size);
>      void *opaque;
>      MigrationParams params;
> +
> +    /* for postcopy */
> +    int substate;              /* precopy or postcopy */
> +    int fd_read;
> +    QEMUFile *file_read;        /* connection from the detination */
> +    void *postcopy;
>  };
>  
>  void process_incoming_migration(QEMUFile *f);
> @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void);
>  uint64_t ram_bytes_transferred(void);
>  uint64_t ram_bytes_total(void);
>  
> +void ram_save_set_params(const MigrationParams *params, void *opaque);
>  void sort_ram_list(void);
>  int ram_save_block(QEMUFile *f);
>  void ram_save_memory_set_dirty(void);
> @@ -107,7 +114,30 @@ void migrate_add_blocker(Error *reason);
>   */
>  void migrate_del_blocker(Error *reason);
>  
> +/* For outgoing postcopy */
> +int postcopy_outgoing_create_read_socket(MigrationState *s);
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque);
> +void *postcopy_outgoing_begin(MigrationState *s);
> +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> +                                          void *postcopy);
> +
> +/* For incoming postcopy */
>  extern bool incoming_postcopy;
>  extern unsigned long incoming_postcopy_flags;
>  
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy);
> +void postcopy_incoming_ram_alloc(const char *name,
> +                                 size_t size, uint8_t **hostp, UMem **umemp);
> +void postcopy_incoming_ram_free(UMem *umem);
> +void postcopy_incoming_prepare(void);
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id);
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read);
> +void postcopy_incoming_qemu_ready(void);
> +void postcopy_incoming_qemu_cleanup(void);
> +#ifdef NEED_CPU_H
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size);
> +#endif
> +
>  #endif
> diff --git a/qemu-common.h b/qemu-common.h
> index 725922b..d74a8c9 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState;
>  
>  struct Monitor;
>  typedef struct Monitor Monitor;
> +typedef struct UMem UMem;
>  
>  /* we put basic includes here to avoid repeating them in device drivers */
>  #include <stdlib.h>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 5c5b8f3..19e20f9 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, QEMU_OPTION_postcopy_flags,
>      "-postcopy-flags unsigned-int(flags)\n"
>      "	                flags for postcopy incoming migration\n"
>      "                   when -incoming and -postcopy are specified.\n"
> -    "                   This is for benchmark/debug purpose (default: 0)\n",
> +    "                   This is for benchmark/debug purpose (default: 0)\n"
> +    "                   Currently supprted flags are\n"
> +    "                   1: enable fault request from umemd to qemu\n"
> +    "                      (default: disabled)\n",
>      QEMU_ARCH_ALL)
>  STEXI
>  @item -postcopy-flags int

Can you move umem.c and umem.h to a separate patch, please? This patch is long enough as it is.
> diff --git a/umem.c b/umem.c
> new file mode 100644
> index 0000000..b7be006
> --- /dev/null
> +++ b/umem.c
> @@ -0,0 +1,379 @@
> +/*
> + * umem.c: user process backed memory module for postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +
> +#include <linux/umem.h>
> +
> +#include "bitops.h"
> +#include "sysemu.h"
> +#include "hw/hw.h"
> +#include "umem.h"
> +
> +//#define DEBUG_UMEM
> +#ifdef DEBUG_UMEM
> +#include <sys/syscall.h>
> +#define DPRINTF(format, ...)                                            \
> +    do {                                                                \
> +        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
> +               __func__, __LINE__, ## __VA_ARGS__);                     \
> +    } while (0)
> +#else
> +#define DPRINTF(format, ...)    do { } while (0)
> +#endif
> +
> +#define DEV_UMEM        "/dev/umem"
> +
> +struct UMemDev {
> +    int fd;
> +    int page_shift;
> +};
> +
> +UMemDev *umem_dev_new(void)
> +{
> +    UMemDev *umem_dev;
> +    int umem_dev_fd = open(DEV_UMEM, O_RDWR);
> +    if (umem_dev_fd < 0) {
> +        perror("can't open "DEV_UMEM);
> +        abort();
> +    }
> +
> +    umem_dev = g_new(UMemDev, 1);
> +    umem_dev->fd = umem_dev_fd;
> +    umem_dev->page_shift = ffs(getpagesize()) - 1;
> +    return umem_dev;
> +}
> +
> +void umem_dev_destroy(UMemDev *dev)
> +{
> +    close(dev->fd);
> +    g_free(dev);
> +}
> +
> +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name)
> +{
> +    struct umem_create create = {
> +        .size = size,
> +        .async_req_max = 0,
> +        .sync_req_max = 0,
> +    };
> +    UMem *umem;
> +
> +    snprintf(create.name.id, sizeof(create.name.id),
> +             "pid-%"PRId64, (uint64_t)getpid());
> +    create.name.id[UMEM_ID_MAX - 1] = 0;
> +    strncpy(create.name.name, name, sizeof(create.name.name));
> +    create.name.name[UMEM_NAME_MAX - 1] = 0;
> +
> +    assert((size % getpagesize()) == 0);
> +    if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) {
> +        perror("UMEM_DEV_CREATE_UMEM");
> +        abort();
> +    }
> +    if (ftruncate(create.shmem_fd, create.size) < 0) {
> +        perror("truncate(\"shmem_fd\")");
> +        abort();
> +    }
> +
> +    umem = g_new(UMem, 1);
> +    umem->nbits = 0;
> +    umem->nsets = 0;
> +    umem->faulted = NULL;
> +    umem->page_shift = dev->page_shift;
> +    umem->fd = create.umem_fd;
> +    umem->shmem_fd = create.shmem_fd;
> +    umem->size = create.size;
> +    umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
> +                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +    if (umem->umem == MAP_FAILED) {
> +        perror("mmap(UMem) failed");
> +        abort();
> +    }
> +    return umem;
> +}
> +
> +void umem_mmap(UMem *umem)
> +{
> +    void *ret = mmap(umem->umem, umem->size,
> +                     PROT_EXEC | PROT_READ | PROT_WRITE,
> +                     MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
> +    if (ret == MAP_FAILED) {
> +        perror("umem_mmap(UMem) failed");
> +        abort();
> +    }
> +}
> +
> +void umem_destroy(UMem *umem)
> +{
> +    if (umem->fd != -1) {
> +        close(umem->fd);
> +    }
> +    if (umem->shmem_fd != -1) {
> +        close(umem->shmem_fd);
> +    }
> +    g_free(umem->faulted);
> +    g_free(umem);
> +}
> +
> +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request)
> +{
> +    if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) {
> +        perror("daemon: UMEM_GET_PAGE_REQUEST");
> +        abort();
> +    }
> +}
> +
> +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached)
> +{
> +    if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) {
> +        perror("daemon: UMEM_MARK_PAGE_CACHED");
> +        abort();
> +    }
> +}
> +
> +void umem_unmap(UMem *umem)
> +{
> +    munmap(umem->umem, umem->size);
> +    umem->umem = NULL;
> +}
> +
> +void umem_close(UMem *umem)
> +{
> +    close(umem->fd);
> +    umem->fd = -1;
> +}
> +
> +void *umem_map_shmem(UMem *umem)
> +{
> +    umem->nbits = umem->size >> umem->page_shift;
> +    umem->nsets = 0;
> +    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
> +
> +    umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +                       umem->shmem_fd, 0);
> +    if (umem->shmem == MAP_FAILED) {
> +        perror("daemon: mmap(\"shmem\")");
> +        abort();
> +    }
> +    return umem->shmem;
> +}
> +
> +void umem_unmap_shmem(UMem *umem)
> +{
> +    munmap(umem->shmem, umem->size);
> +    umem->shmem = NULL;
> +}
> +
> +void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
> +{
> +    int s = offset >> umem->page_shift;
> +    int e = (offset + size) >> umem->page_shift;
> +    int i;
> +
> +    for (i = s; i < e; i++) {
> +        if (!test_and_set_bit(i, umem->faulted)) {
> +            umem->nsets++;
> +#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
> +            madvise(umem->shmem + offset, size, MADV_REMOVE);
> +#endif
> +        }
> +    }
> +}
> +
> +void umem_close_shmem(UMem *umem)
> +{
> +    close(umem->shmem_fd);
> +    umem->shmem_fd = -1;
> +}
> +
> +/***************************************************************************/
> +/* qemu <-> umem daemon communication */
> +
> +size_t umem_pages_size(uint64_t nr)
> +{
> +    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
> +}
> +
> +static void umem_write_cmd(int fd, uint8_t cmd)
> +{
> +    DPRINTF("write cmd %c\n", cmd);
> +
> +    for (;;) {
> +        ssize_t ret = write(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EPIPE) {
> +                perror("pipe");
> +                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
> +                        cmd, ret, errno);
> +                break;
> +            }
> +
> +            perror("pipe");
> +            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +}
> +
> +static void umem_read_cmd(int fd, uint8_t expect)
> +{
> +    uint8_t cmd;
> +    for (;;) {
> +        ssize_t ret = read(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            perror("pipe");
> +            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        if (ret == 0) {
> +            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +
> +    DPRINTF("read cmd %c\n", cmd);
> +    if (cmd != expect) {
> +        DPRINTF("cmd %c expect %d\n", cmd, expect);
> +        abort();
> +    }
> +}
> +
> +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
> +{
> +    int ret;
> +    uint64_t nr;
> +    size_t size;
> +    struct umem_pages *pages;
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
> +    *offset += sizeof(nr);
> +    DPRINTF("ret %d nr %ld\n", ret, nr);
> +    if (ret != sizeof(nr) || nr == 0) {
> +        return NULL;
> +    }
> +
> +    size = umem_pages_size(nr);
> +    pages = g_malloc(size);
> +    pages->nr = nr;
> +    size -= sizeof(pages->nr);
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
> +    *offset += size;
> +    if (ret != size) {
> +        g_free(pages);
> +        return NULL;
> +    }
> +    return pages;
> +}
> +
> +static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
> +{
> +    size_t len = umem_pages_size(pages->nr);
> +    qemu_put_buffer(f, (const uint8_t*)pages, len);
> +}
> +
> +/* umem daemon -> qemu */
> +void umem_daemon_ready(int to_qemu_fd)
> +{
> +    umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
> +}
> +
> +void umem_daemon_quit(QEMUFile *to_qemu)
> +{
> +    qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
> +}
> +
> +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> +                                    struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
> +    umem_send_pages(to_qemu, pages);
> +}
> +
> +void umem_daemon_wait_for_qemu(int from_qemu_fd)
> +{
> +    umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
> +}
> +
> +/* qemu -> umem daemon */
> +void umem_qemu_wait_for_daemon(int from_umemd_fd)
> +{
> +    umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
> +}
> +
> +void umem_qemu_ready(int to_umemd_fd)
> +{
> +    umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
> +}
> +
> +void umem_qemu_quit(QEMUFile *to_umemd)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
> +}
> +
> +/* qemu side handler */
> +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> +                                                int *offset)
> +{
> +    uint64_t i;
> +    int page_shift = ffs(getpagesize()) - 1;
> +    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
> +    if (pages == NULL) {
> +        return NULL;
> +    }
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << page_shift;
> +
> +        /* make pages present by forcibly triggering page fault. */
> +        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
> +        uint8_t dummy_read = ram[0];
> +        (void)dummy_read;   /* suppress unused variable warning */
> +    }
> +
> +    return pages;
> +}
> +
> +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> +                                  const struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
> +    umem_send_pages(to_umemd, pages);
> +}
> +
> +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> +                                   const struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
> +    umem_send_pages(to_umemd, pages);
> +}
> diff --git a/umem.h b/umem.h
> new file mode 100644
> index 0000000..5ca19ef
> --- /dev/null
> +++ b/umem.h
> @@ -0,0 +1,105 @@
> +/*
> + * umem.h: user process backed memory module for postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_UMEM_H
> +#define QEMU_UMEM_H
> +
> +#include <linux/umem.h>
> +
> +#include "qemu-common.h"
> +
> +typedef struct UMemDev UMemDev;
> +
> +struct UMem {
> +    void *umem;
> +    int fd;
> +    void *shmem;
> +    int shmem_fd;
> +    uint64_t size;
> +
> +    /* indexed by host page size */
> +    int page_shift;
> +    int nbits;
> +    int nsets;
> +    unsigned long *faulted;
> +};
> +
> +UMemDev *umem_dev_new(void);
> +void umem_dev_destroy(UMemDev *dev);
> +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name);
> +void umem_mmap(UMem *umem);
> +
> +void umem_destroy(UMem *umem);
> +
> +/* umem device operations */
> +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request);
> +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached);
> +void umem_unmap(UMem *umem);
> +void umem_close(UMem *umem);
> +
> +/* umem shmem operations */
> +void *umem_map_shmem(UMem *umem);
> +void umem_unmap_shmem(UMem *umem);
> +void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
> +void umem_close_shmem(UMem *umem);
> +
> +/* qemu on source <-> umem daemon communication */
> +
> +struct umem_pages {
> +    uint64_t nr;        /* nr = 0 means completed */
> +    uint64_t pgoffs[0];
> +};
> +
> +/* daemon -> qemu */
> +#define UMEM_DAEMON_READY               'R'
> +#define UMEM_DAEMON_QUIT                'Q'
> +#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'
> +#define UMEM_DAEMON_ERROR               'E'
> +
> +/* qemu -> daemon */
> +#define UMEM_QEMU_READY                 'r'
> +#define UMEM_QEMU_QUIT                  'q'
> +#define UMEM_QEMU_PAGE_FAULTED          't'
> +#define UMEM_QEMU_PAGE_UNMAPPED         'u'
> +
> +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
> +size_t umem_pages_size(uint64_t nr);
> +
> +/* for umem daemon */
> +void umem_daemon_ready(int to_qemu_fd);
> +void umem_daemon_wait_for_qemu(int from_qemu_fd);
> +void umem_daemon_quit(QEMUFile *to_qemu);
> +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> +                                    struct umem_pages *pages);
> +
> +/* for qemu */
> +void umem_qemu_wait_for_daemon(int from_umemd_fd);
> +void umem_qemu_ready(int to_umemd_fd);
> +void umem_qemu_quit(QEMUFile *to_umemd);
> +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> +                                                int *offset);
> +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> +                                  const struct umem_pages *pages);
> +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> +                                   const struct umem_pages *pages);
> +
> +#endif /* QEMU_UMEM_H */
> diff --git a/vl.c b/vl.c
> index 5430b8c..17427a0 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp)
>      default_drive(default_sdcard, snapshot, machine->use_scsi,
>                    IF_SD, 0, SD_OPTS);
>  
> -    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL,
> -                         ram_save_live, NULL, ram_load, NULL);
> +    if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) {
> +        exit(1);
> +    }
> +    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID,
> +                         ram_save_set_params, ram_save_live, NULL,
> +                         ram_load, NULL);
>  
>      if (nb_numa_nodes > 0) {
>          int i;
> @@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp)
>  
>      if (incoming) {
>          runstate_set(RUN_STATE_INMIGRATE);
> +        if (incoming_postcopy) {
> +            postcopy_incoming_prepare();
>+        }

How about moving postcopy_incoming_prepare() into qemu_start_incoming_migration()?

>          int ret = qemu_start_incoming_migration(incoming);
>          if (ret < 0) {
>              fprintf(stderr, "Migration failed. Exit code %s(%d), exiting.\n",
> @@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp)
>      bdrv_close_all();
>      pause_all_vcpus();
>      net_cleanup();
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_cleanup();
> +    }
>      res_free();
>  
>      return 0;

Orit

WARNING: multiple messages have this Message-ID (diff)
From: Orit Wasserman <owasserm@redhat.com>
To: Isaku Yamahata <yamahata@valinux.co.jp>, satoshi.itoh@aist.go.jp
Cc: t.hirofuchi@aist.go.jp, qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration
Date: Thu, 29 Dec 2011 17:51:36 +0200	[thread overview]
Message-ID: <4EFC8C88.70701@redhat.com> (raw)
In-Reply-To: <f5061e756dfd4ebf8456170a2c146a81be01c685.1325055140.git.yamahata@valinux.co.jp>

Hi,
A general comment: this patch is a bit too long, which makes it hard to review.
Could you split it, please?

On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> This patch implements postcopy livemigration.
> 
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> ---
>  Makefile.target           |    4 +
>  arch_init.c               |   26 +-
>  cpu-all.h                 |    7 +
>  exec.c                    |   20 +-
>  migration-exec.c          |    8 +
>  migration-fd.c            |   30 +
>  migration-postcopy-stub.c |   77 ++
>  migration-postcopy.c      | 1891 +++++++++++++++++++++++++++++++++++++++++++++
>  migration-tcp.c           |   37 +-
>  migration-unix.c          |   32 +-
>  migration.c               |   31 +
>  migration.h               |   30 +
>  qemu-common.h             |    1 +
>  qemu-options.hx           |    5 +-
>  umem.c                    |  379 +++++++++
>  umem.h                    |  105 +++
>  vl.c                      |   14 +-
>  17 files changed, 2677 insertions(+), 20 deletions(-)
>  create mode 100644 migration-postcopy-stub.c
>  create mode 100644 migration-postcopy.c
>  create mode 100644 umem.c
>  create mode 100644 umem.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index 3261383..d94c53f 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h
>  CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y)
>  CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y)
>  CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y)
> +CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y)
>  
>  include ../config-host.mak
>  include config-devices.mak
> @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
>  obj-y += memory.o
>  LIBS+=-lz
>  
> +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o
> +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o
> +
>  QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
>  QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
> diff --git a/arch_init.c b/arch_init.c
> index bc53092..8b3130d 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
>      return 1;
>  }
>  
> +static bool outgoing_postcopy = false;
> +
> +void ram_save_set_params(const MigrationParams *params, void *opaque)
> +{
> +    outgoing_postcopy = params->postcopy;
> +}
> +
>  static RAMBlock *last_block_sent = NULL;
>  
>  int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
> @@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>      uint64_t expected_time = 0;
>      int ret;
>  
> +    if (stage == 1) {
> +        last_block_sent = NULL;
> +
> +        bytes_transferred = 0;
> +        last_block = NULL;
> +        last_offset = 0;

Note: this hunk only changes the order of these lines and adds a new empty line.

> +    }
> +    if (outgoing_postcopy) {
> +        return postcopy_outgoing_ram_save_live(mon, f, stage, opaque);
> +    }
> +

I would just do:

unregister_savevm_live() and then register_savevm_live(..., postcopy_outgoing_ram_save_live, ...)
when starting the outgoing postcopy migration.

>      if (stage < 0) {
>          cpu_physical_memory_set_dirty_tracking(0);
>          return 0;
> @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>      }
>  
>      if (stage == 1) {
> -        bytes_transferred = 0;
> -        last_block_sent = NULL;
> -        last_block = NULL;
> -        last_offset = 0;
>          sort_ram_list();
>  
>          /* Make sure all dirty bits are set */
> @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
>      int flags;
>      int error;
>  
> +    if (incoming_postcopy) {
> +        return postcopy_incoming_ram_load(f, opaque, version_id);
> +    }
> +
Why not call register_savevm_live(..., postcopy_incoming_ram_load, ...) when starting the guest with postcopy incoming?

>      if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) {
>          return -EINVAL;
>      }
> diff --git a/cpu-all.h b/cpu-all.h
> index 0244f7a..2e9d8a7 100644
> --- a/cpu-all.h
> +++ b/cpu-all.h
> @@ -475,6 +475,9 @@ extern ram_addr_t ram_size;
>  /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
>  #define RAM_PREALLOC_MASK   (1 << 0)
>  
> +/* RAM is allocated via umem for postcopy incoming mode */
> +#define RAM_POSTCOPY_UMEM_MASK  (1 << 1)
> +
>  typedef struct RAMBlock {
>      uint8_t *host;
>      ram_addr_t offset;
> @@ -485,6 +488,10 @@ typedef struct RAMBlock {
>  #if defined(__linux__) && !defined(TARGET_S390X)
>      int fd;
>  #endif
> +
> +#ifdef CONFIG_POSTCOPY
> +    UMem *umem;    /* for incoming postcopy mode */
> +#endif
>  } RAMBlock;
>  
>  typedef struct RAMList {
> diff --git a/exec.c b/exec.c
> index c8c6692..90b0491 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -35,6 +35,7 @@
>  #include "qemu-timer.h"
>  #include "memory.h"
>  #include "exec-memory.h"
> +#include "migration.h"
>  #if defined(CONFIG_USER_ONLY)
>  #include <qemu.h>
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>          new_block->host = host;
>          new_block->flags |= RAM_PREALLOC_MASK;
>      } else {
> +#ifdef CONFIG_POSTCOPY
> +        if (incoming_postcopy) {
> +            postcopy_incoming_ram_alloc(name, size,
> +                                        &new_block->host, &new_block->umem);
> +            new_block->flags |= RAM_POSTCOPY_UMEM_MASK;
> +        } else
> +#endif
>          if (mem_path) {
>  #if defined (__linux__) && !defined(TARGET_S390X)
>              new_block->host = file_ram_alloc(new_block, size, mem_path);
> @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr)
>              QLIST_REMOVE(block, next);
>              if (block->flags & RAM_PREALLOC_MASK) {
>                  ;
> -            } else if (mem_path) {
> +            }
> +#ifdef CONFIG_POSTCOPY
> +            else if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> +                postcopy_incoming_ram_free(block->umem);
> +            }
> +#endif
> +            else if (mem_path) {
>  #if defined (__linux__) && !defined(TARGET_S390X)
>                  if (block->fd) {
>                      munmap(block->host, block->length);
> @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>              } else {
>                  flags = MAP_FIXED;
>                  munmap(vaddr, length);
> +                if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
> +                    postcopy_incoming_qemu_pages_unmapped(addr, length);
> +                    block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> +                }
>                  if (mem_path) {
>  #if defined(__linux__) && !defined(TARGET_S390X)
>                      if (block->fd) {
> diff --git a/migration-exec.c b/migration-exec.c
> index e14552e..2bd0c3b 100644
> --- a/migration-exec.c
> +++ b/migration-exec.c
> @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const char *command)
>  {
>      FILE *f;
>  
> +    if (s->params.postcopy) {
> +        return -ENOSYS;
> +    }
> +
>      f = popen(command, "w");
>      if (f == NULL) {
>          DPRINTF("Unable to popen exec target\n");
> @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command)
>  {
>      QEMUFile *f;
>  
> +    if (incoming_postcopy) {
> +        return -ENOSYS;
> +    }
> +
>      DPRINTF("Attempting to start an incoming migration\n");
>      f = qemu_popen_cmd(command, "r");
>      if(f == NULL) {
> diff --git a/migration-fd.c b/migration-fd.c
> index 6211124..5a62ab9 100644
> --- a/migration-fd.c
> +++ b/migration-fd.c
> @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const char *fdname)
>      s->write = fd_write;
>      s->close = fd_close;
>  
> +    if (s->params.postcopy) {
> +        int flags = fcntl(s->fd, F_GETFL);
> +        if ((flags & O_ACCMODE) != O_RDWR) {
> +            goto err_after_open;
> +        }
> +
> +        s->fd_read = dup(s->fd);
> +        if (s->fd_read == -1) {
> +            goto err_after_open;
> +        }
> +        s->file_read = qemu_fdopen(s->fd_read, "r");
> +        if (s->file_read == NULL) {
> +            close(s->fd_read);
> +            goto err_after_open;
> +        }
> +    }
> +
>      migrate_fd_connect(s);
>      return 0;
>  
> @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque)
>  
>      process_incoming_migration(f);
>      qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_ready();
> +    }
> +    return;
>  }
>  
>  int fd_start_incoming_migration(const char *infd)
> @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd)
>      DPRINTF("Attempting to start an incoming migration via fd\n");
>  
>      fd = strtol(infd, NULL, 0);
> +    if (incoming_postcopy) {
> +        int flags = fcntl(fd, F_GETFL);
> +        if ((flags & O_ACCMODE) != O_RDWR) {
> +            return -EINVAL;
> +        }
> +    }
>      f = qemu_fdopen(fd, "rb");
>      if(f == NULL) {
>          DPRINTF("Unable to apply qemu wrapper to file descriptor\n");
> diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c
> new file mode 100644
> index 0000000..0b78de7
> --- /dev/null
> +++ b/migration-postcopy-stub.c
> @@ -0,0 +1,77 @@
> +/*
> + * migration-postcopy-stub.c: postcopy livemigration
> + *                            stub functions for non-supported hosts
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sysemu.h"
> +#include "migration.h"
> +
> +int postcopy_outgoing_create_read_socket(MigrationState *s)
> +{
> +    return -ENOSYS;
> +}
> +
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque)
> +{
> +    return -ENOSYS;
> +}
> +
> +void *postcopy_outgoing_begin(MigrationState *ms)
> +{
> +    return NULL;
> +}
> +
> +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> +                                          void *postcopy)
> +{
> +    return -ENOSYS;
> +}
> +
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> +{
> +    return -ENOSYS;
> +}
> +
> +void postcopy_incoming_prepare(void)
> +{
> +}
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    return -ENOSYS;
> +}
> +
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> +{
> +}
> +
> +void postcopy_incoming_qemu_ready(void)
> +{
> +}
> +
> +void postcopy_incoming_qemu_cleanup(void)
> +{
> +}
> +
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
> +{
> +}
> diff --git a/migration-postcopy.c b/migration-postcopy.c
> new file mode 100644
> index 0000000..ed0d574
> --- /dev/null
> +++ b/migration-postcopy.c
> @@ -0,0 +1,1891 @@
> +/*
> + * migration-postcopy.c: postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "bitmap.h"
> +#include "sysemu.h"
> +#include "hw/hw.h"
> +#include "arch_init.h"
> +#include "migration.h"
> +#include "umem.h"
> +
> +#include "memory.h"
> +#define WANT_EXEC_OBSOLETE
> +#include "exec-obsolete.h"
> +
> +//#define DEBUG_POSTCOPY
> +#ifdef DEBUG_POSTCOPY
> +#include <sys/syscall.h>
> +#define DPRINTF(fmt, ...)                                               \
> +    do {                                                                \
> +        printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid),     \
> +               __func__, __LINE__, ## __VA_ARGS__);                     \
> +    } while (0)
> +#else
> +#define DPRINTF(fmt, ...)       do { } while (0)
> +#endif
> +
> +#define ALIGN_UP(size, align)   (((size) + (align) - 1) & ~((align) - 1))
> +
> +static void fd_close(int *fd)
> +{
> +    if (*fd >= 0) {
> +        close(*fd);
> +        *fd = -1;
> +    }
> +}
> +
> +/***************************************************************************
> + * QEMUFile for non blocking pipe
> + */
> +
> +/* read only */
> +struct QEMUFilePipe {
> +    int fd;
> +    QEMUFile *file;
> +};

Why not use QEMUFileSocket?

> +typedef struct QEMUFilePipe QEMUFilePipe;
> +
> +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFilePipe *s = opaque;
> +    ssize_t len = 0;
> +
> +    while (size > 0) {
> +        ssize_t ret = read(s->fd, buf, size);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            if (len == 0) {
> +                len = -errno;
> +            }
> +            break;
> +        }
> +
> +        if (ret == 0) {
> +            /* the write end of the pipe is closed */
> +            break;
> +        }
> +        len += ret;
> +        buf += ret;
> +        size -= ret;
> +    }
> +
> +    return len;
> +}
> +
> +static int pipe_close(void *opaque)
> +{
> +    QEMUFilePipe *s = opaque;
> +    g_free(s);
> +    return 0;
> +}
> +
> +static QEMUFile *qemu_fopen_pipe(int fd)
> +{
> +    QEMUFilePipe *s = g_malloc0(sizeof(*s));
> +
> +    s->fd = fd;
> +    fcntl_setfl(fd, O_NONBLOCK);
> +    s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close,
> +                             NULL, NULL, NULL);
> +    return s->file;
> +}
> +
> +/* write only */
> +struct QEMUFileNonblock {
> +    int fd;
> +    QEMUFile *file;
> +
> +    /* for pipe-write nonblocking mode */
> +#define BUF_SIZE_INC    (32 * 1024)     /* = IO_BUF_SIZE */
> +    uint8_t *buffer;
> +    size_t buffer_size;
> +    size_t buffer_capacity;
> +    bool freeze_output;
> +};
> +typedef struct QEMUFileNonblock QEMUFileNonblock;
> +

Couldn't you use QEMUFileBuffered?

> +static void nonblock_flush_buffer(QEMUFileNonblock *s)
> +{
> +    size_t offset = 0;
> +    ssize_t ret;
> +
> +    while (offset < s->buffer_size) {
> +        ret = write(s->fd, s->buffer + offset, s->buffer_size - offset);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EAGAIN) {
> +                s->freeze_output = true;
> +            } else {
> +                qemu_file_set_error(s->file, errno);
> +            }
> +            break;
> +        }
> +
> +        if (ret == 0) {
> +            DPRINTF("ret == 0\n");
> +            break;
> +        }
> +
> +        offset += ret;
> +    }
> +
> +    if (offset > 0) {
> +        assert(s->buffer_size >= offset);
> +        memmove(s->buffer, s->buffer + offset, s->buffer_size - offset);
> +        s->buffer_size -= offset;
> +    }
> +    if (s->buffer_size > 0) {
> +        s->freeze_output = true;
> +    }
> +}
> +
> +static int nonblock_put_buffer(void *opaque,
> +                               const uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileNonblock *s = opaque;
> +    int error;
> +    ssize_t len = 0;
> +
> +    error = qemu_file_get_error(s->file);
> +    if (error) {
> +        return error;
> +    }
> +
> +    nonblock_flush_buffer(s);
> +    error = qemu_file_get_error(s->file);
> +    if (error) {
> +        return error;
> +    }
> +
> +    while (!s->freeze_output && size > 0) {
> +        ssize_t ret;
> +        assert(s->buffer_size == 0);
> +
> +        ret = write(s->fd, buf, size);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EAGAIN) {
> +                s->freeze_output = true;
> +            } else {
> +                qemu_file_set_error(s->file, errno);
> +            }
> +            break;
> +        }
> +
> +        len += ret;
> +        buf += ret;
> +        size -= ret;
> +    }
> +
> +    if (size > 0) {
> +        int inc = size - (s->buffer_capacity - s->buffer_size);
> +        if (inc > 0) {
> +            s->buffer_capacity +=
> +                DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC;
> +            s->buffer = g_realloc(s->buffer, s->buffer_capacity);
> +        }
> +        memcpy(s->buffer + s->buffer_size, buf, size);
> +        s->buffer_size += size;
> +
> +        len += size;
> +    }
> +
> +    return len;
> +}
> +
> +static int nonblock_pending_size(QEMUFileNonblock *s)
> +{
> +    return qemu_pending_size(s->file) + s->buffer_size;
> +}
> +
> +static void nonblock_fflush(QEMUFileNonblock *s)
> +{
> +    s->freeze_output = false;
> +    nonblock_flush_buffer(s);
> +    if (!s->freeze_output) {
> +        qemu_fflush(s->file);
> +    }
> +}
> +
> +static void nonblock_wait_for_flush(QEMUFileNonblock *s)
> +{
> +    while (nonblock_pending_size(s) > 0) {
> +        fd_set fds;
> +        FD_ZERO(&fds);
> +        FD_SET(s->fd, &fds);
> +        select(s->fd + 1, NULL, &fds, NULL, NULL);
> +
> +        nonblock_fflush(s);
> +    }
> +}
> +
> +static int nonblock_close(void *opaque)
> +{
> +    QEMUFileNonblock *s = opaque;
> +    nonblock_wait_for_flush(s);
> +    g_free(s->buffer);
> +    g_free(s);
> +    return 0;
> +}
> +
> +static QEMUFileNonblock *qemu_fopen_nonblock(int fd)
> +{
> +    QEMUFileNonblock *s = g_malloc0(sizeof(*s));
> +
> +    s->fd = fd;
> +    fcntl_setfl(fd, O_NONBLOCK);
> +    s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close,
> +                             NULL, NULL, NULL);
> +    return s;
> +}
> +
> +/***************************************************************************
> + * umem daemon on destination <-> qemu on source protocol
> + */
> +
> +#define QEMU_UMEM_REQ_INIT              0x00
> +#define QEMU_UMEM_REQ_ON_DEMAND         0x01
> +#define QEMU_UMEM_REQ_ON_DEMAND_CONT    0x02
> +#define QEMU_UMEM_REQ_BACKGROUND        0x03
> +#define QEMU_UMEM_REQ_BACKGROUND_CONT   0x04
> +#define QEMU_UMEM_REQ_REMOVE            0x05
> +#define QEMU_UMEM_REQ_EOC               0x06
> +
> +struct qemu_umem_req {
> +    int8_t cmd;
> +    uint8_t len;
> +    char *idstr;        /* ON_DEMAND, BACKGROUND, REMOVE */
> +    uint32_t nr;        /* ON_DEMAND, ON_DEMAND_CONT,
> +                           BACKGROUND, BACKGROUND_CONT, REMOVE */
> +
> +    /* in target page size as qemu migration protocol */
> +    uint64_t *pgoffs;   /* ON_DEMAND, ON_DEMAND_CONT,
> +                           BACKGROUND, BACKGROUND_CONT, REMOVE */
> +};
> +
> +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr)
> +{
> +    qemu_put_byte(f, strlen(idstr));
> +    qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr));
> +}
> +
> +static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr,
> +                                              const uint64_t *pgoffs)
> +{
> +    uint32_t i;
> +
> +    qemu_put_be32(f, nr);
> +    for (i = 0; i < nr; i++) {
> +        qemu_put_be64(f, pgoffs[i]);
> +    }
> +}
> +
> +static void postcopy_incoming_send_req_one(QEMUFile *f,
> +                                           const struct qemu_umem_req *req)
> +{
> +    DPRINTF("cmd %d\n", req->cmd);
> +    qemu_put_byte(f, req->cmd);
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +    case QEMU_UMEM_REQ_REMOVE:
> +        postcopy_incoming_send_req_idstr(f, req->idstr);
> +        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +}
> +
> +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024.
> + * So one message size must be <= IO_BUF_SIZE
> + * cmd: 1
> + * id len: 1
> + * id: 256
> + * nr: 2
> + */
> +#define MAX_PAGE_NR     ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t))
> +static void postcopy_incoming_send_req(QEMUFile *f,
> +                                       const struct qemu_umem_req *req)
> +{
> +    uint32_t nr = req->nr;
> +    struct qemu_umem_req tmp = *req;
> +
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        postcopy_incoming_send_req_one(f, &tmp);
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +        tmp.nr = MIN(nr, MAX_PAGE_NR);
> +        postcopy_incoming_send_req_one(f, &tmp);
> +
> +        nr -= tmp.nr;
> +        tmp.pgoffs += tmp.nr;
> +        if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) {
> +            tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> +        }else {
> +            tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT;
> +        }
> +        /* fall through */
> +    case QEMU_UMEM_REQ_REMOVE:
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        while (nr > 0) {
> +            tmp.nr = MIN(nr, MAX_PAGE_NR);
> +            postcopy_incoming_send_req_one(f, &tmp);
> +
> +            nr -= tmp.nr;
> +            tmp.pgoffs += tmp.nr;
> +        }
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +}
> +
> +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f,
> +                                            struct qemu_umem_req *req,
> +                                            size_t *offset)
> +{
> +    int ret;
> +
> +    req->len = qemu_peek_byte(f, *offset);
> +    *offset += 1;
> +    if (req->len == 0) {
> +        return -EAGAIN;
> +    }
> +    req->idstr = g_malloc((int)req->len + 1);
> +    ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset);
> +    *offset += ret;
> +    if (ret != req->len) {
> +        g_free(req->idstr);
> +        req->idstr = NULL;
> +        return -EAGAIN;
> +    }
> +    req->idstr[req->len] = 0;
> +    return 0;
> +}
> +
> +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f,
> +                                             struct qemu_umem_req *req,
> +                                             size_t *offset)
> +{
> +    int ret;
> +    uint32_t be32;
> +    uint32_t i;
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset);
> +    *offset += sizeof(be32);
> +    if (ret != sizeof(be32)) {
> +        return -EAGAIN;
> +    }
> +
> +    req->nr = be32_to_cpu(be32);
> +    req->pgoffs = g_new(uint64_t, req->nr);
> +    for (i = 0; i < req->nr; i++) {
> +        uint64_t be64;
> +        ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset);
> +        *offset += sizeof(be64);
> +        if (ret != sizeof(be64)) {
> +            g_free(req->pgoffs);
> +            req->pgoffs = NULL;
> +            return -EAGAIN;
> +        }
> +        req->pgoffs[i] = be64_to_cpu(be64);
> +    }
> +    return 0;
> +}
> +
> +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req)
> +{
> +    int size;
> +    int ret;
> +    size_t offset = 0;
> +
> +    size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset);
> +    if (size <= 0) {
> +        return -EAGAIN;
> +    }
> +    offset += 1;
> +
> +    switch (req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +    case QEMU_UMEM_REQ_EOC:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +    case QEMU_UMEM_REQ_REMOVE:
> +        ret = postcopy_outgoing_recv_req_idstr(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +    qemu_file_skip(f, offset);
> +    DPRINTF("cmd %d\n", req->cmd);
> +    return 0;
> +}
> +
> +static void postcopy_outgoing_free_req(struct qemu_umem_req *req)
> +{
> +    g_free(req->idstr);
> +    g_free(req->pgoffs);
> +}
> +
> +/***************************************************************************
> + * outgoing part
> + */
> +
> +#define QEMU_SAVE_LIVE_STAGE_START      0x01    /* = QEMU_VM_SECTION_START */
> +#define QEMU_SAVE_LIVE_STAGE_PART       0x02    /* = QEMU_VM_SECTION_PART */
> +#define QEMU_SAVE_LIVE_STAGE_END        0x03    /* = QEMU_VM_SECTION_END */
> +
> +enum POState {
> +    PO_STATE_ERROR_RECEIVE,
> +    PO_STATE_ACTIVE,
> +    PO_STATE_EOC_RECEIVED,
> +    PO_STATE_ALL_PAGES_SENT,
> +    PO_STATE_COMPLETED,
> +};
> +typedef enum POState POState;
> +
> +struct PostcopyOutgoingState {
> +    POState state;
> +    QEMUFile *mig_read;
> +    int fd_read;
> +    RAMBlock *last_block_read;
> +
> +    QEMUFile *mig_buffered_write;
> +    MigrationState *ms;
> +
> +    /* For nobg mode. Check if all pages are sent */
> +    RAMBlock *block;
> +    ram_addr_t addr;
> +};
> +typedef struct PostcopyOutgoingState PostcopyOutgoingState;
> +
> +int postcopy_outgoing_create_read_socket(MigrationState *s)
> +{
> +    if (!s->params.postcopy) {
> +        return 0;
> +    }
> +
> +    s->fd_read = dup(s->fd);
> +    if (s->fd_read == -1) {
> +        int ret = -errno;
> +        perror("dup");
> +        return ret;
> +    }
> +    s->file_read = qemu_fopen_socket(s->fd_read);
> +    if (s->file_read == NULL) {
> +        return -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque)
> +{
> +    int ret = 0;
> +    DPRINTF("stage %d\n", stage);
> +    if (stage == QEMU_SAVE_LIVE_STAGE_START) {
> +        sort_ram_list();
> +        ram_save_live_mem_size(f);
> +    }
> +    if (stage == QEMU_SAVE_LIVE_STAGE_PART) {
> +        ret = 1;
> +    }
> +    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +    return ret;
> +}
> +
> +static RAMBlock *postcopy_outgoing_find_block(const char *idstr)
> +{
> +    RAMBlock *block;
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (!strncmp(idstr, block->idstr, strlen(idstr))) {
> +            return block;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +/*
> + * return value
> + *   0: continue postcopy mode
> + * > 0: completed postcopy mode.
> + * < 0: error
> + */
> +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s,
> +                                        const struct qemu_umem_req *req,
> +                                        bool *written)
> +{
> +    int i;
> +    RAMBlock *block;
> +
> +    DPRINTF("cmd %d state %d\n", req->cmd, s->state);
> +    switch(req->cmd) {
> +    case QEMU_UMEM_REQ_INIT:
> +        /* nothing */
> +        break;
> +    case QEMU_UMEM_REQ_EOC:
> +        /* tell to finish migration. */
> +        if (s->state == PO_STATE_ALL_PAGES_SENT) {
> +            s->state = PO_STATE_COMPLETED;
> +            DPRINTF("-> PO_STATE_COMPLETED\n");
> +        } else {
> +            s->state = PO_STATE_EOC_RECEIVED;
> +            DPRINTF("-> PO_STATE_EOC_RECEIVED\n");
> +        }
> +        return 1;
> +    case QEMU_UMEM_REQ_ON_DEMAND:
> +    case QEMU_UMEM_REQ_BACKGROUND:
> +        DPRINTF("idstr: %s\n", req->idstr);
> +        block = postcopy_outgoing_find_block(req->idstr);
> +        if (block == NULL) {
> +            return -EINVAL;
> +        }
> +        s->last_block_read = block;
> +        /* fall through */
> +    case QEMU_UMEM_REQ_ON_DEMAND_CONT:
> +    case QEMU_UMEM_REQ_BACKGROUND_CONT:
> +        DPRINTF("nr %d\n", req->nr);
> +        for (i = 0; i < req->nr; i++) {
> +            DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]);
> +            int ret = ram_save_page(s->mig_buffered_write, s->last_block_read,
> +                                    req->pgoffs[i] << TARGET_PAGE_BITS);
> +            if (ret > 0) {
> +                *written = true;
> +            }
> +        }
> +        break;
> +    case QEMU_UMEM_REQ_REMOVE:
> +        block = postcopy_outgoing_find_block(req->idstr);
> +        if (block == NULL) {
> +            return -EINVAL;
> +        }
> +        for (i = 0; i < req->nr; i++) {
> +            ram_addr_t addr = block->offset +
> +                (req->pgoffs[i] << TARGET_PAGE_BITS);
> +            cpu_physical_memory_reset_dirty(addr,
> +                                            addr + TARGET_PAGE_SIZE,
> +                                            MIGRATION_DIRTY_FLAG);
> +        }
> +        break;
> +    default:
> +        return -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s)
> +{
> +    if (s->mig_read != NULL) {
> +        qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL);
> +        qemu_fclose(s->mig_read);
> +        s->mig_read = NULL;
> +        fd_close(&s->fd_read);
> +
> +        s->ms->file_read = NULL;
> +        s->ms->fd_read = -1;
> +    }
> +}
> +
> +static void postcopy_outgoing_completed(PostcopyOutgoingState *s)
> +{
> +    postcopy_outgoing_close_mig_read(s);
> +    s->ms->postcopy = NULL;
> +    g_free(s);
> +}
> +
> +static void postcopy_outgoing_recv_handler(void *opaque)
> +{
> +    PostcopyOutgoingState *s = opaque;
> +    bool written = false;
> +    int ret = 0;
> +
> +    assert(s->state == PO_STATE_ACTIVE ||
> +           s->state == PO_STATE_ALL_PAGES_SENT);
> +
> +    do {
> +        struct qemu_umem_req req = {.idstr = NULL,
> +                                    .pgoffs = NULL};
> +
> +        ret = postcopy_outgoing_recv_req(s->mig_read, &req);
> +        if (ret < 0) {
> +            if (ret == -EAGAIN) {
> +                ret = 0;
> +            }
> +            break;
> +        }
> +        if (s->state == PO_STATE_ACTIVE) {
> +            ret = postcopy_outgoing_handle_req(s, &req, &written);
> +        }
> +        postcopy_outgoing_free_req(&req);
> +    } while (ret == 0);
> +
> +    /*
> +     * flush buffered_file.
> +     * Although mig_write is rate-limited buffered file, those written pages
> +     * are requested on demand by the destination. So forcibly push
> +     * those pages ignoring rate limiting
> +     */
> +    if (written) {
> +        qemu_fflush(s->mig_buffered_write);
> +        /* qemu_buffered_file_drain(s->mig_buffered_write); */
> +    }
> +
> +    if (ret < 0) {
> +        switch (s->state) {
> +        case PO_STATE_ACTIVE:
> +            s->state = PO_STATE_ERROR_RECEIVE;
> +            DPRINTF("-> PO_STATE_ERROR_RECEIVE\n");
> +            break;
> +        case PO_STATE_ALL_PAGES_SENT:
> +            s->state = PO_STATE_COMPLETED;
> +            DPRINTF("-> PO_STATE_COMPLETED\n");
> +            break;
> +        default:
> +            abort();
> +        }
> +    }
> +    if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) {
> +        postcopy_outgoing_close_mig_read(s);
> +    }
> +    if (s->state == PO_STATE_COMPLETED) {
> +        DPRINTF("PO_STATE_COMPLETED\n");
> +        MigrationState *ms = s->ms;
> +        postcopy_outgoing_completed(s);
> +        migrate_fd_completed(ms);
> +    }
> +}
> +
> +void *postcopy_outgoing_begin(MigrationState *ms)
> +{
> +    PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1);
> +    DPRINTF("outgoing begin\n");
> +    qemu_fflush(ms->file);
> +
> +    s->ms = ms;
> +    s->state = PO_STATE_ACTIVE;
> +    s->fd_read = ms->fd_read;
> +    s->mig_read = ms->file_read;
> +    s->mig_buffered_write = ms->file;
> +    s->block = NULL;
> +    s->addr = 0;
> +
> +    /* Make sure all dirty bits are set */
> +    ram_save_memory_set_dirty();
> +
> +    qemu_set_fd_handler(s->fd_read,
> +                        &postcopy_outgoing_recv_handler, NULL, s);
> +    return s;
> +}
> +
> +static void postcopy_outgoing_ram_all_sent(QEMUFile *f,
> +                                           PostcopyOutgoingState *s)
> +{
> +    assert(s->state == PO_STATE_ACTIVE);
> +
> +    s->state = PO_STATE_ALL_PAGES_SENT;
> +    /* tell incoming side that all pages are sent */
> +    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +    qemu_fflush(f);
> +    qemu_buffered_file_drain(f);
> +    DPRINTF("sent RAM_SAVE_FLAG_EOS\n");
> +    migrate_fd_cleanup(s->ms);
> +
> +    /* Later migrate_fd_complete() will be called which calls
> +     * migrate_fd_cleanup() again. So dummy file is created
> +     * for qemu monitor to keep working.
> +     */
> +    s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL,
> +                                 NULL, NULL);
> +}
> +
> +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s,
> +                                                RAMBlock *block,
> +                                                ram_addr_t addr)
> +{
> +    if (block == NULL) {
> +        block = QLIST_FIRST(&ram_list.blocks);
> +        addr = block->offset;
> +    }
> +
> +    for (; block != NULL;
> +         block = QLIST_NEXT(block, next),
> +         addr = (block != NULL) ? block->offset : 0) {
> +        for (; addr < block->offset + block->length;
> +             addr += TARGET_PAGE_SIZE) {
> +            if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) {
> +                s->block = block;
> +                s->addr = addr;
> +                return 0;
> +            }
> +        }
> +    }
> +
> +    return 1;
> +}
> +
> +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> +                                          void *postcopy)
> +{
> +    PostcopyOutgoingState *s = postcopy;
> +
> +    assert(s->state == PO_STATE_ACTIVE ||
> +           s->state == PO_STATE_EOC_RECEIVED ||
> +           s->state == PO_STATE_ERROR_RECEIVE);
> +
> +    switch (s->state) {
> +    case PO_STATE_ACTIVE:
> +        /* nothing. processed below */
> +        break;
> +    case PO_STATE_EOC_RECEIVED:
> +        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +        s->state = PO_STATE_COMPLETED;
> +        postcopy_outgoing_completed(s);
> +        DPRINTF("PO_STATE_COMPLETED\n");
> +        return 1;
> +    case PO_STATE_ERROR_RECEIVE:
> +        postcopy_outgoing_completed(s);
> +        DPRINTF("PO_STATE_ERROR_RECEIVE\n");
> +        return -1;
> +    default:
> +        abort();
> +    }
> +
> +    if (s->ms->params.nobg) {
> +        /* See if all pages are sent. */
> +        if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) {
> +            return 0;
> +        }
> +        /* ram_list can be reordered. (it doesn't seem so during migration,
> +           though) So the whole list needs to be checked again */
> +        if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) {
> +            return 0;
> +        }
> +
> +        postcopy_outgoing_ram_all_sent(f, s);
> +        return 0;
> +    }
> +
> +    DPRINTF("outgoing background state: %d\n", s->state);
> +
> +    while (qemu_file_rate_limit(f) == 0) {
> +        if (ram_save_block(f) == 0) { /* no more blocks */
> +            assert(s->state == PO_STATE_ACTIVE);
> +            postcopy_outgoing_ram_all_sent(f, s);
> +            return 0;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/***************************************************************************
> + * incoming part
> + */
> +
> +/* flags for incoming mode to modify the behavior.
> +   This is for benchmark/debug purpose */
> +#define INCOMING_FLAGS_FAULT_REQUEST 0x01
> +
> +
> +static void postcopy_incoming_umemd(void);
> +
> +#define PIS_STATE_QUIT_RECEIVED         0x01
> +#define PIS_STATE_QUIT_QUEUED           0x02
> +#define PIS_STATE_QUIT_SENT             0x04
> +
> +#define PIS_STATE_QUIT_MASK             (PIS_STATE_QUIT_RECEIVED | \
> +                                         PIS_STATE_QUIT_QUEUED | \
> +                                         PIS_STATE_QUIT_SENT)
> +
> +struct PostcopyIncomingState {
> +    /* dest qemu state */
> +    uint32_t    state;
> +
> +    UMemDev *dev;
> +    int host_page_size;
> +    int host_page_shift;
> +
> +    /* qemu side */
> +    int to_umemd_fd;
> +    QEMUFileNonblock *to_umemd;
> +#define MAX_FAULTED_PAGES       256
> +    struct umem_pages *faulted_pages;
> +
> +    int from_umemd_fd;
> +    QEMUFile *from_umemd;
> +    int version_id;     /* save/load format version id */
> +};
> +typedef struct PostcopyIncomingState PostcopyIncomingState;
> +
> +
> +#define UMEM_STATE_EOS_RECEIVED         0x01    /* umem daemon <-> src qemu */
> +#define UMEM_STATE_EOC_SENT             0x02    /* umem daemon <-> src qemu */
> +#define UMEM_STATE_QUIT_RECEIVED        0x04    /* umem daemon <-> dst qemu */
> +#define UMEM_STATE_QUIT_QUEUED          0x08    /* umem daemon <-> dst qemu */
> +#define UMEM_STATE_QUIT_SENT            0x10    /* umem daemon <-> dst qemu */
> +
> +#define UMEM_STATE_QUIT_MASK            (UMEM_STATE_QUIT_QUEUED | \
> +                                         UMEM_STATE_QUIT_SENT | \
> +                                         UMEM_STATE_QUIT_RECEIVED)
> +#define UMEM_STATE_END_MASK             (UMEM_STATE_EOS_RECEIVED | \
> +                                         UMEM_STATE_EOC_SENT | \
> +                                         UMEM_STATE_QUIT_MASK)
> +
> +struct PostcopyIncomingUMemDaemon {
> +    /* umem daemon side */
> +    uint32_t state;
> +
> +    int host_page_size;
> +    int host_page_shift;
> +    int nr_host_pages_per_target_page;
> +    int host_to_target_page_shift;
> +    int nr_target_pages_per_host_page;
> +    int target_to_host_page_shift;
> +    int version_id;     /* save/load format version id */
> +
> +    int to_qemu_fd;
> +    QEMUFileNonblock *to_qemu;
> +    int from_qemu_fd;
> +    QEMUFile *from_qemu;
> +
> +    int mig_read_fd;
> +    QEMUFile *mig_read;         /* qemu on source -> umem daemon */
> +
> +    int mig_write_fd;
> +    QEMUFileNonblock *mig_write;        /* umem daemon -> qemu on source */
> +
> +    /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */
> +#define MAX_REQUESTS    (512 * (64 + 1))
> +
> +    struct umem_page_request page_request;
> +    struct umem_page_cached page_cached;
> +
> +#define MAX_PRESENT_REQUESTS    MAX_FAULTED_PAGES
> +    struct umem_pages *present_request;
> +
> +    uint64_t *target_pgoffs;
> +
> +    /* bitmap indexed by target page offset */
> +    unsigned long *phys_requested;
> +
> +    /* bitmap indexed by target page offset */
> +    unsigned long *phys_received;
> +
> +    RAMBlock *last_block_read;  /* qemu on source -> umem daemon */
> +    RAMBlock *last_block_write; /* umem daemon -> qemu on source */
> +};
> +typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon;
> +
> +static PostcopyIncomingState state = {
> +    .state = 0,
> +    .dev = NULL,
> +    .to_umemd_fd = -1,
> +    .to_umemd = NULL,
> +    .from_umemd_fd = -1,
> +    .from_umemd = NULL,
> +};
> +
> +static PostcopyIncomingUMemDaemon umemd = {
> +    .state = 0,
> +    .to_qemu_fd = -1,
> +    .to_qemu = NULL,
> +    .from_qemu_fd = -1,
> +    .from_qemu = NULL,
> +    .mig_read_fd = -1,
> +    .mig_read = NULL,
> +    .mig_write_fd = -1,
> +    .mig_write = NULL,
> +};
> +
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy)
> +{
> +    /* incoming_postcopy makes sense only when incoming migration mode */
> +    if (!incoming && incoming_postcopy) {
> +        return -EINVAL;
> +    }
> +
> +    if (!incoming_postcopy) {
> +        return 0;
> +    }
> +
> +    state.state = 0;
> +    state.dev = umem_dev_new();
> +    state.host_page_size = getpagesize();
> +    state.host_page_shift = ffs(state.host_page_size) - 1;
> +    state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
> +                                               ram_save_live() */
> +    return 0;
> +}
> +
> +void postcopy_incoming_ram_alloc(const char *name,
> +                                 size_t size, uint8_t **hostp, UMem **umemp)
> +{
> +    UMem *umem;
> +    size = ALIGN_UP(size, state.host_page_size);
> +    umem = umem_dev_create(state.dev, size, name);
> +
> +    *umemp = umem;
> +    *hostp = umem->umem;
> +}
> +
> +void postcopy_incoming_ram_free(UMem *umem)
> +{
> +    umem_unmap(umem);
> +    umem_close(umem);
> +    umem_destroy(umem);
> +}
> +
> +void postcopy_incoming_prepare(void)
> +{
> +    RAMBlock *block;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (block->umem != NULL) {
> +            umem_mmap(block->umem);
> +        }
> +    }
> +}
> +
> +static int postcopy_incoming_ram_load_get64(QEMUFile *f,
> +                                             ram_addr_t *addr, int *flags)
> +{
> +    *addr = qemu_get_be64(f);
> +    *flags = *addr & ~TARGET_PAGE_MASK;
> +    *addr &= TARGET_PAGE_MASK;
> +    return qemu_file_get_error(f);
> +}
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    ram_addr_t addr;
> +    int flags;
> +    int error;
> +
> +    DPRINTF("incoming ram load\n");
> +    /*
> +     * RAM_SAVE_FLAGS_EOS or
> +     * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
> +     * see postcopy_outgoing_ram_save_live()
> +     */
> +
> +    if (version_id != RAM_SAVE_VERSION_ID) {
> +        DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
> +                version_id, RAM_SAVE_VERSION_ID);
> +        return -EINVAL;
> +    }
> +    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> +    DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
> +    if (error) {
> +        DPRINTF("error %d\n", error);
> +        return error;
> +    }
> +    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> +        DPRINTF("EOS\n");
> +        return 0;
> +    }
> +
> +    if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
> +        DPRINTF("-EINVAL flags 0x%x\n", flags);
> +        return -EINVAL;
> +    }
> +    error = ram_load_mem_size(f, addr);
> +    if (error) {
> +        DPRINTF("addr 0x%lx error %d\n", addr, error);
> +        return error;
> +    }
> +
> +    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> +    if (error) {
> +        DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
> +        return error;
> +    }
> +    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> +        DPRINTF("done\n");
> +        return 0;
> +    }
> +    DPRINTF("-EINVAL\n");
> +    return -EINVAL;
> +}
> +
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read)
> +{
> +    int fds[2];
> +    RAMBlock *block;
> +
> +    DPRINTF("fork\n");
> +
> +    /* socketpair(AF_UNIX)? */
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    state.from_umemd_fd = fds[0];
> +    umemd.to_qemu_fd = fds[1];
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    umemd.from_qemu_fd = fds[0];
> +    state.to_umemd_fd = fds[1];
> +
> +    pid_t child = fork();
> +    if (child < 0) {
> +        perror("fork");
> +        abort();
> +    }
> +
> +    if (child == 0) {
> +        int mig_write_fd;
> +
> +        fd_close(&state.to_umemd_fd);
> +        fd_close(&state.from_umemd_fd);
> +        umemd.host_page_size = state.host_page_size;
> +        umemd.host_page_shift = state.host_page_shift;
> +
> +        umemd.nr_host_pages_per_target_page =
> +            TARGET_PAGE_SIZE / umemd.host_page_size;
> +        umemd.nr_target_pages_per_host_page =
> +            umemd.host_page_size / TARGET_PAGE_SIZE;
> +
> +        umemd.target_to_host_page_shift =
> +            ffs(umemd.nr_host_pages_per_target_page) - 1;
> +        umemd.host_to_target_page_shift =
> +            ffs(umemd.nr_target_pages_per_host_page) - 1;
> +
> +        umemd.state = 0;
> +        umemd.version_id = state.version_id;
> +        umemd.mig_read_fd = mig_read_fd;
> +        umemd.mig_read = mig_read;
> +
> +        mig_write_fd = dup(mig_read_fd);
> +        if (mig_write_fd < 0) {
> +            perror("could not dup for writable socket");
> +            abort();
> +        }
> +        umemd.mig_write_fd = mig_write_fd;
> +        umemd.mig_write = qemu_fopen_nonblock(mig_write_fd);
> +
> +        postcopy_incoming_umemd(); /* noreturn */
> +    }
> +
> +    DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child);
> +    fd_close(&umemd.to_qemu_fd);
> +    fd_close(&umemd.from_qemu_fd);
> +    state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES));
> +    state.faulted_pages->nr = 0;
> +
> +    /* close all UMem.shmem_fd */
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        umem_close_shmem(block->umem);
> +    }
> +    umem_qemu_wait_for_daemon(state.from_umemd_fd);
> +}
> +
> +static void postcopy_incoming_qemu_recv_quit(void)
> +{
> +    RAMBlock *block;
> +    if (state.state & PIS_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (block->umem != NULL) {
> +            umem_destroy(block->umem);
> +            block->umem = NULL;
> +            block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> +        }
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
> +    state.state |= PIS_STATE_QUIT_RECEIVED;
> +    qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
> +    qemu_fclose(state.from_umemd);
> +    state.from_umemd = NULL;
> +    fd_close(&state.from_umemd_fd);
> +}
> +
> +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque)
> +{
> +    assert(state.to_umemd != NULL);
> +
> +    nonblock_fflush(state.to_umemd);
> +    if (nonblock_pending_size(state.to_umemd) > 0) {
> +        return;
> +    }
> +
> +    qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL);
> +    if (state.state & PIS_STATE_QUIT_QUEUED) {
> +        DPRINTF("|= PIS_STATE_QUIT_SENT\n");
> +        state.state |= PIS_STATE_QUIT_SENT;
> +        qemu_fclose(state.to_umemd->file);
> +        state.to_umemd = NULL;
> +        fd_close(&state.to_umemd_fd);
> +        g_free(state.faulted_pages);
> +        state.faulted_pages = NULL;
> +    }
> +}
> +
> +static void postcopy_incoming_qemu_fflush_to_umemd(void)
> +{
> +    qemu_set_fd_handler(state.to_umemd->fd, NULL,
> +                        postcopy_incoming_qemu_fflush_to_umemd_handler, NULL);
> +    postcopy_incoming_qemu_fflush_to_umemd_handler(NULL);
> +}
> +
> +static void postcopy_incoming_qemu_queue_quit(void)
> +{
> +    if (state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
> +    umem_qemu_quit(state.to_umemd->file);
> +    state.state |= PIS_STATE_QUIT_QUEUED;
> +}
> +
> +static void postcopy_incoming_qemu_send_pages_present(void)
> +{
> +    if (state.faulted_pages->nr > 0) {
> +        umem_qemu_send_pages_present(state.to_umemd->file,
> +                                     state.faulted_pages);
> +        state.faulted_pages->nr = 0;
> +    }
> +}
> +
> +static void postcopy_incoming_qemu_faulted_pages(
> +    const struct umem_pages *pages)
> +{
> +    assert(pages->nr <= MAX_FAULTED_PAGES);
> +    assert(state.faulted_pages != NULL);
> +
> +    if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) {
> +        postcopy_incoming_qemu_send_pages_present();
> +    }
> +    memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr],
> +           &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr);
> +    state.faulted_pages->nr += pages->nr;
> +}
> +
> +static void postcopy_incoming_qemu_cleanup_umem(void);
> +
> +static int postcopy_incoming_qemu_handle_req_one(void)
> +{
> +    int offset = 0;
> +    int ret;
> +    uint8_t cmd;
> +
> +    ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset);
> +    offset += sizeof(cmd);
> +    if (ret != sizeof(cmd)) {
> +        return -EAGAIN;
> +    }
> +    DPRINTF("cmd %c\n", cmd);
> +
> +    switch (cmd) {
> +    case UMEM_DAEMON_QUIT:
> +        postcopy_incoming_qemu_recv_quit();
> +        postcopy_incoming_qemu_queue_quit();
> +        postcopy_incoming_qemu_cleanup_umem();
> +        break;
> +    case UMEM_DAEMON_TRIGGER_PAGE_FAULT: {
> +        struct umem_pages *pages =
> +            umem_qemu_trigger_page_fault(state.from_umemd, &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_qemu_faulted_pages(pages);
> +        }
> +        g_free(pages);
> +        break;
> +    }
> +    case UMEM_DAEMON_ERROR:
> +        /* umem daemon hit troubles, so it warned us to stop vm execution */
> +        vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +
> +    if (state.from_umemd != NULL) {
> +        qemu_file_skip(state.from_umemd, offset);
> +    }
> +    return 0;
> +}
> +
> +static void postcopy_incoming_qemu_handle_req(void *opaque)
> +{
> +    do {
> +        int ret = postcopy_incoming_qemu_handle_req_one();
> +        if (ret == -EAGAIN) {
> +            break;
> +        }
> +    } while (state.from_umemd != NULL &&
> +             qemu_pending_size(state.from_umemd) > 0);
> +
> +    if (state.to_umemd != NULL) {
> +        if (state.faulted_pages->nr > 0) {
> +            postcopy_incoming_qemu_send_pages_present();
> +        }
> +        postcopy_incoming_qemu_fflush_to_umemd();
> +    }
> +}
> +
> +void postcopy_incoming_qemu_ready(void)
> +{
> +    umem_qemu_ready(state.to_umemd_fd);
> +
> +    state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd);
> +    state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd);
> +    qemu_set_fd_handler(state.from_umemd_fd,
> +                        postcopy_incoming_qemu_handle_req, NULL, NULL);
> +}
> +
> +static void postcopy_incoming_qemu_cleanup_umem(void)
> +{
> +    /* when qemu will quit before completing postcopy, tell umem daemon
> +       to tear down umem device and exit. */
> +    if (state.to_umemd_fd >= 0) {
> +        postcopy_incoming_qemu_queue_quit();
> +        postcopy_incoming_qemu_fflush_to_umemd();
> +    }
> +
> +    if (state.dev) {
> +        umem_dev_destroy(state.dev);
> +        state.dev = NULL;
> +    }
> +}
> +
> +void postcopy_incoming_qemu_cleanup(void)
> +{
> +    postcopy_incoming_qemu_cleanup_umem();
> +    if (state.to_umemd != NULL) {
> +        nonblock_wait_for_flush(state.to_umemd);
> +    }
> +}
> +
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
> +{
> +    uint64_t nr = DIV_ROUND_UP(size, state.host_page_size);
> +    size_t len = umem_pages_size(nr);
> +    ram_addr_t end = addr + size;
> +    struct umem_pages *pages;
> +    int i;
> +
> +    if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    pages = g_malloc(len);
> +    pages->nr = nr;
> +    for (i = 0; addr < end; addr += state.host_page_size, i++) {
> +        pages->pgoffs[i] = addr >> state.host_page_shift;
> +    }
> +    assert(state.to_umemd != NULL);
> +    umem_qemu_send_pages_unmapped(state.to_umemd->file, pages);
> +    g_free(pages);
> +    postcopy_incoming_qemu_fflush_to_umemd();
> +}
> +
> +/**************************************************************************
> + * incoming umem daemon
> + */
> +
> +static void postcopy_incoming_umem_recv_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
> +    umemd.state |= UMEM_STATE_QUIT_RECEIVED;
> +    qemu_fclose(umemd.from_qemu);
> +    umemd.from_qemu = NULL;
> +    fd_close(&umemd.from_qemu_fd);
> +}
> +
> +static void postcopy_incoming_umem_queue_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
> +    umem_daemon_quit(umemd.to_qemu->file);
> +    umemd.state |= UMEM_STATE_QUIT_QUEUED;
> +}
> +
> +static void postcopy_incoming_umem_send_eoc_req(void)
> +{
> +    struct qemu_umem_req req;
> +
> +    if (umemd.state & UMEM_STATE_EOC_SENT) {
> +        return;
> +    }
> +
> +    DPRINTF("|= UMEM_STATE_EOC_SENT\n");
> +    req.cmd = QEMU_UMEM_REQ_EOC;
> +    postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +    umemd.state |= UMEM_STATE_EOC_SENT;
> +    qemu_fclose(umemd.mig_write->file);
> +    umemd.mig_write = NULL;
> +    fd_close(&umemd.mig_write_fd);
> +}
> +
> +static void postcopy_incoming_umem_send_page_req(RAMBlock *block)
> +{
> +    struct qemu_umem_req req;
> +    int bit;
> +    uint64_t target_pgoff;
> +    int i;
> +
> +    umemd.page_request.nr = MAX_REQUESTS;
> +    umem_get_page_request(block->umem, &umemd.page_request);
> +    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
> +            block->idstr, umemd.page_request.nr,
> +            (uint64_t)umemd.page_request.pgoffs[0],
> +            (uint64_t)umemd.page_request.pgoffs[1]);
> +
> +    if (umemd.last_block_write != block) {
> +        req.cmd = QEMU_UMEM_REQ_ON_DEMAND;
> +        req.idstr = block->idstr;
> +    } else {
> +        req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> +    }
> +
> +    req.nr = 0;
> +    req.pgoffs = umemd.target_pgoffs;
> +    if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> +        for (i = 0; i < umemd.page_request.nr; i++) {
> +            target_pgoff =
> +                umemd.page_request.pgoffs[i] >> umemd.host_to_target_page_shift;
> +            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> +
> +            if (!test_and_set_bit(bit, umemd.phys_requested)) {
> +                req.pgoffs[req.nr] = target_pgoff;
> +                req.nr++;
> +            }
> +        }
> +    } else {
> +        for (i = 0; i < umemd.page_request.nr; i++) {
> +            int j;
> +            target_pgoff =
> +                umemd.page_request.pgoffs[i] << umemd.host_to_target_page_shift;
> +            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> +
> +            for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
> +                if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
> +                    req.pgoffs[req.nr] = target_pgoff + j;
> +                    req.nr++;
> +                }
> +            }
> +        }
> +    }
> +
> +    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
> +            block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
> +    if (req.nr > 0 && umemd.mig_write != NULL) {
> +        postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +        umemd.last_block_write = block;
> +    }
> +}
> +
> +static void postcopy_incoming_umem_send_pages_present(void)
> +{
> +    if (umemd.present_request->nr > 0) {
> +        umem_daemon_send_pages_present(umemd.to_qemu->file,
> +                                       umemd.present_request);
> +        umemd.present_request->nr = 0;
> +    }
> +}
> +
> +static void postcopy_incoming_umem_pages_present_one(
> +    uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t i;
> +    assert(nr <= MAX_PRESENT_REQUESTS);
> +
> +    if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) {
> +        postcopy_incoming_umem_send_pages_present();
> +    }
> +
> +    for (i = 0; i < nr; i++) {
> +        umemd.present_request->pgoffs[umemd.present_request->nr + i] =
> +            pgoffs[i] + ramblock_pgoffset;
> +    }
> +    umemd.present_request->nr += nr;
> +}
> +
> +static void postcopy_incoming_umem_pages_present(
> +    const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t left = page_cached->nr;
> +    uint32_t offset = 0;
> +
> +    while (left > 0) {
> +        uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS);
> +        postcopy_incoming_umem_pages_present_one(
> +            nr, &page_cached->pgoffs[offset], ramblock_pgoffset);
> +
> +        left -= nr;
> +        offset += nr;
> +    }
> +}
> +
> +static int postcopy_incoming_umem_ram_load(void)
> +{
> +    ram_addr_t offset;
> +    int flags;
> +    int error;
> +    void *shmem;
> +    int i;
> +    int bit;
> +
> +    if (umemd.version_id != RAM_SAVE_VERSION_ID) {
> +        return -EINVAL;
> +    }
> +
> +    offset = qemu_get_be64(umemd.mig_read);
> +
> +    flags = offset & ~TARGET_PAGE_MASK;
> +    offset &= TARGET_PAGE_MASK;
> +
> +    assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));
> +
> +    if (flags & RAM_SAVE_FLAG_EOS) {
> +        DPRINTF("RAM_SAVE_FLAG_EOS\n");
> +        postcopy_incoming_umem_send_eoc_req();
> +
> +        qemu_fclose(umemd.mig_read);
> +        umemd.mig_read = NULL;
> +        fd_close(&umemd.mig_read_fd);
> +        umemd.state |= UMEM_STATE_EOS_RECEIVED;
> +
> +        postcopy_incoming_umem_queue_quit();
> +        DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
> +        return 0;
> +    }
> +
> +    shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags,
> +                                             &umemd.last_block_read);
> +    if (!shmem) {
> +        DPRINTF("shmem == NULL\n");
> +        return -EINVAL;
> +    }
> +
> +    if (flags & RAM_SAVE_FLAG_COMPRESS) {
> +        uint8_t ch = qemu_get_byte(umemd.mig_read);
> +        memset(shmem, ch, TARGET_PAGE_SIZE);
> +    } else if (flags & RAM_SAVE_FLAG_PAGE) {
> +        qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE);
> +    }
> +
> +    error = qemu_file_get_error(umemd.mig_read);
> +    if (error) {
> +        DPRINTF("error %d\n", error);
> +        return error;
> +    }
> +
> +    umemd.page_cached.nr = 0;
> +    bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS;
> +    if (!test_and_set_bit(bit, umemd.phys_received)) {
> +        if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> +            __u64 pgoff = offset >> umemd.host_page_shift;
> +            for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
> +                umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i;
> +                umemd.page_cached.nr++;
> +            }
> +        } else {
> +            bool mark_cache = true;
> +            for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
> +                if (!test_bit(bit + i, umemd.phys_received)) {
> +                    mark_cache = false;
> +                    break;
> +                }
> +            }
> +            if (mark_cache) {
> +                umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift;
> +                umemd.page_cached.nr = 1;
> +            }
> +        }
> +    }
> +
> +    if (umemd.page_cached.nr > 0) {
> +        umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached);
> +
> +        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >= 0 &&
> +            (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) {
> +            uint64_t ramblock_pgoffset;
> +
> +            ramblock_pgoffset =
> +                umemd.last_block_read->offset >> umemd.host_page_shift;
> +            postcopy_incoming_umem_pages_present(&umemd.page_cached,
> +                                                 ramblock_pgoffset);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static bool postcopy_incoming_umem_check_umem_done(void)
> +{
> +    bool all_done = true;
> +    RAMBlock *block;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        if (umem != NULL && umem->nsets == umem->nbits) {
> +            umem_unmap_shmem(umem);
> +            umem_destroy(umem);
> +            block->umem = NULL;
> +        }
> +        if (block->umem != NULL) {
> +            all_done = false;
> +        }
> +    }
> +    return all_done;
> +}
> +
> +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages)
> +{
> +    int i;
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift;
> +        RAMBlock *block = qemu_get_ram_block(addr);
> +        addr -= block->offset;
> +        umem_remove_shmem(block->umem, addr, umemd.host_page_size);
> +    }
> +    return postcopy_incoming_umem_check_umem_done();
> +}
> +
> +static bool
> +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages)
> +{
> +    RAMBlock *block;
> +    ram_addr_t addr;
> +    int i;
> +
> +    struct qemu_umem_req req = {
> +        .cmd = QEMU_UMEM_REQ_REMOVE,
> +        .nr = 0,
> +        .pgoffs = (uint64_t*)pages->pgoffs,
> +    };
> +
> +    addr = pages->pgoffs[0] << umemd.host_page_shift;
> +    block = qemu_get_ram_block(addr);
> +
> +    for (i = 0; i < pages->nr; i++)  {
> +        int pgoff;
> +
> +        addr = pages->pgoffs[i] << umemd.host_page_shift;
> +        pgoff = addr >> TARGET_PAGE_BITS;
> +        if (!test_bit(pgoff, umemd.phys_received) &&
> +            !test_bit(pgoff, umemd.phys_requested)) {
> +            req.pgoffs[req.nr] = pgoff;
> +            req.nr++;
> +        }
> +        set_bit(pgoff, umemd.phys_received);
> +        set_bit(pgoff, umemd.phys_requested);
> +
> +        umem_remove_shmem(block->umem,
> +                          addr - block->offset, umemd.host_page_size);
> +    }
> +    if (req.nr > 0 && umemd.mig_write != NULL) {
> +        req.idstr = block->idstr;
> +        postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +    }
> +
> +    return postcopy_incoming_umem_check_umem_done();
> +}
> +
> +static void postcopy_incoming_umem_done(void)
> +{
> +    postcopy_incoming_umem_send_eoc_req();
> +    postcopy_incoming_umem_queue_quit();
> +}
> +
> +static int postcopy_incoming_umem_handle_qemu(void)
> +{
> +    int ret;
> +    int offset = 0;
> +    uint8_t cmd;
> +
> +    ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset);
> +    offset += sizeof(cmd);
> +    if (ret != sizeof(cmd)) {
> +        return -EAGAIN;
> +    }
> +    DPRINTF("cmd %c\n", cmd);
> +    switch (cmd) {
> +    case UMEM_QEMU_QUIT:
> +        postcopy_incoming_umem_recv_quit();
> +        postcopy_incoming_umem_done();
> +        break;
> +    case UMEM_QEMU_PAGE_FAULTED: {
> +        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> +                                                   &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (postcopy_incoming_umem_page_faulted(pages)){
> +            postcopy_incoming_umem_done();
> +        }
> +        g_free(pages);
> +        break;
> +    }
> +    case UMEM_QEMU_PAGE_UNMAPPED: {
> +        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> +                                                   &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (postcopy_incoming_umem_page_unmapped(pages)){
> +            postcopy_incoming_umem_done();
> +        }
> +        g_free(pages);
> +        break;
> +    }
> +    default:
> +        abort();
> +        break;
> +    }
> +    if (umemd.from_qemu != NULL) {
> +        qemu_file_skip(umemd.from_qemu, offset);
> +    }
> +    return 0;
> +}
> +
> +static void set_fd(int fd, fd_set *fds, int *nfds)
> +{
> +    FD_SET(fd, fds);
> +    if (fd > *nfds) {
> +        *nfds = fd;
> +    }
> +}
> +
> +static int postcopy_incoming_umemd_main_loop(void)
> +{
> +    fd_set writefds;
> +    fd_set readfds;
> +    int nfds;
> +    RAMBlock *block;
> +    int ret;
> +
> +    int pending_size;
> +    bool get_page_request;
> +
> +    nfds = -1;
> +    FD_ZERO(&writefds);
> +    FD_ZERO(&readfds);
> +
> +    if (umemd.mig_write != NULL) {
> +        pending_size = nonblock_pending_size(umemd.mig_write);
> +        if (pending_size > 0) {
> +            set_fd(umemd.mig_write_fd, &writefds, &nfds);
> +        }
> +    } else {
> +        pending_size = 0;
> +    }
> +
> +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2)
> +    /* If page request to the migration source is accumulated,
> +       suspend getting page fault request. */
> +    get_page_request = (pending_size <= PENDING_SIZE_MAX);
> +
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL) {
> +                set_fd(block->umem->fd, &readfds, &nfds);
> +            }
> +        }
> +    }
> +
> +    if (umemd.mig_read_fd >= 0) {
> +        set_fd(umemd.mig_read_fd, &readfds, &nfds);
> +    }
> +
> +    if (umemd.to_qemu != NULL &&
> +        nonblock_pending_size(umemd.to_qemu) > 0) {
> +        set_fd(umemd.to_qemu_fd, &writefds, &nfds);
> +    }
> +    if (umemd.from_qemu_fd >= 0) {
> +        set_fd(umemd.from_qemu_fd, &readfds, &nfds);
> +    }
> +
> +    ret = select(nfds + 1, &readfds, &writefds, NULL, NULL);
> +    if (ret == -1) {
> +        if (errno == EINTR) {
> +            return 0;
> +        }
> +        return ret;
> +    }
> +
> +    if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) {
> +        nonblock_fflush(umemd.to_qemu);
> +    }
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) {
> +                postcopy_incoming_umem_send_page_req(block);
> +            }
> +        }
> +    }
> +    if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_ram_load();
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        } while (umemd.mig_read != NULL &&
> +                 qemu_pending_size(umemd.mig_read) > 0);
> +    }
> +    if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_handle_qemu();
> +            if (ret == -EAGAIN) {
> +                break;
> +            }
> +        } while (umemd.from_qemu != NULL &&
> +                 qemu_pending_size(umemd.from_qemu) > 0);
> +    }
> +
> +    if (umemd.mig_write != NULL) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu != NULL) {
> +        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_umem_send_pages_present();
> +        }
> +        nonblock_fflush(umemd.to_qemu);
> +        if ((umemd.state & UMEM_STATE_QUIT_QUEUED) &&
> +            nonblock_pending_size(umemd.to_qemu) == 0) {
> +            DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
> +            qemu_fclose(umemd.to_qemu->file);
> +            umemd.to_qemu = NULL;
> +            fd_close(&umemd.to_qemu_fd);
> +            umemd.state |= UMEM_STATE_QUIT_SENT;
> +        }
> +    }
> +
> +    return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK;
> +}
> +
> +static void postcopy_incoming_umemd(void)
> +{
> +    ram_addr_t last_ram_offset;
> +    int nbits;
> +    RAMBlock *block;
> +    int ret;
> +
> +    qemu_daemon(1, 1);
> +    signal(SIGPIPE, SIG_IGN);
> +    DPRINTF("daemon pid: %d\n", getpid());
> +
> +    umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS);
> +    umemd.page_cached.pgoffs =
> +        g_new(__u64, MAX_REQUESTS *
> +              (TARGET_PAGE_SIZE >= umemd.host_page_size ?
> +               1: umemd.nr_host_pages_per_target_page));
> +    umemd.target_pgoffs =
> +        g_new(uint64_t, MAX_REQUESTS *
> +              MAX(umemd.nr_host_pages_per_target_page,
> +                  umemd.nr_target_pages_per_host_page));
> +    umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS));
> +    umemd.present_request->nr = 0;
> +
> +    last_ram_offset = qemu_last_ram_offset();
> +    nbits = last_ram_offset >> TARGET_PAGE_BITS;
> +    umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.last_block_read = NULL;
> +    umemd.last_block_write = NULL;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        umem->umem = NULL;      /* umem mapping area has VM_DONT_COPY flag,
> +                                   so we lost those mappings by fork */
> +        block->host = umem_map_shmem(umem);
> +        umem_close_shmem(umem);
> +    }
> +    umem_daemon_ready(umemd.to_qemu_fd);
> +    umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd);
> +
> +    /* wait for qemu to disown migration_fd */
> +    umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
> +    umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd);
> +
> +    DPRINTF("entering umemd main loop\n");
> +    for (;;) {
> +        ret = postcopy_incoming_umemd_main_loop();
> +        if (ret != 0) {
> +            break;
> +        }
> +    }
> +    DPRINTF("exiting umemd main loop\n");
> +
> +    /* This daemon forked from qemu and the parent qemu is still running.
> +     * Cleanups of linked libraries like SDL should not be triggered,
> +     * otherwise the parent qemu may use resources which was already freed.
> +     */
> +    fflush(stdout);
> +    fflush(stderr);
> +    _exit(ret < 0? EXIT_FAILURE: 0);
> +}
> diff --git a/migration-tcp.c b/migration-tcp.c
> index cf6a9b8..aa35050 100644
> --- a/migration-tcp.c
> +++ b/migration-tcp.c
> @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque)
>      } while (ret == -1 && (socket_error()) == EINTR);
>  
>      if (ret < 0) {
> -        migrate_fd_error(s);
> -        return;
> +        goto error_out;
>      }
>  
>      qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
>  
> -    if (val == 0)
> +    if (val == 0) {
> +        ret = postcopy_outgoing_create_read_socket(s);
> +        if (ret < 0) {
> +            goto error_out;
> +        }
>          migrate_fd_connect(s);
> -    else {
> +    } else {
>          DPRINTF("error connecting %d\n", val);
> -        migrate_fd_error(s);
> +        goto error_out;
>      }
> +    return;
> +
> +error_out:
> +    migrate_fd_error(s);
>  }
>  
>  int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
> @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const char *host_port)
>  
>      if (ret < 0) {
>          DPRINTF("connect failed\n");
> -        migrate_fd_error(s);
> -        return ret;
> +        goto error_out;
> +    }
> +
> +    ret = postcopy_outgoing_create_read_socket(s);
> +    if (ret < 0) {
> +        goto error_out;
>      }
>      migrate_fd_connect(s);
>      return 0;
> +
> +error_out:
> +    migrate_fd_error(s);
> +    return ret;
>  }
>  
>  static void tcp_accept_incoming_migration(void *opaque)
> @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque)
>      }
>  
>      process_incoming_migration(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(c, f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        /* now socket is disowned.
> +           So tell umem server that it's safe to use it */
> +        postcopy_incoming_qemu_ready();
> +    }
>  out:
>      close(c);
>  out2:
> diff --git a/migration-unix.c b/migration-unix.c
> index dfcf203..3707505 100644
> --- a/migration-unix.c
> +++ b/migration-unix.c
> @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque)
>  
>      qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
>  
> -    if (val == 0)
> +    if (val == 0) {
> +        ret = postcopy_outgoing_create_read_socket(s);
> +        if (ret < 0) {
> +            goto error_out;
> +        }
>          migrate_fd_connect(s);
> -    else {
> +    } else {
>          DPRINTF("error connecting %d\n", val);
> -        migrate_fd_error(s);
> +        goto error_out;
>      }
> +    return;
> +
> +error_out:
> +    migrate_fd_error(s);
>  }
>  
>  int unix_start_outgoing_migration(MigrationState *s, const char *path)
> @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, const char *path)
>  
>      if (ret < 0) {
>          DPRINTF("connect failed\n");
> -        migrate_fd_error(s);
> -        return ret;
> +        goto error_out;
> +    }
> +
> +    ret = postcopy_outgoing_create_read_socket(s);
> +    if (ret < 0) {
> +        goto error_out;
>      }
>      migrate_fd_connect(s);
>      return 0;
> +
> +error_out:
> +    migrate_fd_error(s);
> +    return ret;
>  }
>  
>  static void unix_accept_incoming_migration(void *opaque)
> @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque)
>      }
>  
>      process_incoming_migration(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_fork_umemd(c, f);
> +    }
>      qemu_fclose(f);
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_ready();
> +    }
>  out:
>      close(c);
>  out2:
> diff --git a/migration.c b/migration.c
> index 0149ab3..51efe44 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -39,6 +39,11 @@ enum {
>      MIG_STATE_COMPLETED,
>  };
>  
> +enum {
> +    MIG_SUBSTATE_PRECOPY,
> +    MIG_SUBSTATE_POSTCOPY,
> +};
> +
>  #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
>  
>  static NotifierList migration_state_notifiers =
> @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void *opaque)
>          return;
>      }
>  
> +    if (s->substate == MIG_SUBSTATE_POSTCOPY) {
> +        /* PRINTF("postcopy background\n"); */
> +        ret = postcopy_outgoing_ram_save_background(s->mon, s->file,
> +                                                    s->postcopy);
> +        if (ret > 0) {
> +            migrate_fd_completed(s);
> +        } else if (ret < 0) {
> +            migrate_fd_error(s);
> +        }
> +        return;
> +    }
> +
>      DPRINTF("iterate\n");
>      ret = qemu_savevm_state_iterate(s->mon, s->file);
>      if (ret < 0) {
> @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque)
>          DPRINTF("done iterating\n");
>          vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
>  
> +        if (s->params.postcopy) {
> +            if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
> +                migrate_fd_error(s);
> +                if (old_vm_running) {
> +                    vm_start();
> +                }
> +                return;
> +            }
> +            s->substate = MIG_SUBSTATE_POSTCOPY;
> +            s->postcopy = postcopy_outgoing_begin(s);
> +            return;
> +        }
> +
>          if (qemu_savevm_state_complete(s->mon, s->file) < 0) {
>              migrate_fd_error(s);
>          } else {
> @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s)
>      int ret;
>  
>      s->state = MIG_STATE_ACTIVE;
> +    s->substate = MIG_SUBSTATE_PRECOPY;
>      s->file = qemu_fopen_ops_buffered(s,
>                                        s->bandwidth_limit,
>                                        migrate_fd_put_buffer,
> diff --git a/migration.h b/migration.h
> index 90ae362..2809e99 100644
> --- a/migration.h
> +++ b/migration.h
> @@ -40,6 +40,12 @@ struct MigrationState
>      int (*write)(MigrationState *s, const void *buff, size_t size);
>      void *opaque;
>      MigrationParams params;
> +
> +    /* for postcopy */
> +    int substate;              /* precopy or postcopy */
> +    int fd_read;
> +    QEMUFile *file_read;        /* connection from the detination */
> +    void *postcopy;
>  };
>  
>  void process_incoming_migration(QEMUFile *f);
> @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void);
>  uint64_t ram_bytes_transferred(void);
>  uint64_t ram_bytes_total(void);
>  
> +void ram_save_set_params(const MigrationParams *params, void *opaque);
>  void sort_ram_list(void);
>  int ram_save_block(QEMUFile *f);
>  void ram_save_memory_set_dirty(void);
> @@ -107,7 +114,30 @@ void migrate_add_blocker(Error *reason);
>   */
>  void migrate_del_blocker(Error *reason);
>  
> +/* For outgoing postcopy */
> +int postcopy_outgoing_create_read_socket(MigrationState *s);
> +int postcopy_outgoing_ram_save_live(Monitor *mon,
> +                                    QEMUFile *f, int stage, void *opaque);
> +void *postcopy_outgoing_begin(MigrationState *s);
> +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f,
> +                                          void *postcopy);
> +
> +/* For incoming postcopy */
>  extern bool incoming_postcopy;
>  extern unsigned long incoming_postcopy_flags;
>  
> +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy);
> +void postcopy_incoming_ram_alloc(const char *name,
> +                                 size_t size, uint8_t **hostp, UMem **umemp);
> +void postcopy_incoming_ram_free(UMem *umem);
> +void postcopy_incoming_prepare(void);
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id);
> +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read);
> +void postcopy_incoming_qemu_ready(void);
> +void postcopy_incoming_qemu_cleanup(void);
> +#ifdef NEED_CPU_H
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size);
> +#endif
> +
>  #endif
> diff --git a/qemu-common.h b/qemu-common.h
> index 725922b..d74a8c9 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState;
>  
>  struct Monitor;
>  typedef struct Monitor Monitor;
> +typedef struct UMem UMem;
>  
>  /* we put basic includes here to avoid repeating them in device drivers */
>  #include <stdlib.h>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 5c5b8f3..19e20f9 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, QEMU_OPTION_postcopy_flags,
>      "-postcopy-flags unsigned-int(flags)\n"
>      "	                flags for postcopy incoming migration\n"
>      "                   when -incoming and -postcopy are specified.\n"
> -    "                   This is for benchmark/debug purpose (default: 0)\n",
> +    "                   This is for benchmark/debug purpose (default: 0)\n"
> +    "                   Currently supprted flags are\n"
> +    "                   1: enable fault request from umemd to qemu\n"
> +    "                      (default: disabled)\n",
>      QEMU_ARCH_ALL)
>  STEXI
>  @item -postcopy-flags int

Can you move umem.c and umem.h to a separate patch, please?
That would make this patch shorter and easier to review.
> diff --git a/umem.c b/umem.c
> new file mode 100644
> index 0000000..b7be006
> --- /dev/null
> +++ b/umem.c
> @@ -0,0 +1,379 @@
> +/*
> + * umem.c: user process backed memory module for postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +
> +#include <linux/umem.h>
> +
> +#include "bitops.h"
> +#include "sysemu.h"
> +#include "hw/hw.h"
> +#include "umem.h"
> +
> +//#define DEBUG_UMEM
> +#ifdef DEBUG_UMEM
> +#include <sys/syscall.h>
> +#define DPRINTF(format, ...)                                            \
> +    do {                                                                \
> +        printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid),    \
> +               __func__, __LINE__, ## __VA_ARGS__);                     \
> +    } while (0)
> +#else
> +#define DPRINTF(format, ...)    do { } while (0)
> +#endif
> +
> +#define DEV_UMEM        "/dev/umem"
> +
> +struct UMemDev {
> +    int fd;
> +    int page_shift;
> +};
> +
> +UMemDev *umem_dev_new(void)
> +{
> +    UMemDev *umem_dev;
> +    int umem_dev_fd = open(DEV_UMEM, O_RDWR);
> +    if (umem_dev_fd < 0) {
> +        perror("can't open "DEV_UMEM);
> +        abort();
> +    }
> +
> +    umem_dev = g_new(UMemDev, 1);
> +    umem_dev->fd = umem_dev_fd;
> +    umem_dev->page_shift = ffs(getpagesize()) - 1;
> +    return umem_dev;
> +}
> +
> +void umem_dev_destroy(UMemDev *dev)
> +{
> +    close(dev->fd);
> +    g_free(dev);
> +}
> +
> +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name)
> +{
> +    struct umem_create create = {
> +        .size = size,
> +        .async_req_max = 0,
> +        .sync_req_max = 0,
> +    };
> +    UMem *umem;
> +
> +    snprintf(create.name.id, sizeof(create.name.id),
> +             "pid-%"PRId64, (uint64_t)getpid());
> +    create.name.id[UMEM_ID_MAX - 1] = 0;
> +    strncpy(create.name.name, name, sizeof(create.name.name));
> +    create.name.name[UMEM_NAME_MAX - 1] = 0;
> +
> +    assert((size % getpagesize()) == 0);
> +    if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) {
> +        perror("UMEM_DEV_CREATE_UMEM");
> +        abort();
> +    }
> +    if (ftruncate(create.shmem_fd, create.size) < 0) {
> +        perror("truncate(\"shmem_fd\")");
> +        abort();
> +    }
> +
> +    umem = g_new(UMem, 1);
> +    umem->nbits = 0;
> +    umem->nsets = 0;
> +    umem->faulted = NULL;
> +    umem->page_shift = dev->page_shift;
> +    umem->fd = create.umem_fd;
> +    umem->shmem_fd = create.shmem_fd;
> +    umem->size = create.size;
> +    umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
> +                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +    if (umem->umem == MAP_FAILED) {
> +        perror("mmap(UMem) failed");
> +        abort();
> +    }
> +    return umem;
> +}
> +
> +void umem_mmap(UMem *umem)
> +{
> +    void *ret = mmap(umem->umem, umem->size,
> +                     PROT_EXEC | PROT_READ | PROT_WRITE,
> +                     MAP_PRIVATE | MAP_FIXED, umem->fd, 0);
> +    if (ret == MAP_FAILED) {
> +        perror("umem_mmap(UMem) failed");
> +        abort();
> +    }
> +}
> +
> +void umem_destroy(UMem *umem)
> +{
> +    if (umem->fd != -1) {
> +        close(umem->fd);
> +    }
> +    if (umem->shmem_fd != -1) {
> +        close(umem->shmem_fd);
> +    }
> +    g_free(umem->faulted);
> +    g_free(umem);
> +}
> +
> +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request)
> +{
> +    if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) {
> +        perror("daemon: UMEM_GET_PAGE_REQUEST");
> +        abort();
> +    }
> +}
> +
> +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached)
> +{
> +    if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) {
> +        perror("daemon: UMEM_MARK_PAGE_CACHED");
> +        abort();
> +    }
> +}
> +
> +void umem_unmap(UMem *umem)
> +{
> +    munmap(umem->umem, umem->size);
> +    umem->umem = NULL;
> +}
> +
> +void umem_close(UMem *umem)
> +{
> +    close(umem->fd);
> +    umem->fd = -1;
> +}
> +
> +void *umem_map_shmem(UMem *umem)
> +{
> +    umem->nbits = umem->size >> umem->page_shift;
> +    umem->nsets = 0;
> +    umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits));
> +
> +    umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +                       umem->shmem_fd, 0);
> +    if (umem->shmem == MAP_FAILED) {
> +        perror("daemon: mmap(\"shmem\")");
> +        abort();
> +    }
> +    return umem->shmem;
> +}
> +
> +void umem_unmap_shmem(UMem *umem)
> +{
> +    munmap(umem->shmem, umem->size);
> +    umem->shmem = NULL;
> +}
> +
> +void umem_remove_shmem(UMem *umem, size_t offset, size_t size)
> +{
> +    int s = offset >> umem->page_shift;
> +    int e = (offset + size) >> umem->page_shift;
> +    int i;
> +
> +    for (i = s; i < e; i++) {
> +        if (!test_and_set_bit(i, umem->faulted)) {
> +            umem->nsets++;
> +#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE)
> +            madvise(umem->shmem + offset, size, MADV_REMOVE);
> +#endif
> +        }
> +    }
> +}
> +
> +void umem_close_shmem(UMem *umem)
> +{
> +    close(umem->shmem_fd);
> +    umem->shmem_fd = -1;
> +}
> +
> +/***************************************************************************/
> +/* qemu <-> umem daemon communication */
> +
> +size_t umem_pages_size(uint64_t nr)
> +{
> +    return sizeof(struct umem_pages) + nr * sizeof(uint64_t);
> +}
> +
> +static void umem_write_cmd(int fd, uint8_t cmd)
> +{
> +    DPRINTF("write cmd %c\n", cmd);
> +
> +    for (;;) {
> +        ssize_t ret = write(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            } else if (errno == EPIPE) {
> +                perror("pipe");
> +                DPRINTF("write cmd %c %zd %d: pipe is closed\n",
> +                        cmd, ret, errno);
> +                break;
> +            }
> +
> +            perror("pipe");
> +            DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +}
> +
> +static void umem_read_cmd(int fd, uint8_t expect)
> +{
> +    uint8_t cmd;
> +    for (;;) {
> +        ssize_t ret = read(fd, &cmd, 1);
> +        if (ret == -1) {
> +            if (errno == EINTR) {
> +                continue;
> +            }
> +            perror("pipe");
> +            DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno);
> +            abort();
> +        }
> +
> +        if (ret == 0) {
> +            DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret);
> +            abort();
> +        }
> +
> +        break;
> +    }
> +
> +    DPRINTF("read cmd %c\n", cmd);
> +    if (cmd != expect) {
> +        DPRINTF("cmd %c expect %d\n", cmd, expect);
> +        abort();
> +    }
> +}
> +
> +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset)
> +{
> +    int ret;
> +    uint64_t nr;
> +    size_t size;
> +    struct umem_pages *pages;
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset);
> +    *offset += sizeof(nr);
> +    DPRINTF("ret %d nr %ld\n", ret, nr);
> +    if (ret != sizeof(nr) || nr == 0) {
> +        return NULL;
> +    }
> +
> +    size = umem_pages_size(nr);
> +    pages = g_malloc(size);
> +    pages->nr = nr;
> +    size -= sizeof(pages->nr);
> +
> +    ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset);
> +    *offset += size;
> +    if (ret != size) {
> +        g_free(pages);
> +        return NULL;
> +    }
> +    return pages;
> +}
> +
> +static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages)
> +{
> +    size_t len = umem_pages_size(pages->nr);
> +    qemu_put_buffer(f, (const uint8_t*)pages, len);
> +}
> +
> +/* umem daemon -> qemu */
> +void umem_daemon_ready(int to_qemu_fd)
> +{
> +    umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY);
> +}
> +
> +void umem_daemon_quit(QEMUFile *to_qemu)
> +{
> +    qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT);
> +}
> +
> +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> +                                    struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT);
> +    umem_send_pages(to_qemu, pages);
> +}
> +
> +void umem_daemon_wait_for_qemu(int from_qemu_fd)
> +{
> +    umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY);
> +}
> +
> +/* qemu -> umem daemon */
> +void umem_qemu_wait_for_daemon(int from_umemd_fd)
> +{
> +    umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY);
> +}
> +
> +void umem_qemu_ready(int to_umemd_fd)
> +{
> +    umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY);
> +}
> +
> +void umem_qemu_quit(QEMUFile *to_umemd)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_QUIT);
> +}
> +
> +/* qemu side handler */
> +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> +                                                int *offset)
> +{
> +    uint64_t i;
> +    int page_shift = ffs(getpagesize()) - 1;
> +    struct umem_pages *pages = umem_recv_pages(from_umemd, offset);
> +    if (pages == NULL) {
> +        return NULL;
> +    }
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << page_shift;
> +
> +        /* make pages present by forcibly triggering page fault. */
> +        volatile uint8_t *ram = qemu_get_ram_ptr(addr);
> +        uint8_t dummy_read = ram[0];
> +        (void)dummy_read;   /* suppress unused variable warning */
> +    }
> +
> +    return pages;
> +}
> +
> +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> +                                  const struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED);
> +    umem_send_pages(to_umemd, pages);
> +}
> +
> +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> +                                   const struct umem_pages *pages)
> +{
> +    qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED);
> +    umem_send_pages(to_umemd, pages);
> +}
> diff --git a/umem.h b/umem.h
> new file mode 100644
> index 0000000..5ca19ef
> --- /dev/null
> +++ b/umem.h
> @@ -0,0 +1,105 @@
> +/*
> + * umem.h: user process backed memory module for postcopy livemigration
> + *
> + * Copyright (c) 2011
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef QEMU_UMEM_H
> +#define QEMU_UMEM_H
> +
> +#include <linux/umem.h>
> +
> +#include "qemu-common.h"
> +
> +typedef struct UMemDev UMemDev;
> +
> +struct UMem {
> +    void *umem;
> +    int fd;
> +    void *shmem;
> +    int shmem_fd;
> +    uint64_t size;
> +
> +    /* indexed by host page size */
> +    int page_shift;
> +    int nbits;
> +    int nsets;
> +    unsigned long *faulted;
> +};
> +
> +UMemDev *umem_dev_new(void);
> +void umem_dev_destroy(UMemDev *dev);
> +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name);
> +void umem_mmap(UMem *umem);
> +
> +void umem_destroy(UMem *umem);
> +
> +/* umem device operations */
> +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request);
> +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached);
> +void umem_unmap(UMem *umem);
> +void umem_close(UMem *umem);
> +
> +/* umem shmem operations */
> +void *umem_map_shmem(UMem *umem);
> +void umem_unmap_shmem(UMem *umem);
> +void umem_remove_shmem(UMem *umem, size_t offset, size_t size);
> +void umem_close_shmem(UMem *umem);
> +
> +/* qemu on source <-> umem daemon communication */
> +
> +struct umem_pages {
> +    uint64_t nr;        /* nr = 0 means completed */
> +    uint64_t pgoffs[0];
> +};
> +
> +/* daemon -> qemu */
> +#define UMEM_DAEMON_READY               'R'
> +#define UMEM_DAEMON_QUIT                'Q'
> +#define UMEM_DAEMON_TRIGGER_PAGE_FAULT  'T'
> +#define UMEM_DAEMON_ERROR               'E'
> +
> +/* qemu -> daemon */
> +#define UMEM_QEMU_READY                 'r'
> +#define UMEM_QEMU_QUIT                  'q'
> +#define UMEM_QEMU_PAGE_FAULTED          't'
> +#define UMEM_QEMU_PAGE_UNMAPPED         'u'
> +
> +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset);
> +size_t umem_pages_size(uint64_t nr);
> +
> +/* for umem daemon */
> +void umem_daemon_ready(int to_qemu_fd);
> +void umem_daemon_wait_for_qemu(int from_qemu_fd);
> +void umem_daemon_quit(QEMUFile *to_qemu);
> +void umem_daemon_send_pages_present(QEMUFile *to_qemu,
> +                                    struct umem_pages *pages);
> +
> +/* for qemu */
> +void umem_qemu_wait_for_daemon(int from_umemd_fd);
> +void umem_qemu_ready(int to_umemd_fd);
> +void umem_qemu_quit(QEMUFile *to_umemd);
> +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd,
> +                                                int *offset);
> +void umem_qemu_send_pages_present(QEMUFile *to_umemd,
> +                                  const struct umem_pages *pages);
> +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd,
> +                                   const struct umem_pages *pages);
> +
> +#endif /* QEMU_UMEM_H */
> diff --git a/vl.c b/vl.c
> index 5430b8c..17427a0 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp)
>      default_drive(default_sdcard, snapshot, machine->use_scsi,
>                    IF_SD, 0, SD_OPTS);
>  
> -    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL,
> -                         ram_save_live, NULL, ram_load, NULL);
> +    if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) {
> +        exit(1);
> +    }
> +    register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID,
> +                         ram_save_set_params, ram_save_live, NULL,
> +                         ram_load, NULL);
>  
>      if (nb_numa_nodes > 0) {
>          int i;
> @@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp)
>  
>      if (incoming) {
>          runstate_set(RUN_STATE_INMIGRATE);
> +        if (incoming_postcopy) {
> +            postcopy_incoming_prepare();
> +        }

how about moving postcopy_incoming_prepare into qemu_start_incoming_migration ?

>          int ret = qemu_start_incoming_migration(incoming);
>          if (ret < 0) {
>              fprintf(stderr, "Migration failed. Exit code %s(%d), exiting.\n",
> @@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp)
>      bdrv_close_all();
>      pause_all_vcpus();
>      net_cleanup();
> +    if (incoming_postcopy) {
> +        postcopy_incoming_qemu_cleanup();
> +    }
>      res_free();
>  
>      return 0;

Orit

  reply	other threads:[~2011-12-29 15:52 UTC|newest]

Thread overview: 88+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-12-29  1:25 [PATCH 00/21][RFC] postcopy live migration Isaku Yamahata
2011-12-29  1:25 ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 01/21] arch_init: export sort_ram_list() and ram_save_block() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 02/21] arch_init: export RAM_SAVE_xxx flags for postcopy Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 03/21] arch_init/ram_save: introduce constant for ram save version = 4 Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 04/21] arch_init: refactor host_from_stream_offset() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 05/21] arch_init/ram_save_live: factor out RAM_SAVE_FLAG_MEM_SIZE case Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 06/21] arch_init: refactor ram_save_block() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 07/21] arch_init/ram_save_live: factor out ram_save_limit Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 08/21] arch_init/ram_load: refactor ram_load Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 09/21] exec.c: factor out qemu_get_ram_ptr() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 10/21] exec.c: export last_ram_offset() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 11/21] savevm: export qemu_peek_buffer, qemu_peek_byte, qemu_file_skip Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 12/21] savevm: qemu_pending_size() to return pending buffered size Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 13/21] savevm, buffered_file: introduce method to drain buffer of buffered file Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 14/21] migration: export migrate_fd_completed() and migrate_fd_cleanup() Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 15/21] migration: factor out parameters into MigrationParams Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 16/21] umem.h: import Linux umem.h Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 17/21] update-linux-headers.sh: teach umem.h to update-linux-headers.sh Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 18/21] configure: add CONFIG_POSTCOPY option Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 19/21] postcopy: introduce -postcopy and -postcopy-flags option Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:25 ` [PATCH 20/21] postcopy outgoing: add -p and -n option to migrate command Isaku Yamahata
2011-12-29  1:25   ` [Qemu-devel] " Isaku Yamahata
2011-12-29  1:26 ` [PATCH 21/21] postcopy: implement postcopy livemigration Isaku Yamahata
2011-12-29  1:26   ` [Qemu-devel] " Isaku Yamahata
2011-12-29 15:51   ` Orit Wasserman [this message]
2011-12-29 15:51     ` Orit Wasserman
2012-01-04  3:34     ` Isaku Yamahata
2012-01-04  3:34       ` [Qemu-devel] " Isaku Yamahata
2011-12-29 16:06   ` Avi Kivity
2011-12-29 16:06     ` [Qemu-devel] " Avi Kivity
2012-01-04  3:29     ` Isaku Yamahata
2012-01-04  3:29       ` [Qemu-devel] " Isaku Yamahata
2012-01-12 14:15       ` Avi Kivity
2012-01-12 14:15         ` [Qemu-devel] " Avi Kivity
2011-12-29 22:39 ` [PATCH 00/21][RFC] postcopy live migration Anthony Liguori
2011-12-29 22:39   ` [Qemu-devel] " Anthony Liguori
2012-01-01  9:43   ` Orit Wasserman
2012-01-01  9:43     ` [Qemu-devel] " Orit Wasserman
2012-01-01 16:27     ` Stefan Hajnoczi
2012-01-01 16:27       ` Stefan Hajnoczi
2012-01-02  9:28       ` Dor Laor
2012-01-02  9:28         ` Dor Laor
2012-01-02 17:22         ` Stefan Hajnoczi
2012-01-02 17:22           ` [Qemu-devel] " Stefan Hajnoczi
2012-01-01  9:52   ` Dor Laor
2012-01-01  9:52     ` [Qemu-devel] " Dor Laor
2012-01-04  1:30     ` Takuya Yoshikawa
2012-01-04  1:30       ` [Qemu-devel] " Takuya Yoshikawa
2012-01-04  3:48     ` Michael Roth
2012-01-04  3:48       ` [Qemu-devel] " Michael Roth
2012-01-04  3:51   ` Isaku Yamahata
2012-01-04  3:51     ` Isaku Yamahata
     [not found] ` <BLU0-SMTP161AC380D472854F48E33A5BC9A0@phx.gbl>
2012-01-11  2:45   ` 回复: " Isaku Yamahata
2012-01-11  2:45     ` [Qemu-devel] " Isaku Yamahata
2012-01-12  8:29     ` thfbjyddx
2012-01-12  8:29       ` [Qemu-devel] " thfbjyddx
2012-01-12  8:54       ` 回复: [PATCH 00/21][RFC] postcopy live?migration Isaku Yamahata
2012-01-12  8:54         ` [Qemu-devel] " Isaku Yamahata
2012-01-12 13:26         ` thfbjyddx
2012-01-12 13:26           ` [Qemu-devel] " thfbjyddx
2012-01-16  6:51           ` Isaku Yamahata
2012-01-16  6:51             ` [Qemu-devel] " Isaku Yamahata
2012-01-16 10:17             ` Isaku Yamahata
2012-01-16 10:17               ` [Qemu-devel] " Isaku Yamahata
2012-03-12  8:36               ` thfbjyddx
2012-03-12  8:36                 ` [Qemu-devel] " thfbjyddx
2012-03-13  3:21                 ` Isaku Yamahata
2012-03-13  3:21                   ` [Qemu-devel] " Isaku Yamahata

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4EFC8C88.70701@redhat.com \
    --to=owasserm@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=qemu-devel@nongnu.org \
    --cc=satoshi.itoh@aist.go.jp \
    --cc=t.hirofuchi@aist.go.jp \
    --cc=yamahata@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.