From mboxrd@z Thu Jan 1 00:00:00 1970 From: Orit Wasserman Subject: Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration Date: Thu, 29 Dec 2011 17:51:36 +0200 Message-ID: <4EFC8C88.70701@redhat.com> References: Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org, qemu-devel@nongnu.org, t.hirofuchi@aist.go.jp To: Isaku Yamahata , satoshi.itoh@aist.go.jp Return-path: Received: from mx1.redhat.com ([209.132.183.28]:38353 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752830Ab1L2Pwn (ORCPT ); Thu, 29 Dec 2011 10:52:43 -0500 In-Reply-To: Sender: kvm-owner@vger.kernel.org List-ID: Hi, A general comment: this patch is a bit too long, which makes it hard to review. Can you split it, please? On 12/29/2011 03:26 AM, Isaku Yamahata wrote: > This patch implements postcopy livemigration. > > Signed-off-by: Isaku Yamahata > --- > Makefile.target | 4 + > arch_init.c | 26 +- > cpu-all.h | 7 + > exec.c | 20 +- > migration-exec.c | 8 + > migration-fd.c | 30 + > migration-postcopy-stub.c | 77 ++ > migration-postcopy.c | 1891 +++++++++++++++++++++++++++++++++++++++++++++ > migration-tcp.c | 37 +- > migration-unix.c | 32 +- > migration.c | 31 + > migration.h | 30 + > qemu-common.h | 1 + > qemu-options.hx | 5 +- > umem.c | 379 +++++++++ > umem.h | 105 +++ > vl.c | 14 +- > 17 files changed, 2677 insertions(+), 20 deletions(-) > create mode 100644 migration-postcopy-stub.c > create mode 100644 migration-postcopy.c > create mode 100644 umem.c > create mode 100644 umem.h > > diff --git a/Makefile.target b/Makefile.target > index 3261383..d94c53f 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h > CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y) > CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y) > CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y) > +CONFIG_NO_POSTCOPY = $(if $(subst 
n,,$(CONFIG_POSTCOPY)),n,y) > > include ../config-host.mak > include config-devices.mak > @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o > obj-y += memory.o > LIBS+=-lz > > +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o > +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o > + > QEMU_CFLAGS += $(VNC_TLS_CFLAGS) > QEMU_CFLAGS += $(VNC_SASL_CFLAGS) > QEMU_CFLAGS += $(VNC_JPEG_CFLAGS) > diff --git a/arch_init.c b/arch_init.c > index bc53092..8b3130d 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch) > return 1; > } > > +static bool outgoing_postcopy = false; > + > +void ram_save_set_params(const MigrationParams *params, void *opaque) > +{ > + outgoing_postcopy = params->postcopy; > +} > + > static RAMBlock *last_block_sent = NULL; > > int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) > @@ -284,6 +291,17 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > uint64_t expected_time = 0; > int ret; > > + if (stage == 1) { > + last_block_sent = NULL; > + > + bytes_transferred = 0; > + last_block = NULL; > + last_offset = 0; The line order is changed and a new empty line is added here. > + } > + if (outgoing_postcopy) { > + return postcopy_outgoing_ram_save_live(mon, f, stage, opaque); > + } > + I would just call unregister_savevm_live() and then register_savevm_live(..., postcopy_outgoing_ram_save_live, ...) when starting the outgoing postcopy migration. 
> if (stage < 0) { > cpu_physical_memory_set_dirty_tracking(0); > return 0; > @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > } > > if (stage == 1) { > - bytes_transferred = 0; > - last_block_sent = NULL; > - last_block = NULL; > - last_offset = 0; > sort_ram_list(); > > /* Make sure all dirty bits are set */ > @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id) > int flags; > int error; > > + if (incoming_postcopy) { > + return postcopy_incoming_ram_load(f, opaque, version_id); > + } > + Why not call register_savevm_live(..., postcopy_incoming_ram_load, ...) when starting the guest with postcopy_incoming? > if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) { > return -EINVAL; > } > diff --git a/cpu-all.h b/cpu-all.h > index 0244f7a..2e9d8a7 100644 > --- a/cpu-all.h > +++ b/cpu-all.h > @@ -475,6 +475,9 @@ extern ram_addr_t ram_size; > /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ > #define RAM_PREALLOC_MASK (1 << 0) > > +/* RAM is allocated via umem for postcopy incoming mode */ > +#define RAM_POSTCOPY_UMEM_MASK (1 << 1) > + > typedef struct RAMBlock { > uint8_t *host; > ram_addr_t offset; > @@ -485,6 +488,10 @@ typedef struct RAMBlock { > #if defined(__linux__) && !defined(TARGET_S390X) > int fd; > #endif > + > +#ifdef CONFIG_POSTCOPY > + UMem *umem; /* for incoming postcopy mode */ > +#endif > } RAMBlock; > > typedef struct RAMList { > diff --git a/exec.c b/exec.c > index c8c6692..90b0491 100644 > --- a/exec.c > +++ b/exec.c > @@ -35,6 +35,7 @@ > #include "qemu-timer.h" > #include "memory.h" > #include "exec-memory.h" > +#include "migration.h" > #if defined(CONFIG_USER_ONLY) > #include > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, > new_block->host = host; > new_block->flags |= RAM_PREALLOC_MASK; > } else { > +#ifdef CONFIG_POSTCOPY > + if (incoming_postcopy) { > + 
postcopy_incoming_ram_alloc(name, size, > + &new_block->host, &new_block->umem); > + new_block->flags |= RAM_POSTCOPY_UMEM_MASK; > + } else > +#endif > if (mem_path) { > #if defined (__linux__) && !defined(TARGET_S390X) > new_block->host = file_ram_alloc(new_block, size, mem_path); > @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr) > QLIST_REMOVE(block, next); > if (block->flags & RAM_PREALLOC_MASK) { > ; > - } else if (mem_path) { > + } > +#ifdef CONFIG_POSTCOPY > + else if (block->flags & RAM_POSTCOPY_UMEM_MASK) { > + postcopy_incoming_ram_free(block->umem); > + } > +#endif > + else if (mem_path) { > #if defined (__linux__) && !defined(TARGET_S390X) > if (block->fd) { > munmap(block->host, block->length); > @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) > } else { > flags = MAP_FIXED; > munmap(vaddr, length); > + if (block->flags & RAM_POSTCOPY_UMEM_MASK) { > + postcopy_incoming_qemu_pages_unmapped(addr, length); > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; > + } > if (mem_path) { > #if defined(__linux__) && !defined(TARGET_S390X) > if (block->fd) { > diff --git a/migration-exec.c b/migration-exec.c > index e14552e..2bd0c3b 100644 > --- a/migration-exec.c > +++ b/migration-exec.c > @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const char *command) > { > FILE *f; > > + if (s->params.postcopy) { > + return -ENOSYS; > + } > + > f = popen(command, "w"); > if (f == NULL) { > DPRINTF("Unable to popen exec target\n"); > @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command) > { > QEMUFile *f; > > + if (incoming_postcopy) { > + return -ENOSYS; > + } > + > DPRINTF("Attempting to start an incoming migration\n"); > f = qemu_popen_cmd(command, "r"); > if(f == NULL) { > diff --git a/migration-fd.c b/migration-fd.c > index 6211124..5a62ab9 100644 > --- a/migration-fd.c > +++ b/migration-fd.c > @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const char *fdname) > 
s->write = fd_write; > s->close = fd_close; > > + if (s->params.postcopy) { > + int flags = fcntl(s->fd, F_GETFL); > + if ((flags & O_ACCMODE) != O_RDWR) { > + goto err_after_open; > + } > + > + s->fd_read = dup(s->fd); > + if (s->fd_read == -1) { > + goto err_after_open; > + } > + s->file_read = qemu_fdopen(s->fd_read, "r"); > + if (s->file_read == NULL) { > + close(s->fd_read); > + goto err_after_open; > + } > + } > + > migrate_fd_connect(s); > return 0; > > @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque) > > process_incoming_migration(f); > qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, NULL); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_ready(); > + } > + return; > } > > int fd_start_incoming_migration(const char *infd) > @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd) > DPRINTF("Attempting to start an incoming migration via fd\n"); > > fd = strtol(infd, NULL, 0); > + if (incoming_postcopy) { > + int flags = fcntl(fd, F_GETFL); > + if ((flags & O_ACCMODE) != O_RDWR) { > + return -EINVAL; > + } > + } > f = qemu_fdopen(fd, "rb"); > if(f == NULL) { > DPRINTF("Unable to apply qemu wrapper to file descriptor\n"); > diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c > new file mode 100644 > index 0000000..0b78de7 > --- /dev/null > +++ b/migration-postcopy-stub.c > @@ -0,0 +1,77 @@ > +/* > + * migration-postcopy-stub.c: postcopy livemigration > + * stub functions for non-supported hosts > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the 
Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#include "sysemu.h" > +#include "migration.h" > + > +int postcopy_outgoing_create_read_socket(MigrationState *s) > +{ > + return -ENOSYS; > +} > + > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque) > +{ > + return -ENOSYS; > +} > + > +void *postcopy_outgoing_begin(MigrationState *ms) > +{ > + return NULL; > +} > + > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy) > +{ > + return -ENOSYS; > +} > + > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) > +{ > + return -ENOSYS; > +} > + > +void postcopy_incoming_prepare(void) > +{ > +} > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) > +{ > + return -ENOSYS; > +} > + > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) > +{ > +} > + > +void postcopy_incoming_qemu_ready(void) > +{ > +} > + > +void postcopy_incoming_qemu_cleanup(void) > +{ > +} > + > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) > +{ > +} > diff --git a/migration-postcopy.c b/migration-postcopy.c > new file mode 100644 > index 0000000..ed0d574 > --- /dev/null > +++ b/migration-postcopy.c > @@ -0,0 +1,1891 @@ > +/* > + * migration-postcopy.c: postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * 
under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see <http://www.gnu.org/licenses/>. > + */ > + > +#include "bitmap.h" > +#include "sysemu.h" > +#include "hw/hw.h" > +#include "arch_init.h" > +#include "migration.h" > +#include "umem.h" > + > +#include "memory.h" > +#define WANT_EXEC_OBSOLETE > +#include "exec-obsolete.h" > + > +//#define DEBUG_POSTCOPY > +#ifdef DEBUG_POSTCOPY > +#include <sys/syscall.h> > +#define DPRINTF(fmt, ...) \ > + do { \ > + printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid), \ > + __func__, __LINE__, ## __VA_ARGS__); \ > + } while (0) > +#else > +#define DPRINTF(fmt, ...) do { } while (0) > +#endif > + > +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) > + > +static void fd_close(int *fd) > +{ > + if (*fd >= 0) { > + close(*fd); > + *fd = -1; > + } > +} > + > +/*************************************************************************** > + * QEMUFile for non blocking pipe > + */ > + > +/* read only */ > +struct QEMUFilePipe { > + int fd; > + QEMUFile *file; > +}; Why not use QEMUFileSocket? 
> +typedef struct QEMUFilePipe QEMUFilePipe; > + > +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) > +{ > + QEMUFilePipe *s = opaque; > + ssize_t len = 0; > + > + while (size > 0) { > + ssize_t ret = read(s->fd, buf, size); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } > + if (len == 0) { > + len = -errno; > + } > + break; > + } > + > + if (ret == 0) { > + /* the write end of the pipe is closed */ > + break; > + } > + len += ret; > + buf += ret; > + size -= ret; > + } > + > + return len; > +} > + > +static int pipe_close(void *opaque) > +{ > + QEMUFilePipe *s = opaque; > + g_free(s); > + return 0; > +} > + > +static QEMUFile *qemu_fopen_pipe(int fd) > +{ > + QEMUFilePipe *s = g_malloc0(sizeof(*s)); > + > + s->fd = fd; > + fcntl_setfl(fd, O_NONBLOCK); > + s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close, > + NULL, NULL, NULL); > + return s->file; > +} > + > +/* write only */ > +struct QEMUFileNonblock { > + int fd; > + QEMUFile *file; > + > + /* for pipe-write nonblocking mode */ > +#define BUF_SIZE_INC (32 * 1024) /* = IO_BUF_SIZE */ > + uint8_t *buffer; > + size_t buffer_size; > + size_t buffer_capacity; > + bool freeze_output; > +}; > +typedef struct QEMUFileNonblock QEMUFileNonblock; > + Couldn't you use QEMUFileBuffered ? 
> +static void nonblock_flush_buffer(QEMUFileNonblock *s) > +{ > + size_t offset = 0; > + ssize_t ret; > + > + while (offset < s->buffer_size) { > + ret = write(s->fd, s->buffer + offset, s->buffer_size - offset); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EAGAIN) { > + s->freeze_output = true; > + } else { > + qemu_file_set_error(s->file, errno); > + } > + break; > + } > + > + if (ret == 0) { > + DPRINTF("ret == 0\n"); > + break; > + } > + > + offset += ret; > + } > + > + if (offset > 0) { > + assert(s->buffer_size >= offset); > + memmove(s->buffer, s->buffer + offset, s->buffer_size - offset); > + s->buffer_size -= offset; > + } > + if (s->buffer_size > 0) { > + s->freeze_output = true; > + } > +} > + > +static int nonblock_put_buffer(void *opaque, > + const uint8_t *buf, int64_t pos, int size) > +{ > + QEMUFileNonblock *s = opaque; > + int error; > + ssize_t len = 0; > + > + error = qemu_file_get_error(s->file); > + if (error) { > + return error; > + } > + > + nonblock_flush_buffer(s); > + error = qemu_file_get_error(s->file); > + if (error) { > + return error; > + } > + > + while (!s->freeze_output && size > 0) { > + ssize_t ret; > + assert(s->buffer_size == 0); > + > + ret = write(s->fd, buf, size); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EAGAIN) { > + s->freeze_output = true; > + } else { > + qemu_file_set_error(s->file, errno); > + } > + break; > + } > + > + len += ret; > + buf += ret; > + size -= ret; > + } > + > + if (size > 0) { > + int inc = size - (s->buffer_capacity - s->buffer_size); > + if (inc > 0) { > + s->buffer_capacity += > + DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC; > + s->buffer = g_realloc(s->buffer, s->buffer_capacity); > + } > + memcpy(s->buffer + s->buffer_size, buf, size); > + s->buffer_size += size; > + > + len += size; > + } > + > + return len; > +} > + > +static int nonblock_pending_size(QEMUFileNonblock *s) > +{ > + return 
qemu_pending_size(s->file) + s->buffer_size; > +} > + > +static void nonblock_fflush(QEMUFileNonblock *s) > +{ > + s->freeze_output = false; > + nonblock_flush_buffer(s); > + if (!s->freeze_output) { > + qemu_fflush(s->file); > + } > +} > + > +static void nonblock_wait_for_flush(QEMUFileNonblock *s) > +{ > + while (nonblock_pending_size(s) > 0) { > + fd_set fds; > + FD_ZERO(&fds); > + FD_SET(s->fd, &fds); > + select(s->fd + 1, NULL, &fds, NULL, NULL); > + > + nonblock_fflush(s); > + } > +} > + > +static int nonblock_close(void *opaque) > +{ > + QEMUFileNonblock *s = opaque; > + nonblock_wait_for_flush(s); > + g_free(s->buffer); > + g_free(s); > + return 0; > +} > + > +static QEMUFileNonblock *qemu_fopen_nonblock(int fd) > +{ > + QEMUFileNonblock *s = g_malloc0(sizeof(*s)); > + > + s->fd = fd; > + fcntl_setfl(fd, O_NONBLOCK); > + s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close, > + NULL, NULL, NULL); > + return s; > +} > + > +/*************************************************************************** > + * umem daemon on destination <-> qemu on source protocol > + */ > + > +#define QEMU_UMEM_REQ_INIT 0x00 > +#define QEMU_UMEM_REQ_ON_DEMAND 0x01 > +#define QEMU_UMEM_REQ_ON_DEMAND_CONT 0x02 > +#define QEMU_UMEM_REQ_BACKGROUND 0x03 > +#define QEMU_UMEM_REQ_BACKGROUND_CONT 0x04 > +#define QEMU_UMEM_REQ_REMOVE 0x05 > +#define QEMU_UMEM_REQ_EOC 0x06 > + > +struct qemu_umem_req { > + int8_t cmd; > + uint8_t len; > + char *idstr; /* ON_DEMAND, BACKGROUND, REMOVE */ > + uint32_t nr; /* ON_DEMAND, ON_DEMAND_CONT, > + BACKGROUND, BACKGROUND_CONT, REMOVE */ > + > + /* in target page size as qemu migration protocol */ > + uint64_t *pgoffs; /* ON_DEMAND, ON_DEMAND_CONT, > + BACKGROUND, BACKGROUND_CONT, REMOVE */ > +}; > + > +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr) > +{ > + qemu_put_byte(f, strlen(idstr)); > + qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr)); > +} > + > +static void 
postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr, > + const uint64_t *pgoffs) > +{ > + uint32_t i; > + > + qemu_put_be32(f, nr); > + for (i = 0; i < nr; i++) { > + qemu_put_be64(f, pgoffs[i]); > + } > +} > + > +static void postcopy_incoming_send_req_one(QEMUFile *f, > + const struct qemu_umem_req *req) > +{ > + DPRINTF("cmd %d\n", req->cmd); > + qemu_put_byte(f, req->cmd); > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + case QEMU_UMEM_REQ_REMOVE: > + postcopy_incoming_send_req_idstr(f, req->idstr); > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); > + break; > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); > + break; > + default: > + abort(); > + break; > + } > +} > + > +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024. > + * So one message size must be <= IO_BUF_SIZE > + * cmd: 1 > + * id len: 1 > + * id: 256 > + * nr: 2 > + */ > +#define MAX_PAGE_NR ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t)) > +static void postcopy_incoming_send_req(QEMUFile *f, > + const struct qemu_umem_req *req) > +{ > + uint32_t nr = req->nr; > + struct qemu_umem_req tmp = *req; > + > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + postcopy_incoming_send_req_one(f, &tmp); > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + tmp.nr = MIN(nr, MAX_PAGE_NR); > + postcopy_incoming_send_req_one(f, &tmp); > + > + nr -= tmp.nr; > + tmp.pgoffs += tmp.nr; > + if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) { > + tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; > + }else { > + tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT; > + } > + /* fall through */ > + case QEMU_UMEM_REQ_REMOVE: > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + while (nr > 0) { > + tmp.nr = MIN(nr, 
MAX_PAGE_NR); > + postcopy_incoming_send_req_one(f, &tmp); > + > + nr -= tmp.nr; > + tmp.pgoffs += tmp.nr; > + } > + break; > + default: > + abort(); > + break; > + } > +} > + > +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f, > + struct qemu_umem_req *req, > + size_t *offset) > +{ > + int ret; > + > + req->len = qemu_peek_byte(f, *offset); > + *offset += 1; > + if (req->len == 0) { > + return -EAGAIN; > + } > + req->idstr = g_malloc((int)req->len + 1); > + ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset); > + *offset += ret; > + if (ret != req->len) { > + g_free(req->idstr); > + req->idstr = NULL; > + return -EAGAIN; > + } > + req->idstr[req->len] = 0; > + return 0; > +} > + > +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f, > + struct qemu_umem_req *req, > + size_t *offset) > +{ > + int ret; > + uint32_t be32; > + uint32_t i; > + > + ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset); > + *offset += sizeof(be32); > + if (ret != sizeof(be32)) { > + return -EAGAIN; > + } > + > + req->nr = be32_to_cpu(be32); > + req->pgoffs = g_new(uint64_t, req->nr); > + for (i = 0; i < req->nr; i++) { > + uint64_t be64; > + ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset); > + *offset += sizeof(be64); > + if (ret != sizeof(be64)) { > + g_free(req->pgoffs); > + req->pgoffs = NULL; > + return -EAGAIN; > + } > + req->pgoffs[i] = be64_to_cpu(be64); > + } > + return 0; > +} > + > +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req) > +{ > + int size; > + int ret; > + size_t offset = 0; > + > + size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset); > + if (size <= 0) { > + return -EAGAIN; > + } > + offset += 1; > + > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + case QEMU_UMEM_REQ_REMOVE: > + ret = postcopy_outgoing_recv_req_idstr(f, req, &offset); > + if 
(ret < 0) { > + return ret; > + } > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); > + if (ret < 0) { > + return ret; > + } > + break; > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); > + if (ret < 0) { > + return ret; > + } > + break; > + default: > + abort(); > + break; > + } > + qemu_file_skip(f, offset); > + DPRINTF("cmd %d\n", req->cmd); > + return 0; > +} > + > +static void postcopy_outgoing_free_req(struct qemu_umem_req *req) > +{ > + g_free(req->idstr); > + g_free(req->pgoffs); > +} > + > +/*************************************************************************** > + * outgoing part > + */ > + > +#define QEMU_SAVE_LIVE_STAGE_START 0x01 /* = QEMU_VM_SECTION_START */ > +#define QEMU_SAVE_LIVE_STAGE_PART 0x02 /* = QEMU_VM_SECTION_PART */ > +#define QEMU_SAVE_LIVE_STAGE_END 0x03 /* = QEMU_VM_SECTION_END */ > + > +enum POState { > + PO_STATE_ERROR_RECEIVE, > + PO_STATE_ACTIVE, > + PO_STATE_EOC_RECEIVED, > + PO_STATE_ALL_PAGES_SENT, > + PO_STATE_COMPLETED, > +}; > +typedef enum POState POState; > + > +struct PostcopyOutgoingState { > + POState state; > + QEMUFile *mig_read; > + int fd_read; > + RAMBlock *last_block_read; > + > + QEMUFile *mig_buffered_write; > + MigrationState *ms; > + > + /* For nobg mode. 
Check if all pages are sent */ > + RAMBlock *block; > + ram_addr_t addr; > +}; > +typedef struct PostcopyOutgoingState PostcopyOutgoingState; > + > +int postcopy_outgoing_create_read_socket(MigrationState *s) > +{ > + if (!s->params.postcopy) { > + return 0; > + } > + > + s->fd_read = dup(s->fd); > + if (s->fd_read == -1) { > + int ret = -errno; > + perror("dup"); > + return ret; > + } > + s->file_read = qemu_fopen_socket(s->fd_read); > + if (s->file_read == NULL) { > + return -EINVAL; > + } > + return 0; > +} > + > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque) > +{ > + int ret = 0; > + DPRINTF("stage %d\n", stage); > + if (stage == QEMU_SAVE_LIVE_STAGE_START) { > + sort_ram_list(); > + ram_save_live_mem_size(f); > + } > + if (stage == QEMU_SAVE_LIVE_STAGE_PART) { > + ret = 1; > + } > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + return ret; > +} > + > +static RAMBlock *postcopy_outgoing_find_block(const char *idstr) > +{ > + RAMBlock *block; > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (!strncmp(idstr, block->idstr, strlen(idstr))) { > + return block; > + } > + } > + return NULL; > +} > + > +/* > + * return value > + * 0: continue postcopy mode > + * > 0: completed postcopy mode. > + * < 0: error > + */ > +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s, > + const struct qemu_umem_req *req, > + bool *written) > +{ > + int i; > + RAMBlock *block; > + > + DPRINTF("cmd %d state %d\n", req->cmd, s->state); > + switch(req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_EOC: > + /* tell to finish migration. 
*/ > + if (s->state == PO_STATE_ALL_PAGES_SENT) { > + s->state = PO_STATE_COMPLETED; > + DPRINTF("-> PO_STATE_COMPLETED\n"); > + } else { > + s->state = PO_STATE_EOC_RECEIVED; > + DPRINTF("-> PO_STATE_EOC_RECEIVED\n"); > + } > + return 1; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + DPRINTF("idstr: %s\n", req->idstr); > + block = postcopy_outgoing_find_block(req->idstr); > + if (block == NULL) { > + return -EINVAL; > + } > + s->last_block_read = block; > + /* fall through */ > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + DPRINTF("nr %d\n", req->nr); > + for (i = 0; i < req->nr; i++) { > + DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]); > + int ret = ram_save_page(s->mig_buffered_write, s->last_block_read, > + req->pgoffs[i] << TARGET_PAGE_BITS); > + if (ret > 0) { > + *written = true; > + } > + } > + break; > + case QEMU_UMEM_REQ_REMOVE: > + block = postcopy_outgoing_find_block(req->idstr); > + if (block == NULL) { > + return -EINVAL; > + } > + for (i = 0; i < req->nr; i++) { > + ram_addr_t addr = block->offset + > + (req->pgoffs[i] << TARGET_PAGE_BITS); > + cpu_physical_memory_reset_dirty(addr, > + addr + TARGET_PAGE_SIZE, > + MIGRATION_DIRTY_FLAG); > + } > + break; > + default: > + return -EINVAL; > + } > + return 0; > +} > + > +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s) > +{ > + if (s->mig_read != NULL) { > + qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL); > + qemu_fclose(s->mig_read); > + s->mig_read = NULL; > + fd_close(&s->fd_read); > + > + s->ms->file_read = NULL; > + s->ms->fd_read = -1; > + } > +} > + > +static void postcopy_outgoing_completed(PostcopyOutgoingState *s) > +{ > + postcopy_outgoing_close_mig_read(s); > + s->ms->postcopy = NULL; > + g_free(s); > +} > + > +static void postcopy_outgoing_recv_handler(void *opaque) > +{ > + PostcopyOutgoingState *s = opaque; > + bool written = false; > + int ret = 0; > + > + assert(s->state == PO_STATE_ACTIVE || > 
+ s->state == PO_STATE_ALL_PAGES_SENT); > + > + do { > + struct qemu_umem_req req = {.idstr = NULL, > + .pgoffs = NULL}; > + > + ret = postcopy_outgoing_recv_req(s->mig_read, &req); > + if (ret < 0) { > + if (ret == -EAGAIN) { > + ret = 0; > + } > + break; > + } > + if (s->state == PO_STATE_ACTIVE) { > + ret = postcopy_outgoing_handle_req(s, &req, &written); > + } > + postcopy_outgoing_free_req(&req); > + } while (ret == 0); > + > + /* > + * flush buffered_file. > + * Although mig_write is rate-limited buffered file, those written pages > + * are requested on demand by the destination. So forcibly push > + * those pages ignoring rate limiting > + */ > + if (written) { > + qemu_fflush(s->mig_buffered_write); > + /* qemu_buffered_file_drain(s->mig_buffered_write); */ > + } > + > + if (ret < 0) { > + switch (s->state) { > + case PO_STATE_ACTIVE: > + s->state = PO_STATE_ERROR_RECEIVE; > + DPRINTF("-> PO_STATE_ERROR_RECEIVE\n"); > + break; > + case PO_STATE_ALL_PAGES_SENT: > + s->state = PO_STATE_COMPLETED; > + DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n"); > + break; > + default: > + abort(); > + } > + } > + if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) { > + postcopy_outgoing_close_mig_read(s); > + } > + if (s->state == PO_STATE_COMPLETED) { > + DPRINTF("PO_STATE_COMPLETED\n"); > + MigrationState *ms = s->ms; > + postcopy_outgoing_completed(s); > + migrate_fd_completed(ms); > + } > +} > + > +void *postcopy_outgoing_begin(MigrationState *ms) > +{ > + PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1); > + DPRINTF("outgoing begin\n"); > + qemu_fflush(ms->file); > + > + s->ms = ms; > + s->state = PO_STATE_ACTIVE; > + s->fd_read = ms->fd_read; > + s->mig_read = ms->file_read; > + s->mig_buffered_write = ms->file; > + s->block = NULL; > + s->addr = 0; > + > + /* Make sure all dirty bits are set */ > + ram_save_memory_set_dirty(); > + > + qemu_set_fd_handler(s->fd_read, > + &postcopy_outgoing_recv_handler, NULL, s); > + return s; > +} > + > 
+static void postcopy_outgoing_ram_all_sent(QEMUFile *f, > + PostcopyOutgoingState *s) > +{ > + assert(s->state == PO_STATE_ACTIVE); > + > + s->state = PO_STATE_ALL_PAGES_SENT; > + /* tell incoming side that all pages are sent */ > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + qemu_fflush(f); > + qemu_buffered_file_drain(f); > + DPRINTF("sent RAM_SAVE_FLAG_EOS\n"); > + migrate_fd_cleanup(s->ms); > + > + /* Later migrate_fd_complete() will be called which calls > + * migrate_fd_cleanup() again. So dummy file is created > + * for qemu monitor to keep working. > + */ > + s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL, > + NULL, NULL); > +} > + > +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s, > + RAMBlock *block, > + ram_addr_t addr) > +{ > + if (block == NULL) { > + block = QLIST_FIRST(&ram_list.blocks); > + addr = block->offset; > + } > + > + for (; block != NULL; > + s->block = QLIST_NEXT(s->block, next), addr = block->offset) { > + for (; addr < block->offset + block->length; > + addr += TARGET_PAGE_SIZE) { > + if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { > + s->block = block; > + s->addr = addr; > + return 0; > + } > + } > + } > + > + return 1; > +} > + > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy) > +{ > + PostcopyOutgoingState *s = postcopy; > + > + assert(s->state == PO_STATE_ACTIVE || > + s->state == PO_STATE_EOC_RECEIVED || > + s->state == PO_STATE_ERROR_RECEIVE); > + > + switch (s->state) { > + case PO_STATE_ACTIVE: > + /* nothing. 
processed below */ > + break; > + case PO_STATE_EOC_RECEIVED: > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + s->state = PO_STATE_COMPLETED; > + postcopy_outgoing_completed(s); > + DPRINTF("PO_STATE_COMPLETED\n"); > + return 1; > + case PO_STATE_ERROR_RECEIVE: > + postcopy_outgoing_completed(s); > + DPRINTF("PO_STATE_ERROR_RECEIVE\n"); > + return -1; > + default: > + abort(); > + } > + > + if (s->ms->params.nobg) { > + /* See if all pages are sent. */ > + if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) { > + return 0; > + } > + /* ram_list can be reordered. (it doesn't seem so during migration, > + though) So the whole list needs to be checked again */ > + if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) { > + return 0; > + } > + > + postcopy_outgoing_ram_all_sent(f, s); > + return 0; > + } > + > + DPRINTF("outgoing background state: %d\n", s->state); > + > + while (qemu_file_rate_limit(f) == 0) { > + if (ram_save_block(f) == 0) { /* no more blocks */ > + assert(s->state == PO_STATE_ACTIVE); > + postcopy_outgoing_ram_all_sent(f, s); > + return 0; > + } > + } > + > + return 0; > +} > + > +/*************************************************************************** > + * incoming part > + */ > + > +/* flags for incoming mode to modify the behavior. 
> + This is for benchmark/debug purpose */ > +#define INCOMING_FLAGS_FAULT_REQUEST 0x01 > + > + > +static void postcopy_incoming_umemd(void); > + > +#define PIS_STATE_QUIT_RECEIVED 0x01 > +#define PIS_STATE_QUIT_QUEUED 0x02 > +#define PIS_STATE_QUIT_SENT 0x04 > + > +#define PIS_STATE_QUIT_MASK (PIS_STATE_QUIT_RECEIVED | \ > + PIS_STATE_QUIT_QUEUED | \ > + PIS_STATE_QUIT_SENT) > + > +struct PostcopyIncomingState { > + /* dest qemu state */ > + uint32_t state; > + > + UMemDev *dev; > + int host_page_size; > + int host_page_shift; > + > + /* qemu side */ > + int to_umemd_fd; > + QEMUFileNonblock *to_umemd; > +#define MAX_FAULTED_PAGES 256 > + struct umem_pages *faulted_pages; > + > + int from_umemd_fd; > + QEMUFile *from_umemd; > + int version_id; /* save/load format version id */ > +}; > +typedef struct PostcopyIncomingState PostcopyIncomingState; > + > + > +#define UMEM_STATE_EOS_RECEIVED 0x01 /* umem daemon <-> src qemu */ > +#define UMEM_STATE_EOC_SENT 0x02 /* umem daemon <-> src qemu */ > +#define UMEM_STATE_QUIT_RECEIVED 0x04 /* umem daemon <-> dst qemu */ > +#define UMEM_STATE_QUIT_QUEUED 0x08 /* umem daemon <-> dst qemu */ > +#define UMEM_STATE_QUIT_SENT 0x10 /* umem daemon <-> dst qemu */ > + > +#define UMEM_STATE_QUIT_MASK (UMEM_STATE_QUIT_QUEUED | \ > + UMEM_STATE_QUIT_SENT | \ > + UMEM_STATE_QUIT_RECEIVED) > +#define UMEM_STATE_END_MASK (UMEM_STATE_EOS_RECEIVED | \ > + UMEM_STATE_EOC_SENT | \ > + UMEM_STATE_QUIT_MASK) > + > +struct PostcopyIncomingUMemDaemon { > + /* umem daemon side */ > + uint32_t state; > + > + int host_page_size; > + int host_page_shift; > + int nr_host_pages_per_target_page; > + int host_to_target_page_shift; > + int nr_target_pages_per_host_page; > + int target_to_host_page_shift; > + int version_id; /* save/load format version id */ > + > + int to_qemu_fd; > + QEMUFileNonblock *to_qemu; > + int from_qemu_fd; > + QEMUFile *from_qemu; > + > + int mig_read_fd; > + QEMUFile *mig_read; /* qemu on source -> umem daemon */ > + > + int 
mig_write_fd; > + QEMUFileNonblock *mig_write; /* umem daemon -> qemu on source */ > + > + /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */ > +#define MAX_REQUESTS (512 * (64 + 1)) > + > + struct umem_page_request page_request; > + struct umem_page_cached page_cached; > + > +#define MAX_PRESENT_REQUESTS MAX_FAULTED_PAGES > + struct umem_pages *present_request; > + > + uint64_t *target_pgoffs; > + > + /* bitmap indexed by target page offset */ > + unsigned long *phys_requested; > + > + /* bitmap indexed by target page offset */ > + unsigned long *phys_received; > + > + RAMBlock *last_block_read; /* qemu on source -> umem daemon */ > + RAMBlock *last_block_write; /* umem daemon -> qemu on source */ > +}; > +typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon; > + > +static PostcopyIncomingState state = { > + .state = 0, > + .dev = NULL, > + .to_umemd_fd = -1, > + .to_umemd = NULL, > + .from_umemd_fd = -1, > + .from_umemd = NULL, > +}; > + > +static PostcopyIncomingUMemDaemon umemd = { > + .state = 0, > + .to_qemu_fd = -1, > + .to_qemu = NULL, > + .from_qemu_fd = -1, > + .from_qemu = NULL, > + .mig_read_fd = -1, > + .mig_read = NULL, > + .mig_write_fd = -1, > + .mig_write = NULL, > +}; > + > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) > +{ > + /* incoming_postcopy makes sense only when incoming migration mode */ > + if (!incoming && incoming_postcopy) { > + return -EINVAL; > + } > + > + if (!incoming_postcopy) { > + return 0; > + } > + > + state.state = 0; > + state.dev = umem_dev_new(); > + state.host_page_size = getpagesize(); > + state.host_page_shift = ffs(state.host_page_size) - 1; > + state.version_id = RAM_SAVE_VERSION_ID; /* = save version of > + ram_save_live() */ > + return 0; > +} > + > +void postcopy_incoming_ram_alloc(const char *name, > + size_t size, uint8_t **hostp, UMem **umemp) > +{ > + UMem *umem; > + size = ALIGN_UP(size, state.host_page_size); > + umem = umem_dev_create(state.dev, size, name); > + > + 
*umemp = umem; > + *hostp = umem->umem; > +} > + > +void postcopy_incoming_ram_free(UMem *umem) > +{ > + umem_unmap(umem); > + umem_close(umem); > + umem_destroy(umem); > +} > + > +void postcopy_incoming_prepare(void) > +{ > + RAMBlock *block; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + umem_mmap(block->umem); > + } > + } > +} > + > +static int postcopy_incoming_ram_load_get64(QEMUFile *f, > + ram_addr_t *addr, int *flags) > +{ > + *addr = qemu_get_be64(f); > + *flags = *addr & ~TARGET_PAGE_MASK; > + *addr &= TARGET_PAGE_MASK; > + return qemu_file_get_error(f); > +} > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) > +{ > + ram_addr_t addr; > + int flags; > + int error; > + > + DPRINTF("incoming ram load\n"); > + /* > + * RAM_SAVE_FLAGS_EOS or > + * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS > + * see postcopy_outgoing_ram_save_live() > + */ > + > + if (version_id != RAM_SAVE_VERSION_ID) { > + DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n", > + version_id, RAM_SAVE_VERSION_ID); > + return -EINVAL; > + } > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); > + DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags); > + if (error) { > + DPRINTF("error %d\n", error); > + return error; > + } > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { > + DPRINTF("EOS\n"); > + return 0; > + } > + > + if (flags != RAM_SAVE_FLAG_MEM_SIZE) { > + DPRINTF("-EINVAL flags 0x%x\n", flags); > + return -EINVAL; > + } > + error = ram_load_mem_size(f, addr); > + if (error) { > + DPRINTF("addr 0x%lx error %d\n", addr, error); > + return error; > + } > + > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); > + if (error) { > + DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error); > + return error; > + } > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { > + DPRINTF("done\n"); > + return 0; > + } > + DPRINTF("-EINVAL\n"); > + return -EINVAL; > +} > + > +void 
postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) > +{ > + int fds[2]; > + RAMBlock *block; > + > + DPRINTF("fork\n"); > + > + /* socketpair(AF_UNIX)? */ > + > + if (qemu_pipe(fds) == -1) { > + perror("qemu_pipe"); > + abort(); > + } > + state.from_umemd_fd = fds[0]; > + umemd.to_qemu_fd = fds[1]; > + > + if (qemu_pipe(fds) == -1) { > + perror("qemu_pipe"); > + abort(); > + } > + umemd.from_qemu_fd = fds[0]; > + state.to_umemd_fd = fds[1]; > + > + pid_t child = fork(); > + if (child < 0) { > + perror("fork"); > + abort(); > + } > + > + if (child == 0) { > + int mig_write_fd; > + > + fd_close(&state.to_umemd_fd); > + fd_close(&state.from_umemd_fd); > + umemd.host_page_size = state.host_page_size; > + umemd.host_page_shift = state.host_page_shift; > + > + umemd.nr_host_pages_per_target_page = > + TARGET_PAGE_SIZE / umemd.host_page_size; > + umemd.nr_target_pages_per_host_page = > + umemd.host_page_size / TARGET_PAGE_SIZE; > + > + umemd.target_to_host_page_shift = > + ffs(umemd.nr_host_pages_per_target_page) - 1; > + umemd.host_to_target_page_shift = > + ffs(umemd.nr_target_pages_per_host_page) - 1; > + > + umemd.state = 0; > + umemd.version_id = state.version_id; > + umemd.mig_read_fd = mig_read_fd; > + umemd.mig_read = mig_read; > + > + mig_write_fd = dup(mig_read_fd); > + if (mig_write_fd < 0) { > + perror("could not dup for writable socket \n"); > + abort(); > + } > + umemd.mig_write_fd = mig_write_fd; > + umemd.mig_write = qemu_fopen_nonblock(mig_write_fd); > + > + postcopy_incoming_umemd(); /* noreturn */ > + } > + > + DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child); > + fd_close(&umemd.to_qemu_fd); > + fd_close(&umemd.from_qemu_fd); > + state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES)); > + state.faulted_pages->nr = 0; > + > + /* close all UMem.shmem_fd */ > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + umem_close_shmem(block->umem); > + } > + umem_qemu_wait_for_daemon(state.from_umemd_fd); > +} > + > +static 
void postcopy_incoming_qemu_recv_quit(void) > +{ > + RAMBlock *block; > + if (state.state & PIS_STATE_QUIT_RECEIVED) { > + return; > + } > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + umem_destroy(block->umem); > + block->umem = NULL; > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; > + } > + } > + > + DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n"); > + state.state |= PIS_STATE_QUIT_RECEIVED; > + qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL); > + qemu_fclose(state.from_umemd); > + state.from_umemd = NULL; > + fd_close(&state.from_umemd_fd); > +} > + > +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque) > +{ > + assert(state.to_umemd != NULL); > + > + nonblock_fflush(state.to_umemd); > + if (nonblock_pending_size(state.to_umemd) > 0) { > + return; > + } > + > + qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL); > + if (state.state & PIS_STATE_QUIT_QUEUED) { > + DPRINTF("|= PIS_STATE_QUIT_SENT\n"); > + state.state |= PIS_STATE_QUIT_SENT; > + qemu_fclose(state.to_umemd->file); > + state.to_umemd = NULL; > + fd_close(&state.to_umemd_fd); > + g_free(state.faulted_pages); > + state.faulted_pages = NULL; > + } > +} > + > +static void postcopy_incoming_qemu_fflush_to_umemd(void) > +{ > + qemu_set_fd_handler(state.to_umemd->fd, NULL, > + postcopy_incoming_qemu_fflush_to_umemd_handler, NULL); > + postcopy_incoming_qemu_fflush_to_umemd_handler(NULL); > +} > + > +static void postcopy_incoming_qemu_queue_quit(void) > +{ > + if (state.state & PIS_STATE_QUIT_QUEUED) { > + return; > + } > + > + DPRINTF("|= PIS_STATE_QUIT_QUEUED\n"); > + umem_qemu_quit(state.to_umemd->file); > + state.state |= PIS_STATE_QUIT_QUEUED; > +} > + > +static void postcopy_incoming_qemu_send_pages_present(void) > +{ > + if (state.faulted_pages->nr > 0) { > + umem_qemu_send_pages_present(state.to_umemd->file, > + state.faulted_pages); > + state.faulted_pages->nr = 0; > + } > +} > + > +static void postcopy_incoming_qemu_faulted_pages( 
> + const struct umem_pages *pages) > +{ > + assert(pages->nr <= MAX_FAULTED_PAGES); > + assert(state.faulted_pages != NULL); > + > + if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) { > + postcopy_incoming_qemu_send_pages_present(); > + } > + memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr], > + &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr); > + state.faulted_pages->nr += pages->nr; > +} > + > +static void postcopy_incoming_qemu_cleanup_umem(void); > + > +/* Peek one command byte from state.from_umemd and dispatch on it. > + * Returns -EAGAIN, consuming nothing, when the command byte cannot be > + * peeked or umem_qemu_trigger_page_fault() returns NULL; otherwise > + * returns 0 after skipping the bytes that were parsed (if the stream > + * is still open). > + */ > +static int postcopy_incoming_qemu_handle_req_one(void) > +{ > + int offset = 0; > + int ret; > + uint8_t cmd; > + > + ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset); > + offset += sizeof(cmd); > + if (ret != sizeof(cmd)) { > + return -EAGAIN; > + } > + DPRINTF("cmd %c\n", cmd); > + > + switch (cmd) { > + case UMEM_DAEMON_QUIT: > + postcopy_incoming_qemu_recv_quit(); > + postcopy_incoming_qemu_queue_quit(); > + postcopy_incoming_qemu_cleanup_umem(); > + break; > + case UMEM_DAEMON_TRIGGER_PAGE_FAULT: { > + struct umem_pages *pages = > + umem_qemu_trigger_page_fault(state.from_umemd, &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) { > + postcopy_incoming_qemu_faulted_pages(pages); > + } > + /* pages must be released whether or not it was queued */ > + g_free(pages); > + break; > + } > + case UMEM_DAEMON_ERROR: > + /* umem daemon hit troubles, so it warned us to stop vm execution */ > + vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */ > + break; > + default: > + abort(); > + break; > + } > + > + if (state.from_umemd != NULL) { > + qemu_file_skip(state.from_umemd, offset); > + } > + return 0; > +} > + > +static void postcopy_incoming_qemu_handle_req(void *opaque) > +{ > + do { > + int ret = postcopy_incoming_qemu_handle_req_one(); > + if (ret == -EAGAIN) { > + break; > + } > + } while (state.from_umemd != NULL && > + qemu_pending_size(state.from_umemd) > 0); > + > + if (state.to_umemd != NULL) { > + if 
(state.faulted_pages->nr > 0) { > + postcopy_incoming_qemu_send_pages_present(); > + } > + postcopy_incoming_qemu_fflush_to_umemd(); > + } > +} > + > +void postcopy_incoming_qemu_ready(void) > +{ > + umem_qemu_ready(state.to_umemd_fd); > + > + state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd); > + state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd); > + qemu_set_fd_handler(state.from_umemd_fd, > + postcopy_incoming_qemu_handle_req, NULL, NULL); > +} > + > +static void postcopy_incoming_qemu_cleanup_umem(void) > +{ > + /* when qemu will quit before completing postcopy, tell umem daemon > + to tear down umem device and exit. */ > + if (state.to_umemd_fd >= 0) { > + postcopy_incoming_qemu_queue_quit(); > + postcopy_incoming_qemu_fflush_to_umemd(); > + } > + > + if (state.dev) { > + umem_dev_destroy(state.dev); > + state.dev = NULL; > + } > +} > + > +void postcopy_incoming_qemu_cleanup(void) > +{ > + postcopy_incoming_qemu_cleanup_umem(); > + if (state.to_umemd != NULL) { > + nonblock_wait_for_flush(state.to_umemd); > + } > +} > + > +/* Build the list of host-page offsets covering [addr, addr + size) and > + * pass it to the umem daemon with umem_qemu_send_pages_unmapped(), then > + * flush the pipe. Does nothing when the pipe fd is closed or a quit has > + * already been queued. > + */ > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) > +{ > + uint64_t nr = DIV_ROUND_UP(size, state.host_page_size); > + size_t len = umem_pages_size(nr); > + ram_addr_t end = addr + size; > + struct umem_pages *pages; > + int i; > + > + if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) { > + return; > + } > + /* check before state.to_umemd is dereferenced below */ > + assert(state.to_umemd != NULL); > + pages = g_malloc(len); > + pages->nr = nr; > + for (i = 0; addr < end; addr += state.host_page_size, i++) { > + pages->pgoffs[i] = addr >> state.host_page_shift; > + } > + umem_qemu_send_pages_unmapped(state.to_umemd->file, pages); > + g_free(pages); > + postcopy_incoming_qemu_fflush_to_umemd(); > +} > + > +/************************************************************************** > + * incoming umem daemon > + */ > + > +static void postcopy_incoming_umem_recv_quit(void) > +{ > + if (umemd.state & UMEM_STATE_QUIT_RECEIVED) { > + return; > + } > + 
DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n"); > + umemd.state |= UMEM_STATE_QUIT_RECEIVED; > + qemu_fclose(umemd.from_qemu); > + umemd.from_qemu = NULL; > + fd_close(&umemd.from_qemu_fd); > +} > + > +static void postcopy_incoming_umem_queue_quit(void) > +{ > + if (umemd.state & UMEM_STATE_QUIT_QUEUED) { > + return; > + } > + DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n"); > + umem_daemon_quit(umemd.to_qemu->file); > + umemd.state |= UMEM_STATE_QUIT_QUEUED; > +} > + > +static void postcopy_incoming_umem_send_eoc_req(void) > +{ > + struct qemu_umem_req req; > + > + if (umemd.state & UMEM_STATE_EOC_SENT) { > + return; > + } > + > + DPRINTF("|= UMEM_STATE_EOC_SENT\n"); > + req.cmd = QEMU_UMEM_REQ_EOC; > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + umemd.state |= UMEM_STATE_EOC_SENT; > + qemu_fclose(umemd.mig_write->file); > + umemd.mig_write = NULL; > + fd_close(&umemd.mig_write_fd); > +} > + > +static void postcopy_incoming_umem_send_page_req(RAMBlock *block) > +{ > + struct qemu_umem_req req; > + int bit; > + uint64_t target_pgoff; > + int i; > + > + umemd.page_request.nr = MAX_REQUESTS; > + umem_get_page_request(block->umem, &umemd.page_request); > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", > + block->idstr, umemd.page_request.nr, > + (uint64_t)umemd.page_request.pgoffs[0], > + (uint64_t)umemd.page_request.pgoffs[1]); > + > + if (umemd.last_block_write != block) { > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND; > + req.idstr = block->idstr; > + } else { > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; > + } > + > + req.nr = 0; > + req.pgoffs = umemd.target_pgoffs; > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { > + for (i = 0; i < umemd.page_request.nr; i++) { > + target_pgoff = > + umemd.page_request.pgoffs[i] >> umemd.host_to_target_page_shift; > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; > + > + if (!test_and_set_bit(bit, umemd.phys_requested)) { > + req.pgoffs[req.nr] = target_pgoff; > + req.nr++; > + } > + } > + } else { > + for (i = 
0; i < umemd.page_request.nr; i++) { > + int j; > + target_pgoff = > + umemd.page_request.pgoffs[i] << umemd.host_to_target_page_shift; > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; > + > + for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) { > + if (!test_and_set_bit(bit + j, umemd.phys_requested)) { > + req.pgoffs[req.nr] = target_pgoff + j; > + req.nr++; > + } > + } > + } > + } > + > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", > + block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]); > + if (req.nr > 0 && umemd.mig_write != NULL) { > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + umemd.last_block_write = block; > + } > +} > + > +static void postcopy_incoming_umem_send_pages_present(void) > +{ > + if (umemd.present_request->nr > 0) { > + umem_daemon_send_pages_present(umemd.to_qemu->file, > + umemd.present_request); > + umemd.present_request->nr = 0; > + } > +} > + > +static void postcopy_incoming_umem_pages_present_one( > + uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset) > +{ > + uint32_t i; > + assert(nr <= MAX_PRESENT_REQUESTS); > + > + if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) { > + postcopy_incoming_umem_send_pages_present(); > + } > + > + for (i = 0; i < nr; i++) { > + umemd.present_request->pgoffs[umemd.present_request->nr + i] = > + pgoffs[i] + ramblock_pgoffset; > + } > + umemd.present_request->nr += nr; > +} > + > +static void postcopy_incoming_umem_pages_present( > + const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset) > +{ > + uint32_t left = page_cached->nr; > + uint32_t offset = 0; > + > + while (left > 0) { > + uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS); > + postcopy_incoming_umem_pages_present_one( > + nr, &page_cached->pgoffs[offset], ramblock_pgoffset); > + > + left -= nr; > + offset += nr; > + } > +} > + > +static int postcopy_incoming_umem_ram_load(void) > +{ > + ram_addr_t offset; > + int flags; > + int error; > + void *shmem; > + int i; > + 
int bit; > + > + if (umemd.version_id != RAM_SAVE_VERSION_ID) { > + return -EINVAL; > + } > + > + offset = qemu_get_be64(umemd.mig_read); > + > + flags = offset & ~TARGET_PAGE_MASK; > + offset &= TARGET_PAGE_MASK; > + > + assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE)); > + > + if (flags & RAM_SAVE_FLAG_EOS) { > + DPRINTF("RAM_SAVE_FLAG_EOS\n"); > + postcopy_incoming_umem_send_eoc_req(); > + > + qemu_fclose(umemd.mig_read); > + umemd.mig_read = NULL; > + fd_close(&umemd.mig_read_fd); > + umemd.state |= UMEM_STATE_EOS_RECEIVED; > + > + postcopy_incoming_umem_queue_quit(); > + DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n"); > + return 0; > + } > + > + shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags, > + &umemd.last_block_read); > + if (!shmem) { > + DPRINTF("shmem == NULL\n"); > + return -EINVAL; > + } > + > + if (flags & RAM_SAVE_FLAG_COMPRESS) { > + uint8_t ch = qemu_get_byte(umemd.mig_read); > + memset(shmem, ch, TARGET_PAGE_SIZE); > + } else if (flags & RAM_SAVE_FLAG_PAGE) { > + qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE); > + } > + > + error = qemu_file_get_error(umemd.mig_read); > + if (error) { > + DPRINTF("error %d\n", error); > + return error; > + } > + > + umemd.page_cached.nr = 0; > + bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS; > + if (!test_and_set_bit(bit, umemd.phys_received)) { > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { > + __u64 pgoff = offset >> umemd.host_page_shift; > + for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) { > + umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i; > + umemd.page_cached.nr++; > + } > + } else { > + bool mark_cache = true; > + for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) { > + if (!test_bit(bit + i, umemd.phys_received)) { > + mark_cache = false; > + break; > + } > + } > + if (mark_cache) { > + umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift; > + umemd.page_cached.nr = 1; > + } > + } > + } > + > + if (umemd.page_cached.nr > 
0) { > + umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached); > + > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 && > + (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) { > + uint64_t ramblock_pgoffset; > + > + ramblock_pgoffset = > + umemd.last_block_read->offset >> umemd.host_page_shift; > + postcopy_incoming_umem_pages_present(&umemd.page_cached, > + ramblock_pgoffset); > + } > + } > + > + return 0; > +} > + > +static bool postcopy_incoming_umem_check_umem_done(void) > +{ > + bool all_done = true; > + RAMBlock *block; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + UMem *umem = block->umem; > + if (umem != NULL && umem->nsets == umem->nbits) { > + umem_unmap_shmem(umem); > + umem_destroy(umem); > + block->umem = NULL; > + } > + if (block->umem != NULL) { > + all_done = false; > + } > + } > + return all_done; > +} > + > +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages) > +{ > + int i; > + > + for (i = 0; i < pages->nr; i++) { > + ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift; > + RAMBlock *block = qemu_get_ram_block(addr); > + addr -= block->offset; > + umem_remove_shmem(block->umem, addr, umemd.host_page_size); > + } > + return postcopy_incoming_umem_check_umem_done(); > +} > + > +static bool > +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages) > +{ > + RAMBlock *block; > + ram_addr_t addr; > + int i; > + > + struct qemu_umem_req req = { > + .cmd = QEMU_UMEM_REQ_REMOVE, > + .nr = 0, > + .pgoffs = (uint64_t*)pages->pgoffs, > + }; > + > + addr = pages->pgoffs[0] << umemd.host_page_shift; > + block = qemu_get_ram_block(addr); > + > + for (i = 0; i < pages->nr; i++) { > + int pgoff; > + > + addr = pages->pgoffs[i] << umemd.host_page_shift; > + pgoff = addr >> TARGET_PAGE_BITS; > + if (!test_bit(pgoff, umemd.phys_received) && > + !test_bit(pgoff, umemd.phys_requested)) { > + req.pgoffs[req.nr] = pgoff; > + req.nr++; > + } > + set_bit(pgoff, 
umemd.phys_received); > + set_bit(pgoff, umemd.phys_requested); > + > + umem_remove_shmem(block->umem, > + addr - block->offset, umemd.host_page_size); > + } > + if (req.nr > 0 && umemd.mig_write != NULL) { > + req.idstr = block->idstr; > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + } > + > + return postcopy_incoming_umem_check_umem_done(); > +} > + > +static void postcopy_incoming_umem_done(void) > +{ > + postcopy_incoming_umem_send_eoc_req(); > + postcopy_incoming_umem_queue_quit(); > +} > + > +static int postcopy_incoming_umem_handle_qemu(void) > +{ > + int ret; > + int offset = 0; > + uint8_t cmd; > + > + ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset); > + offset += sizeof(cmd); > + if (ret != sizeof(cmd)) { > + return -EAGAIN; > + } > + DPRINTF("cmd %c\n", cmd); > + switch (cmd) { > + case UMEM_QEMU_QUIT: > + postcopy_incoming_umem_recv_quit(); > + postcopy_incoming_umem_done(); > + break; > + case UMEM_QEMU_PAGE_FAULTED: { > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, > + &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (postcopy_incoming_umem_page_faulted(pages)){ > + postcopy_incoming_umem_done(); > + } > + g_free(pages); > + break; > + } > + case UMEM_QEMU_PAGE_UNMAPPED: { > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, > + &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (postcopy_incoming_umem_page_unmapped(pages)){ > + postcopy_incoming_umem_done(); > + } > + g_free(pages); > + break; > + } > + default: > + abort(); > + break; > + } > + if (umemd.from_qemu != NULL) { > + qemu_file_skip(umemd.from_qemu, offset); > + } > + return 0; > +} > + > +static void set_fd(int fd, fd_set *fds, int *nfds) > +{ > + FD_SET(fd, fds); > + if (fd > *nfds) { > + *nfds = fd; > + } > +} > + > +static int postcopy_incoming_umemd_main_loop(void) > +{ > + fd_set writefds; > + fd_set readfds; > + int nfds; > + RAMBlock *block; > + int ret; > + > + int pending_size; > + 
bool get_page_request; > + > + nfds = -1; > + FD_ZERO(&writefds); > + FD_ZERO(&readfds); > + > + if (umemd.mig_write != NULL) { > + pending_size = nonblock_pending_size(umemd.mig_write); > + if (pending_size > 0) { > + set_fd(umemd.mig_write_fd, &writefds, &nfds); > + } > + } else { > + pending_size = 0; > + } > + > +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2) > + /* If page request to the migration source is accumulated, > + suspend getting page fault request. */ > + get_page_request = (pending_size <= PENDING_SIZE_MAX); > + > + if (get_page_request) { > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + set_fd(block->umem->fd, &readfds, &nfds); > + } > + } > + } > + > + if (umemd.mig_read_fd >= 0) { > + set_fd(umemd.mig_read_fd, &readfds, &nfds); > + } > + > + if (umemd.to_qemu != NULL && > + nonblock_pending_size(umemd.to_qemu) > 0) { > + set_fd(umemd.to_qemu_fd, &writefds, &nfds); > + } > + if (umemd.from_qemu_fd >= 0) { > + set_fd(umemd.from_qemu_fd, &readfds, &nfds); > + } > + > + ret = select(nfds + 1, &readfds, &writefds, NULL, NULL); > + if (ret == -1) { > + if (errno == EINTR) { > + return 0; > + } > + return ret; > + } > + > + if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) { > + nonblock_fflush(umemd.mig_write); > + } > + if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) { > + nonblock_fflush(umemd.to_qemu); > + } > + if (get_page_request) { > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) { > + postcopy_incoming_umem_send_page_req(block); > + } > + } > + } > + if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) { > + do { > + ret = postcopy_incoming_umem_ram_load(); > + if (ret < 0) { > + return ret; > + } > + } while (umemd.mig_read != NULL && > + qemu_pending_size(umemd.mig_read) > 0); > + } > + if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) { > + 
do { > + ret = postcopy_incoming_umem_handle_qemu(); > + if (ret == -EAGAIN) { > + break; > + } > + } while (umemd.from_qemu != NULL && > + qemu_pending_size(umemd.from_qemu) > 0); > + } > + > + if (umemd.mig_write != NULL) { > + nonblock_fflush(umemd.mig_write); > + } > + if (umemd.to_qemu != NULL) { > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) { > + postcopy_incoming_umem_send_pages_present(); > + } > + nonblock_fflush(umemd.to_qemu); > + if ((umemd.state & UMEM_STATE_QUIT_QUEUED) && > + nonblock_pending_size(umemd.to_qemu) == 0) { > + DPRINTF("|= UMEM_STATE_QUIT_SENT\n"); > + qemu_fclose(umemd.to_qemu->file); > + umemd.to_qemu = NULL; > + fd_close(&umemd.to_qemu_fd); > + umemd.state |= UMEM_STATE_QUIT_SENT; > + } > + } > + > + return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK; > +} > + > +static void postcopy_incoming_umemd(void) > +{ > + ram_addr_t last_ram_offset; > + int nbits; > + RAMBlock *block; > + int ret; > + > + qemu_daemon(1, 1); > + signal(SIGPIPE, SIG_IGN); > + DPRINTF("daemon pid: %d\n", getpid()); > + > + umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS); > + umemd.page_cached.pgoffs = > + g_new(__u64, MAX_REQUESTS * > + (TARGET_PAGE_SIZE >= umemd.host_page_size ? 
> + 1: umemd.nr_host_pages_per_target_page)); > + umemd.target_pgoffs = > + g_new(uint64_t, MAX_REQUESTS * > + MAX(umemd.nr_host_pages_per_target_page, > + umemd.nr_target_pages_per_host_page)); > + umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS)); > + umemd.present_request->nr = 0; > + > + last_ram_offset = qemu_last_ram_offset(); > + nbits = last_ram_offset >> TARGET_PAGE_BITS; > + umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits)); > + umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits)); > + umemd.last_block_read = NULL; > + umemd.last_block_write = NULL; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + UMem *umem = block->umem; > + umem->umem = NULL; /* umem mapping area has VM_DONT_COPY flag, > + so we lost those mappings by fork */ > + block->host = umem_map_shmem(umem); > + umem_close_shmem(umem); > + } > + umem_daemon_ready(umemd.to_qemu_fd); > + umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd); > + > + /* wait for qemu to disown migration_fd */ > + umem_daemon_wait_for_qemu(umemd.from_qemu_fd); > + umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd); > + > + DPRINTF("entering umemd main loop\n"); > + for (;;) { > + ret = postcopy_incoming_umemd_main_loop(); > + if (ret != 0) { > + break; > + } > + } > + DPRINTF("exiting umemd main loop\n"); > + > + /* This daemon forked from qemu and the parent qemu is still running. > + * Cleanups of linked libraries like SDL should not be triggered, > + * otherwise the parent qemu may use resources which was already freed. > + */ > + fflush(stdout); > + fflush(stderr); > + _exit(ret < 0? 
EXIT_FAILURE: 0); > +} > diff --git a/migration-tcp.c b/migration-tcp.c > index cf6a9b8..aa35050 100644 > --- a/migration-tcp.c > +++ b/migration-tcp.c > @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque) > } while (ret == -1 && (socket_error()) == EINTR); > > if (ret < 0) { > - migrate_fd_error(s); > - return; > + goto error_out; > } > > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > > - if (val == 0) > + if (val == 0) { > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > + } > migrate_fd_connect(s); > - else { > + } else { > DPRINTF("error connecting %d\n", val); > - migrate_fd_error(s); > + goto error_out; > } > + return; > + > +error_out: > + migrate_fd_error(s); > } > > int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) > @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) > > if (ret < 0) { > DPRINTF("connect failed\n"); > - migrate_fd_error(s); > - return ret; > + goto error_out; > + } > + > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > } > migrate_fd_connect(s); > return 0; > + > +error_out: > + migrate_fd_error(s); > + return ret; > } > > static void tcp_accept_incoming_migration(void *opaque) > @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque) > } > > process_incoming_migration(f); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(c, f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + /* now socket is disowned. 
> + So tell umem server that it's safe to use it */ > + postcopy_incoming_qemu_ready(); > + } > out: > close(c); > out2: > diff --git a/migration-unix.c b/migration-unix.c > index dfcf203..3707505 100644 > --- a/migration-unix.c > +++ b/migration-unix.c > @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque) > > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > > - if (val == 0) > + if (val == 0) { > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > + } > migrate_fd_connect(s); > - else { > + } else { > DPRINTF("error connecting %d\n", val); > - migrate_fd_error(s); > + goto error_out; > } > + return; > + > +error_out: > + migrate_fd_error(s); > } > > int unix_start_outgoing_migration(MigrationState *s, const char *path) > @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, const char *path) > > if (ret < 0) { > DPRINTF("connect failed\n"); > - migrate_fd_error(s); > - return ret; > + goto error_out; > + } > + > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > } > migrate_fd_connect(s); > return 0; > + > +error_out: > + migrate_fd_error(s); > + return ret; > } > > static void unix_accept_incoming_migration(void *opaque) > @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque) > } > > process_incoming_migration(f); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(c, f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_ready(); > + } > out: > close(c); > out2: > diff --git a/migration.c b/migration.c > index 0149ab3..51efe44 100644 > --- a/migration.c > +++ b/migration.c > @@ -39,6 +39,11 @@ enum { > MIG_STATE_COMPLETED, > }; > > +enum { > + MIG_SUBSTATE_PRECOPY, > + MIG_SUBSTATE_POSTCOPY, > +}; > + > #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ > > static NotifierList migration_state_notifiers = > @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void 
*opaque) > return; > } > > + if (s->substate == MIG_SUBSTATE_POSTCOPY) { > + /* PRINTF("postcopy background\n"); */ > + ret = postcopy_outgoing_ram_save_background(s->mon, s->file, > + s->postcopy); > + if (ret > 0) { > + migrate_fd_completed(s); > + } else if (ret < 0) { > + migrate_fd_error(s); > + } > + return; > + } > + > DPRINTF("iterate\n"); > ret = qemu_savevm_state_iterate(s->mon, s->file); > if (ret < 0) { > @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque) > DPRINTF("done iterating\n"); > vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); > > + if (s->params.postcopy) { > + if (qemu_savevm_state_complete(s->mon, s->file) < 0) { > + migrate_fd_error(s); > + if (old_vm_running) { > + vm_start(); > + } > + return; > + } > + s->substate = MIG_SUBSTATE_POSTCOPY; > + s->postcopy = postcopy_outgoing_begin(s); > + return; > + } > + > if (qemu_savevm_state_complete(s->mon, s->file) < 0) { > migrate_fd_error(s); > } else { > @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s) > int ret; > > s->state = MIG_STATE_ACTIVE; > + s->substate = MIG_SUBSTATE_PRECOPY; > s->file = qemu_fopen_ops_buffered(s, > s->bandwidth_limit, > migrate_fd_put_buffer, > diff --git a/migration.h b/migration.h > index 90ae362..2809e99 100644 > --- a/migration.h > +++ b/migration.h > @@ -40,6 +40,12 @@ struct MigrationState > int (*write)(MigrationState *s, const void *buff, size_t size); > void *opaque; > MigrationParams params; > + > + /* for postcopy */ > + int substate; /* precopy or postcopy */ > + int fd_read; > + QEMUFile *file_read; /* connection from the detination */ > + void *postcopy; > }; > > void process_incoming_migration(QEMUFile *f); > @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void); > uint64_t ram_bytes_transferred(void); > uint64_t ram_bytes_total(void); > > +void ram_save_set_params(const MigrationParams *params, void *opaque); > void sort_ram_list(void); > int ram_save_block(QEMUFile *f); > void ram_save_memory_set_dirty(void); > @@ -107,7 
+114,30 @@ void migrate_add_blocker(Error *reason); > */ > void migrate_del_blocker(Error *reason); > > +/* For outgoing postcopy */ > +int postcopy_outgoing_create_read_socket(MigrationState *s); > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque); > +void *postcopy_outgoing_begin(MigrationState *s); > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy); > + > +/* For incoming postcopy */ > extern bool incoming_postcopy; > extern unsigned long incoming_postcopy_flags; > > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy); > +void postcopy_incoming_ram_alloc(const char *name, > + size_t size, uint8_t **hostp, UMem **umemp); > +void postcopy_incoming_ram_free(UMem *umem); > +void postcopy_incoming_prepare(void); > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id); > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read); > +void postcopy_incoming_qemu_ready(void); > +void postcopy_incoming_qemu_cleanup(void); > +#ifdef NEED_CPU_H > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size); > +#endif > + > #endif > diff --git a/qemu-common.h b/qemu-common.h > index 725922b..d74a8c9 100644 > --- a/qemu-common.h > +++ b/qemu-common.h > @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState; > > struct Monitor; > typedef struct Monitor Monitor; > +typedef struct UMem UMem; > > /* we put basic includes here to avoid repeating them in device drivers */ > #include > diff --git a/qemu-options.hx b/qemu-options.hx > index 5c5b8f3..19e20f9 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, QEMU_OPTION_postcopy_flags, > "-postcopy-flags unsigned-int(flags)\n" > " flags for postcopy incoming migration\n" > " when -incoming and -postcopy are specified.\n" > - " This is for benchmark/debug purpose (default: 0)\n", > + " This is for benchmark/debug 
purpose (default: 0)\n" > + " Currently supprted flags are\n" > + " 1: enable fault request from umemd to qemu\n" > + " (default: disabled)\n", > QEMU_ARCH_ALL) > STEXI > @item -postcopy-flags int Can you move umem.c and umem.h to a separate patch, please? This patch is too long. > diff --git a/umem.c b/umem.c > new file mode 100644 > index 0000000..b7be006 > --- /dev/null > +++ b/umem.c > @@ -0,0 +1,379 @@ > +/* > + * umem.c: user process backed memory module for postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . > + */ > + > +#include > +#include > + > +#include > + > +#include "bitops.h" > +#include "sysemu.h" > +#include "hw/hw.h" > +#include "umem.h" > + > +//#define DEBUG_UMEM > +#ifdef DEBUG_UMEM > +#include > +#define DPRINTF(format, ...) \ > + do { \ > + printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid), \ > + __func__, __LINE__, ## __VA_ARGS__); \ > + } while (0) > +#else > +#define DPRINTF(format, ...) 
do { } while (0) > +#endif > + > +#define DEV_UMEM "/dev/umem" > + > +struct UMemDev { > + int fd; > + int page_shift; > +}; > + > +UMemDev *umem_dev_new(void) > +{ > + UMemDev *umem_dev; > + int umem_dev_fd = open(DEV_UMEM, O_RDWR); > + if (umem_dev_fd < 0) { > + perror("can't open "DEV_UMEM); > + abort(); > + } > + > + umem_dev = g_new(UMemDev, 1); > + umem_dev->fd = umem_dev_fd; > + umem_dev->page_shift = ffs(getpagesize()) - 1; > + return umem_dev; > +} > + > +void umem_dev_destroy(UMemDev *dev) > +{ > + close(dev->fd); > + g_free(dev); > +} > + > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name) > +{ > + struct umem_create create = { > + .size = size, > + .async_req_max = 0, > + .sync_req_max = 0, > + }; > + UMem *umem; > + > + snprintf(create.name.id, sizeof(create.name.id), > + "pid-%"PRId64, (uint64_t)getpid()); > + create.name.id[UMEM_ID_MAX - 1] = 0; > + strncpy(create.name.name, name, sizeof(create.name.name)); > + create.name.name[UMEM_NAME_MAX - 1] = 0; > + > + assert((size % getpagesize()) == 0); > + if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) { > + perror("UMEM_DEV_CREATE_UMEM"); > + abort(); > + } > + if (ftruncate(create.shmem_fd, create.size) < 0) { > + perror("truncate(\"shmem_fd\")"); > + abort(); > + } > + > + umem = g_new(UMem, 1); > + umem->nbits = 0; > + umem->nsets = 0; > + umem->faulted = NULL; > + umem->page_shift = dev->page_shift; > + umem->fd = create.umem_fd; > + umem->shmem_fd = create.shmem_fd; > + umem->size = create.size; > + umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (umem->umem == MAP_FAILED) { > + perror("mmap(UMem) failed"); > + abort(); > + } > + return umem; > +} > + > +void umem_mmap(UMem *umem) > +{ > + void *ret = mmap(umem->umem, umem->size, > + PROT_EXEC | PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_FIXED, umem->fd, 0); > + if (ret == MAP_FAILED) { > + perror("umem_mmap(UMem) failed"); > + abort(); > + } > +} > + > 
+void umem_destroy(UMem *umem) > +{ > + if (umem->fd != -1) { > + close(umem->fd); > + } > + if (umem->shmem_fd != -1) { > + close(umem->shmem_fd); > + } > + g_free(umem->faulted); > + g_free(umem); > +} > + > +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request) > +{ > + if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) { > + perror("daemon: UMEM_GET_PAGE_REQUEST"); > + abort(); > + } > +} > + > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached) > +{ > + if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) { > + perror("daemon: UMEM_MARK_PAGE_CACHED"); > + abort(); > + } > +} > + > +void umem_unmap(UMem *umem) > +{ > + munmap(umem->umem, umem->size); > + umem->umem = NULL; > +} > + > +void umem_close(UMem *umem) > +{ > + close(umem->fd); > + umem->fd = -1; > +} > + > +void *umem_map_shmem(UMem *umem) > +{ > + umem->nbits = umem->size >> umem->page_shift; > + umem->nsets = 0; > + umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits)); > + > + umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED, > + umem->shmem_fd, 0); > + if (umem->shmem == MAP_FAILED) { > + perror("daemon: mmap(\"shmem\")"); > + abort(); > + } > + return umem->shmem; > +} > + > +void umem_unmap_shmem(UMem *umem) > +{ > + munmap(umem->shmem, umem->size); > + umem->shmem = NULL; > +} > + > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size) > +{ > + int s = offset >> umem->page_shift; > + int e = (offset + size) >> umem->page_shift; > + int i; > + > + for (i = s; i < e; i++) { > + if (!test_and_set_bit(i, umem->faulted)) { > + umem->nsets++; > +#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE) > + madvise(umem->shmem + offset, size, MADV_REMOVE); > +#endif > + } > + } > +} > + > +void umem_close_shmem(UMem *umem) > +{ > + close(umem->shmem_fd); > + umem->shmem_fd = -1; > +} > + > +/***************************************************************************/ > +/* qemu <-> umem daemon 
communication */ > + > +size_t umem_pages_size(uint64_t nr) > +{ > + return sizeof(struct umem_pages) + nr * sizeof(uint64_t); > +} > + > +static void umem_write_cmd(int fd, uint8_t cmd) > +{ > + DPRINTF("write cmd %c\n", cmd); > + > + for (;;) { > + ssize_t ret = write(fd, &cmd, 1); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EPIPE) { > + perror("pipe"); > + DPRINTF("write cmd %c %zd %d: pipe is closed\n", > + cmd, ret, errno); > + break; > + } > + > + perror("pipe"); > + DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno); > + abort(); > + } > + > + break; > + } > +} > + > +static void umem_read_cmd(int fd, uint8_t expect) > +{ > + uint8_t cmd; > + for (;;) { > + ssize_t ret = read(fd, &cmd, 1); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } > + perror("pipe"); > + DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno); > + abort(); > + } > + > + if (ret == 0) { > + DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret); > + abort(); > + } > + > + break; > + } > + > + DPRINTF("read cmd %c\n", cmd); > + if (cmd != expect) { > + DPRINTF("cmd %c expect %d\n", cmd, expect); > + abort(); > + } > +} > + > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset) > +{ > + int ret; > + uint64_t nr; > + size_t size; > + struct umem_pages *pages; > + > + ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset); > + *offset += sizeof(nr); > + DPRINTF("ret %d nr %ld\n", ret, nr); > + if (ret != sizeof(nr) || nr == 0) { > + return NULL; > + } > + > + size = umem_pages_size(nr); > + pages = g_malloc(size); > + pages->nr = nr; > + size -= sizeof(pages->nr); > + > + ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset); > + *offset += size; > + if (ret != size) { > + g_free(pages); > + return NULL; > + } > + return pages; > +} > + > +static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages) > +{ > + size_t len = umem_pages_size(pages->nr); > + qemu_put_buffer(f, (const uint8_t*)pages, 
len); > +} > + > +/* umem daemon -> qemu */ > +void umem_daemon_ready(int to_qemu_fd) > +{ > + umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY); > +} > + > +void umem_daemon_quit(QEMUFile *to_qemu) > +{ > + qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT); > +} > + > +void umem_daemon_send_pages_present(QEMUFile *to_qemu, > + struct umem_pages *pages) > +{ > + qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT); > + umem_send_pages(to_qemu, pages); > +} > + > +void umem_daemon_wait_for_qemu(int from_qemu_fd) > +{ > + umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY); > +} > + > +/* qemu -> umem daemon */ > +void umem_qemu_wait_for_daemon(int from_umemd_fd) > +{ > + umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY); > +} > + > +void umem_qemu_ready(int to_umemd_fd) > +{ > + umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY); > +} > + > +void umem_qemu_quit(QEMUFile *to_umemd) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_QUIT); > +} > + > +/* qemu side handler */ > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd, > + int *offset) > +{ > + uint64_t i; > + int page_shift = ffs(getpagesize()) - 1; > + struct umem_pages *pages = umem_recv_pages(from_umemd, offset); > + if (pages == NULL) { > + return NULL; > + } > + > + for (i = 0; i < pages->nr; i++) { > + ram_addr_t addr = pages->pgoffs[i] << page_shift; > + > + /* make pages present by forcibly triggering page fault. 
*/ > + volatile uint8_t *ram = qemu_get_ram_ptr(addr); > + uint8_t dummy_read = ram[0]; > + (void)dummy_read; /* suppress unused variable warning */ > + } > + > + return pages; > +} > + > +void umem_qemu_send_pages_present(QEMUFile *to_umemd, > + const struct umem_pages *pages) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED); > + umem_send_pages(to_umemd, pages); > +} > + > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd, > + const struct umem_pages *pages) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED); > + umem_send_pages(to_umemd, pages); > +} > diff --git a/umem.h b/umem.h > new file mode 100644 > index 0000000..5ca19ef > --- /dev/null > +++ b/umem.h > @@ -0,0 +1,105 @@ > +/* > + * umem.h: user process backed memory module for postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . 
> + */ > + > +#ifndef QEMU_UMEM_H > +#define QEMU_UMEM_H > + > +#include > + > +#include "qemu-common.h" > + > +typedef struct UMemDev UMemDev; > + > +struct UMem { > + void *umem; > + int fd; > + void *shmem; > + int shmem_fd; > + uint64_t size; > + > + /* indexed by host page size */ > + int page_shift; > + int nbits; > + int nsets; > + unsigned long *faulted; > +}; > + > +UMemDev *umem_dev_new(void); > +void umem_dev_destroy(UMemDev *dev); > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name); > +void umem_mmap(UMem *umem); > + > +void umem_destroy(UMem *umem); > + > +/* umem device operations */ > +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request); > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached); > +void umem_unmap(UMem *umem); > +void umem_close(UMem *umem); > + > +/* umem shmem operations */ > +void *umem_map_shmem(UMem *umem); > +void umem_unmap_shmem(UMem *umem); > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size); > +void umem_close_shmem(UMem *umem); > + > +/* qemu on source <-> umem daemon communication */ > + > +struct umem_pages { > + uint64_t nr; /* nr = 0 means completed */ > + uint64_t pgoffs[0]; > +}; > + > +/* daemon -> qemu */ > +#define UMEM_DAEMON_READY 'R' > +#define UMEM_DAEMON_QUIT 'Q' > +#define UMEM_DAEMON_TRIGGER_PAGE_FAULT 'T' > +#define UMEM_DAEMON_ERROR 'E' > + > +/* qemu -> daemon */ > +#define UMEM_QEMU_READY 'r' > +#define UMEM_QEMU_QUIT 'q' > +#define UMEM_QEMU_PAGE_FAULTED 't' > +#define UMEM_QEMU_PAGE_UNMAPPED 'u' > + > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset); > +size_t umem_pages_size(uint64_t nr); > + > +/* for umem daemon */ > +void umem_daemon_ready(int to_qemu_fd); > +void umem_daemon_wait_for_qemu(int from_qemu_fd); > +void umem_daemon_quit(QEMUFile *to_qemu); > +void umem_daemon_send_pages_present(QEMUFile *to_qemu, > + struct umem_pages *pages); > + > +/* for qemu */ > +void umem_qemu_wait_for_daemon(int 
from_umemd_fd); > +void umem_qemu_ready(int to_umemd_fd); > +void umem_qemu_quit(QEMUFile *to_umemd); > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd, > + int *offset); > +void umem_qemu_send_pages_present(QEMUFile *to_umemd, > + const struct umem_pages *pages); > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd, > + const struct umem_pages *pages); > + > +#endif /* QEMU_UMEM_H */ > diff --git a/vl.c b/vl.c > index 5430b8c..17427a0 100644 > --- a/vl.c > +++ b/vl.c > @@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp) > default_drive(default_sdcard, snapshot, machine->use_scsi, > IF_SD, 0, SD_OPTS); > > - register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL, > - ram_save_live, NULL, ram_load, NULL); > + if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) { > + exit(1); > + } > + register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, > + ram_save_set_params, ram_save_live, NULL, > + ram_load, NULL); > > if (nb_numa_nodes > 0) { > int i; > @@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp) > > if (incoming) { > runstate_set(RUN_STATE_INMIGRATE); > + if (incoming_postcopy) { > + postcopy_incoming_prepare(); >+ } how about moving postcopy_incoming_prepare into qemu_start_incoming_migration ? > int ret = qemu_start_incoming_migration(incoming); > if (ret < 0) { > fprintf(stderr, "Migration failed. 
Exit code %s(%d), exiting.\n", > @@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp) > bdrv_close_all(); > pause_all_vcpus(); > net_cleanup(); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_cleanup(); > + } > res_free(); > > return 0; Orit From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([140.186.70.92]:58430) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1RgIHc-00079N-EE for qemu-devel@nongnu.org; Thu, 29 Dec 2011 10:52:50 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1RgIHW-0002lx-0R for qemu-devel@nongnu.org; Thu, 29 Dec 2011 10:52:44 -0500 Received: from mx1.redhat.com ([209.132.183.28]:54070) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1RgIHV-0002le-8D for qemu-devel@nongnu.org; Thu, 29 Dec 2011 10:52:37 -0500 Message-ID: <4EFC8C88.70701@redhat.com> Date: Thu, 29 Dec 2011 17:51:36 +0200 From: Orit Wasserman MIME-Version: 1.0 References: In-Reply-To: Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH 21/21] postcopy: implement postcopy livemigration List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Isaku Yamahata , satoshi.itoh@aist.go.jp Cc: t.hirofuchi@aist.go.jp, qemu-devel@nongnu.org, kvm@vger.kernel.org Hi, A general comment: this patch is a bit too long, which makes it hard to review. Can you split it, please? On 12/29/2011 03:26 AM, Isaku Yamahata wrote: > This patch implements postcopy livemigration. 
> > Signed-off-by: Isaku Yamahata > --- > Makefile.target | 4 + > arch_init.c | 26 +- > cpu-all.h | 7 + > exec.c | 20 +- > migration-exec.c | 8 + > migration-fd.c | 30 + > migration-postcopy-stub.c | 77 ++ > migration-postcopy.c | 1891 +++++++++++++++++++++++++++++++++++++++++++++ > migration-tcp.c | 37 +- > migration-unix.c | 32 +- > migration.c | 31 + > migration.h | 30 + > qemu-common.h | 1 + > qemu-options.hx | 5 +- > umem.c | 379 +++++++++ > umem.h | 105 +++ > vl.c | 14 +- > 17 files changed, 2677 insertions(+), 20 deletions(-) > create mode 100644 migration-postcopy-stub.c > create mode 100644 migration-postcopy.c > create mode 100644 umem.c > create mode 100644 umem.h > > diff --git a/Makefile.target b/Makefile.target > index 3261383..d94c53f 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -4,6 +4,7 @@ GENERATED_HEADERS = config-target.h > CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y) > CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y) > CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y) > +CONFIG_NO_POSTCOPY = $(if $(subst n,,$(CONFIG_POSTCOPY)),n,y) > > include ../config-host.mak > include config-devices.mak > @@ -199,6 +200,9 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o > obj-y += memory.o > LIBS+=-lz > > +common-obj-$(CONFIG_POSTCOPY) += migration-postcopy.o umem.o > +common-obj-$(CONFIG_NO_POSTCOPY) += migration-postcopy-stub.o > + > QEMU_CFLAGS += $(VNC_TLS_CFLAGS) > QEMU_CFLAGS += $(VNC_SASL_CFLAGS) > QEMU_CFLAGS += $(VNC_JPEG_CFLAGS) > diff --git a/arch_init.c b/arch_init.c > index bc53092..8b3130d 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -102,6 +102,13 @@ static int is_dup_page(uint8_t *page, uint8_t ch) > return 1; > } > > +static bool outgoing_postcopy = false; > + > +void ram_save_set_params(const MigrationParams *params, void *opaque) > +{ > + outgoing_postcopy = params->postcopy; > +} > + > static RAMBlock *last_block_sent = NULL; > > int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) > @@ -284,6 +291,17 
@@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > uint64_t expected_time = 0; > int ret; > > + if (stage == 1) { > + last_block_sent = NULL; > + > + bytes_transferred = 0; > + last_block = NULL; > + last_offset = 0; Changing of line order + new empty line > + } > + if (outgoing_postcopy) { > + return postcopy_outgoing_ram_save_live(mon, f, stage, opaque); > + } > + I would just do : unregister_savevm_live and then register_savevm_live(...,postcopy_outgoing_ram_save_live,...) when starting outgoing postcopy migration. > if (stage < 0) { > cpu_physical_memory_set_dirty_tracking(0); > return 0; > @@ -295,10 +313,6 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > } > > if (stage == 1) { > - bytes_transferred = 0; > - last_block_sent = NULL; > - last_block = NULL; > - last_offset = 0; > sort_ram_list(); > > /* Make sure all dirty bits are set */ > @@ -436,6 +450,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id) > int flags; > int error; > > + if (incoming_postcopy) { > + return postcopy_incoming_ram_load(f, opaque, version_id); > + } > + why not call register_savevm_live(...,postcopy_incoming_ram_load,...) 
when starting guest with postcopy_incoming > if (version_id < 3 || version_id > RAM_SAVE_VERSION_ID) { > return -EINVAL; > } > diff --git a/cpu-all.h b/cpu-all.h > index 0244f7a..2e9d8a7 100644 > --- a/cpu-all.h > +++ b/cpu-all.h > @@ -475,6 +475,9 @@ extern ram_addr_t ram_size; > /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ > #define RAM_PREALLOC_MASK (1 << 0) > > +/* RAM is allocated via umem for postcopy incoming mode */ > +#define RAM_POSTCOPY_UMEM_MASK (1 << 1) > + > typedef struct RAMBlock { > uint8_t *host; > ram_addr_t offset; > @@ -485,6 +488,10 @@ typedef struct RAMBlock { > #if defined(__linux__) && !defined(TARGET_S390X) > int fd; > #endif > + > +#ifdef CONFIG_POSTCOPY > + UMem *umem; /* for incoming postcopy mode */ > +#endif > } RAMBlock; > > typedef struct RAMList { > diff --git a/exec.c b/exec.c > index c8c6692..90b0491 100644 > --- a/exec.c > +++ b/exec.c > @@ -35,6 +35,7 @@ > #include "qemu-timer.h" > #include "memory.h" > #include "exec-memory.h" > +#include "migration.h" > #if defined(CONFIG_USER_ONLY) > #include > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > @@ -2949,6 +2950,13 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, > new_block->host = host; > new_block->flags |= RAM_PREALLOC_MASK; > } else { > +#ifdef CONFIG_POSTCOPY > + if (incoming_postcopy) { > + postcopy_incoming_ram_alloc(name, size, > + &new_block->host, &new_block->umem); > + new_block->flags |= RAM_POSTCOPY_UMEM_MASK; > + } else > +#endif > if (mem_path) { > #if defined (__linux__) && !defined(TARGET_S390X) > new_block->host = file_ram_alloc(new_block, size, mem_path); > @@ -3027,7 +3035,13 @@ void qemu_ram_free(ram_addr_t addr) > QLIST_REMOVE(block, next); > if (block->flags & RAM_PREALLOC_MASK) { > ; > - } else if (mem_path) { > + } > +#ifdef CONFIG_POSTCOPY > + else if (block->flags & RAM_POSTCOPY_UMEM_MASK) { > + postcopy_incoming_ram_free(block->umem); > + } > +#endif > + else if (mem_path) { > #if defined 
(__linux__) && !defined(TARGET_S390X) > if (block->fd) { > munmap(block->host, block->length); > @@ -3073,6 +3087,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) > } else { > flags = MAP_FIXED; > munmap(vaddr, length); > + if (block->flags & RAM_POSTCOPY_UMEM_MASK) { > + postcopy_incoming_qemu_pages_unmapped(addr, length); > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; > + } > if (mem_path) { > #if defined(__linux__) && !defined(TARGET_S390X) > if (block->fd) { > diff --git a/migration-exec.c b/migration-exec.c > index e14552e..2bd0c3b 100644 > --- a/migration-exec.c > +++ b/migration-exec.c > @@ -62,6 +62,10 @@ int exec_start_outgoing_migration(MigrationState *s, const char *command) > { > FILE *f; > > + if (s->params.postcopy) { > + return -ENOSYS; > + } > + > f = popen(command, "w"); > if (f == NULL) { > DPRINTF("Unable to popen exec target\n"); > @@ -104,6 +108,10 @@ int exec_start_incoming_migration(const char *command) > { > QEMUFile *f; > > + if (incoming_postcopy) { > + return -ENOSYS; > + } > + > DPRINTF("Attempting to start an incoming migration\n"); > f = qemu_popen_cmd(command, "r"); > if(f == NULL) { > diff --git a/migration-fd.c b/migration-fd.c > index 6211124..5a62ab9 100644 > --- a/migration-fd.c > +++ b/migration-fd.c > @@ -88,6 +88,23 @@ int fd_start_outgoing_migration(MigrationState *s, const char *fdname) > s->write = fd_write; > s->close = fd_close; > > + if (s->params.postcopy) { > + int flags = fcntl(s->fd, F_GETFL); > + if ((flags & O_ACCMODE) != O_RDWR) { > + goto err_after_open; > + } > + > + s->fd_read = dup(s->fd); > + if (s->fd_read == -1) { > + goto err_after_open; > + } > + s->file_read = qemu_fdopen(s->fd_read, "r"); > + if (s->file_read == NULL) { > + close(s->fd_read); > + goto err_after_open; > + } > + } > + > migrate_fd_connect(s); > return 0; > > @@ -103,7 +120,14 @@ static void fd_accept_incoming_migration(void *opaque) > > process_incoming_migration(f); > qemu_set_fd_handler2(qemu_stdio_fd(f), NULL, NULL, NULL, 
NULL); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(qemu_stdio_fd(f), f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_ready(); > + } > + return; > } > > int fd_start_incoming_migration(const char *infd) > @@ -114,6 +138,12 @@ int fd_start_incoming_migration(const char *infd) > DPRINTF("Attempting to start an incoming migration via fd\n"); > > fd = strtol(infd, NULL, 0); > + if (incoming_postcopy) { > + int flags = fcntl(fd, F_GETFL); > + if ((flags & O_ACCMODE) != O_RDWR) { > + return -EINVAL; > + } > + } > f = qemu_fdopen(fd, "rb"); > if(f == NULL) { > DPRINTF("Unable to apply qemu wrapper to file descriptor\n"); > diff --git a/migration-postcopy-stub.c b/migration-postcopy-stub.c > new file mode 100644 > index 0000000..0b78de7 > --- /dev/null > +++ b/migration-postcopy-stub.c > @@ -0,0 +1,77 @@ > +/* > + * migration-postcopy-stub.c: postcopy livemigration > + * stub functions for non-supported hosts > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . 
> + */ > + > +#include "sysemu.h" > +#include "migration.h" > + > +int postcopy_outgoing_create_read_socket(MigrationState *s) > +{ > + return -ENOSYS; > +} > + > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque) > +{ > + return -ENOSYS; > +} > + > +void *postcopy_outgoing_begin(MigrationState *ms) > +{ > + return NULL; > +} > + > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy) > +{ > + return -ENOSYS; > +} > + > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) > +{ > + return -ENOSYS; > +} > + > +void postcopy_incoming_prepare(void) > +{ > +} > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) > +{ > + return -ENOSYS; > +} > + > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) > +{ > +} > + > +void postcopy_incoming_qemu_ready(void) > +{ > +} > + > +void postcopy_incoming_qemu_cleanup(void) > +{ > +} > + > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) > +{ > +} > diff --git a/migration-postcopy.c b/migration-postcopy.c > new file mode 100644 > index 0000000..ed0d574 > --- /dev/null > +++ b/migration-postcopy.c > @@ -0,0 +1,1891 @@ > +/* > + * migration-postcopy.c: postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. 
> + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . > + */ > + > +#include "bitmap.h" > +#include "sysemu.h" > +#include "hw/hw.h" > +#include "arch_init.h" > +#include "migration.h" > +#include "umem.h" > + > +#include "memory.h" > +#define WANT_EXEC_OBSOLETE > +#include "exec-obsolete.h" > + > +//#define DEBUG_POSTCOPY > +#ifdef DEBUG_POSTCOPY > +#include > +#define DPRINTF(fmt, ...) \ > + do { \ > + printf("%d:%ld %s:%d: " fmt, getpid(), syscall(SYS_gettid), \ > + __func__, __LINE__, ## __VA_ARGS__); \ > + } while (0) > +#else > +#define DPRINTF(fmt, ...) do { } while (0) > +#endif > + > +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) > + > +static void fd_close(int *fd) > +{ > + if (*fd >= 0) { > + close(*fd); > + *fd = -1; > + } > +} > + > +/*************************************************************************** > + * QEMUFile for non blocking pipe > + */ > + > +/* read only */ > +struct QEMUFilePipe { > + int fd; > + QEMUFile *file; > +}; Why not use QEMUFileSocket ? 
> +typedef struct QEMUFilePipe QEMUFilePipe; > + > +static int pipe_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) > +{ > + QEMUFilePipe *s = opaque; > + ssize_t len = 0; > + > + while (size > 0) { > + ssize_t ret = read(s->fd, buf, size); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } > + if (len == 0) { > + len = -errno; > + } > + break; > + } > + > + if (ret == 0) { > + /* the write end of the pipe is closed */ > + break; > + } > + len += ret; > + buf += ret; > + size -= ret; > + } > + > + return len; > +} > + > +static int pipe_close(void *opaque) > +{ > + QEMUFilePipe *s = opaque; > + g_free(s); > + return 0; > +} > + > +static QEMUFile *qemu_fopen_pipe(int fd) > +{ > + QEMUFilePipe *s = g_malloc0(sizeof(*s)); > + > + s->fd = fd; > + fcntl_setfl(fd, O_NONBLOCK); > + s->file = qemu_fopen_ops(s, NULL, pipe_get_buffer, pipe_close, > + NULL, NULL, NULL); > + return s->file; > +} > + > +/* write only */ > +struct QEMUFileNonblock { > + int fd; > + QEMUFile *file; > + > + /* for pipe-write nonblocking mode */ > +#define BUF_SIZE_INC (32 * 1024) /* = IO_BUF_SIZE */ > + uint8_t *buffer; > + size_t buffer_size; > + size_t buffer_capacity; > + bool freeze_output; > +}; > +typedef struct QEMUFileNonblock QEMUFileNonblock; > + Couldn't you use QEMUFileBuffered ? 
> +static void nonblock_flush_buffer(QEMUFileNonblock *s) > +{ > + size_t offset = 0; > + ssize_t ret; > + > + while (offset < s->buffer_size) { > + ret = write(s->fd, s->buffer + offset, s->buffer_size - offset); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EAGAIN) { > + s->freeze_output = true; > + } else { > + qemu_file_set_error(s->file, errno); > + } > + break; > + } > + > + if (ret == 0) { > + DPRINTF("ret == 0\n"); > + break; > + } > + > + offset += ret; > + } > + > + if (offset > 0) { > + assert(s->buffer_size >= offset); > + memmove(s->buffer, s->buffer + offset, s->buffer_size - offset); > + s->buffer_size -= offset; > + } > + if (s->buffer_size > 0) { > + s->freeze_output = true; > + } > +} > + > +static int nonblock_put_buffer(void *opaque, > + const uint8_t *buf, int64_t pos, int size) > +{ > + QEMUFileNonblock *s = opaque; > + int error; > + ssize_t len = 0; > + > + error = qemu_file_get_error(s->file); > + if (error) { > + return error; > + } > + > + nonblock_flush_buffer(s); > + error = qemu_file_get_error(s->file); > + if (error) { > + return error; > + } > + > + while (!s->freeze_output && size > 0) { > + ssize_t ret; > + assert(s->buffer_size == 0); > + > + ret = write(s->fd, buf, size); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EAGAIN) { > + s->freeze_output = true; > + } else { > + qemu_file_set_error(s->file, errno); > + } > + break; > + } > + > + len += ret; > + buf += ret; > + size -= ret; > + } > + > + if (size > 0) { > + int inc = size - (s->buffer_capacity - s->buffer_size); > + if (inc > 0) { > + s->buffer_capacity += > + DIV_ROUND_UP(inc, BUF_SIZE_INC) * BUF_SIZE_INC; > + s->buffer = g_realloc(s->buffer, s->buffer_capacity); > + } > + memcpy(s->buffer + s->buffer_size, buf, size); > + s->buffer_size += size; > + > + len += size; > + } > + > + return len; > +} > + > +static int nonblock_pending_size(QEMUFileNonblock *s) > +{ > + return 
qemu_pending_size(s->file) + s->buffer_size; > +} > + > +static void nonblock_fflush(QEMUFileNonblock *s) > +{ > + s->freeze_output = false; > + nonblock_flush_buffer(s); > + if (!s->freeze_output) { > + qemu_fflush(s->file); > + } > +} > + > +static void nonblock_wait_for_flush(QEMUFileNonblock *s) > +{ > + while (nonblock_pending_size(s) > 0) { > + fd_set fds; > + FD_ZERO(&fds); > + FD_SET(s->fd, &fds); > + select(s->fd + 1, NULL, &fds, NULL, NULL); > + > + nonblock_fflush(s); > + } > +} > + > +static int nonblock_close(void *opaque) > +{ > + QEMUFileNonblock *s = opaque; > + nonblock_wait_for_flush(s); > + g_free(s->buffer); > + g_free(s); > + return 0; > +} > + > +static QEMUFileNonblock *qemu_fopen_nonblock(int fd) > +{ > + QEMUFileNonblock *s = g_malloc0(sizeof(*s)); > + > + s->fd = fd; > + fcntl_setfl(fd, O_NONBLOCK); > + s->file = qemu_fopen_ops(s, nonblock_put_buffer, NULL, nonblock_close, > + NULL, NULL, NULL); > + return s; > +} > + > +/*************************************************************************** > + * umem daemon on destination <-> qemu on source protocol > + */ > + > +#define QEMU_UMEM_REQ_INIT 0x00 > +#define QEMU_UMEM_REQ_ON_DEMAND 0x01 > +#define QEMU_UMEM_REQ_ON_DEMAND_CONT 0x02 > +#define QEMU_UMEM_REQ_BACKGROUND 0x03 > +#define QEMU_UMEM_REQ_BACKGROUND_CONT 0x04 > +#define QEMU_UMEM_REQ_REMOVE 0x05 > +#define QEMU_UMEM_REQ_EOC 0x06 > + > +struct qemu_umem_req { > + int8_t cmd; > + uint8_t len; > + char *idstr; /* ON_DEMAND, BACKGROUND, REMOVE */ > + uint32_t nr; /* ON_DEMAND, ON_DEMAND_CONT, > + BACKGROUND, BACKGROUND_CONT, REMOVE */ > + > + /* in target page size as qemu migration protocol */ > + uint64_t *pgoffs; /* ON_DEMAND, ON_DEMAND_CONT, > + BACKGROUND, BACKGROUND_CONT, REMOVE */ > +}; > + > +static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr) > +{ > + qemu_put_byte(f, strlen(idstr)); > + qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr)); > +} > + > +static void 
postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr, > + const uint64_t *pgoffs) > +{ > + uint32_t i; > + > + qemu_put_be32(f, nr); > + for (i = 0; i < nr; i++) { > + qemu_put_be64(f, pgoffs[i]); > + } > +} > + > +static void postcopy_incoming_send_req_one(QEMUFile *f, > + const struct qemu_umem_req *req) > +{ > + DPRINTF("cmd %d\n", req->cmd); > + qemu_put_byte(f, req->cmd); > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + case QEMU_UMEM_REQ_REMOVE: > + postcopy_incoming_send_req_idstr(f, req->idstr); > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); > + break; > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs); > + break; > + default: > + abort(); > + break; > + } > +} > + > +/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024. > + * So one message size must be <= IO_BUF_SIZE > + * cmd: 1 > + * id len: 1 > + * id: 256 > + * nr: 2 > + */ > +#define MAX_PAGE_NR ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t)) > +static void postcopy_incoming_send_req(QEMUFile *f, > + const struct qemu_umem_req *req) > +{ > + uint32_t nr = req->nr; > + struct qemu_umem_req tmp = *req; > + > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + postcopy_incoming_send_req_one(f, &tmp); > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + tmp.nr = MIN(nr, MAX_PAGE_NR); > + postcopy_incoming_send_req_one(f, &tmp); > + > + nr -= tmp.nr; > + tmp.pgoffs += tmp.nr; > + if (tmp.cmd == QEMU_UMEM_REQ_ON_DEMAND) { > + tmp.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; > + }else { > + tmp.cmd = QEMU_UMEM_REQ_BACKGROUND_CONT; > + } > + /* fall through */ > + case QEMU_UMEM_REQ_REMOVE: > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + while (nr > 0) { > + tmp.nr = MIN(nr, 
MAX_PAGE_NR); > + postcopy_incoming_send_req_one(f, &tmp); > + > + nr -= tmp.nr; > + tmp.pgoffs += tmp.nr; > + } > + break; > + default: > + abort(); > + break; > + } > +} > + > +static int postcopy_outgoing_recv_req_idstr(QEMUFile *f, > + struct qemu_umem_req *req, > + size_t *offset) > +{ > + int ret; > + > + req->len = qemu_peek_byte(f, *offset); > + *offset += 1; > + if (req->len == 0) { > + return -EAGAIN; > + } > + req->idstr = g_malloc((int)req->len + 1); > + ret = qemu_peek_buffer(f, (uint8_t*)req->idstr, req->len, *offset); > + *offset += ret; > + if (ret != req->len) { > + g_free(req->idstr); > + req->idstr = NULL; > + return -EAGAIN; > + } > + req->idstr[req->len] = 0; > + return 0; > +} > + > +static int postcopy_outgoing_recv_req_pgoffs(QEMUFile *f, > + struct qemu_umem_req *req, > + size_t *offset) > +{ > + int ret; > + uint32_t be32; > + uint32_t i; > + > + ret = qemu_peek_buffer(f, (uint8_t*)&be32, sizeof(be32), *offset); > + *offset += sizeof(be32); > + if (ret != sizeof(be32)) { > + return -EAGAIN; > + } > + > + req->nr = be32_to_cpu(be32); > + req->pgoffs = g_new(uint64_t, req->nr); > + for (i = 0; i < req->nr; i++) { > + uint64_t be64; > + ret = qemu_peek_buffer(f, (uint8_t*)&be64, sizeof(be64), *offset); > + *offset += sizeof(be64); > + if (ret != sizeof(be64)) { > + g_free(req->pgoffs); > + req->pgoffs = NULL; > + return -EAGAIN; > + } > + req->pgoffs[i] = be64_to_cpu(be64); > + } > + return 0; > +} > + > +static int postcopy_outgoing_recv_req(QEMUFile *f, struct qemu_umem_req *req) > +{ > + int size; > + int ret; > + size_t offset = 0; > + > + size = qemu_peek_buffer(f, (uint8_t*)&req->cmd, 1, offset); > + if (size <= 0) { > + return -EAGAIN; > + } > + offset += 1; > + > + switch (req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + case QEMU_UMEM_REQ_EOC: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + case QEMU_UMEM_REQ_REMOVE: > + ret = postcopy_outgoing_recv_req_idstr(f, req, &offset); > + if 
(ret < 0) { > + return ret; > + } > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); > + if (ret < 0) { > + return ret; > + } > + break; > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + ret = postcopy_outgoing_recv_req_pgoffs(f, req, &offset); > + if (ret < 0) { > + return ret; > + } > + break; > + default: > + abort(); > + break; > + } > + qemu_file_skip(f, offset); > + DPRINTF("cmd %d\n", req->cmd); > + return 0; > +} > + > +static void postcopy_outgoing_free_req(struct qemu_umem_req *req) > +{ > + g_free(req->idstr); > + g_free(req->pgoffs); > +} > + > +/*************************************************************************** > + * outgoing part > + */ > + > +#define QEMU_SAVE_LIVE_STAGE_START 0x01 /* = QEMU_VM_SECTION_START */ > +#define QEMU_SAVE_LIVE_STAGE_PART 0x02 /* = QEMU_VM_SECTION_PART */ > +#define QEMU_SAVE_LIVE_STAGE_END 0x03 /* = QEMU_VM_SECTION_END */ > + > +enum POState { > + PO_STATE_ERROR_RECEIVE, > + PO_STATE_ACTIVE, > + PO_STATE_EOC_RECEIVED, > + PO_STATE_ALL_PAGES_SENT, > + PO_STATE_COMPLETED, > +}; > +typedef enum POState POState; > + > +struct PostcopyOutgoingState { > + POState state; > + QEMUFile *mig_read; > + int fd_read; > + RAMBlock *last_block_read; > + > + QEMUFile *mig_buffered_write; > + MigrationState *ms; > + > + /* For nobg mode. 
Check if all pages are sent */ > + RAMBlock *block; > + ram_addr_t addr; > +}; > +typedef struct PostcopyOutgoingState PostcopyOutgoingState; > + > +int postcopy_outgoing_create_read_socket(MigrationState *s) > +{ > + if (!s->params.postcopy) { > + return 0; > + } > + > + s->fd_read = dup(s->fd); > + if (s->fd_read == -1) { > + int ret = -errno; > + perror("dup"); > + return ret; > + } > + s->file_read = qemu_fopen_socket(s->fd_read); > + if (s->file_read == NULL) { > + return -EINVAL; > + } > + return 0; > +} > + > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque) > +{ > + int ret = 0; > + DPRINTF("stage %d\n", stage); > + if (stage == QEMU_SAVE_LIVE_STAGE_START) { > + sort_ram_list(); > + ram_save_live_mem_size(f); > + } > + if (stage == QEMU_SAVE_LIVE_STAGE_PART) { > + ret = 1; > + } > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + return ret; > +} > + > +static RAMBlock *postcopy_outgoing_find_block(const char *idstr) > +{ > + RAMBlock *block; > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (!strncmp(idstr, block->idstr, strlen(idstr))) { > + return block; > + } > + } > + return NULL; > +} > + > +/* > + * return value > + * 0: continue postcopy mode > + * > 0: completed postcopy mode. > + * < 0: error > + */ > +static int postcopy_outgoing_handle_req(PostcopyOutgoingState *s, > + const struct qemu_umem_req *req, > + bool *written) > +{ > + int i; > + RAMBlock *block; > + > + DPRINTF("cmd %d state %d\n", req->cmd, s->state); > + switch(req->cmd) { > + case QEMU_UMEM_REQ_INIT: > + /* nothing */ > + break; > + case QEMU_UMEM_REQ_EOC: > + /* tell to finish migration. 
*/ > + if (s->state == PO_STATE_ALL_PAGES_SENT) { > + s->state = PO_STATE_COMPLETED; > + DPRINTF("-> PO_STATE_COMPLETED\n"); > + } else { > + s->state = PO_STATE_EOC_RECEIVED; > + DPRINTF("-> PO_STATE_EOC_RECEIVED\n"); > + } > + return 1; > + case QEMU_UMEM_REQ_ON_DEMAND: > + case QEMU_UMEM_REQ_BACKGROUND: > + DPRINTF("idstr: %s\n", req->idstr); > + block = postcopy_outgoing_find_block(req->idstr); > + if (block == NULL) { > + return -EINVAL; > + } > + s->last_block_read = block; > + /* fall through */ > + case QEMU_UMEM_REQ_ON_DEMAND_CONT: > + case QEMU_UMEM_REQ_BACKGROUND_CONT: > + DPRINTF("nr %d\n", req->nr); > + for (i = 0; i < req->nr; i++) { > + DPRINTF("offs[%d] 0x%"PRIx64"\n", i, req->pgoffs[i]); > + int ret = ram_save_page(s->mig_buffered_write, s->last_block_read, > + req->pgoffs[i] << TARGET_PAGE_BITS); > + if (ret > 0) { > + *written = true; > + } > + } > + break; > + case QEMU_UMEM_REQ_REMOVE: > + block = postcopy_outgoing_find_block(req->idstr); > + if (block == NULL) { > + return -EINVAL; > + } > + for (i = 0; i < req->nr; i++) { > + ram_addr_t addr = block->offset + > + (req->pgoffs[i] << TARGET_PAGE_BITS); > + cpu_physical_memory_reset_dirty(addr, > + addr + TARGET_PAGE_SIZE, > + MIGRATION_DIRTY_FLAG); > + } > + break; > + default: > + return -EINVAL; > + } > + return 0; > +} > + > +static void postcopy_outgoing_close_mig_read(PostcopyOutgoingState *s) > +{ > + if (s->mig_read != NULL) { > + qemu_set_fd_handler(s->fd_read, NULL, NULL, NULL); > + qemu_fclose(s->mig_read); > + s->mig_read = NULL; > + fd_close(&s->fd_read); > + > + s->ms->file_read = NULL; > + s->ms->fd_read = -1; > + } > +} > + > +static void postcopy_outgoing_completed(PostcopyOutgoingState *s) > +{ > + postcopy_outgoing_close_mig_read(s); > + s->ms->postcopy = NULL; > + g_free(s); > +} > + > +static void postcopy_outgoing_recv_handler(void *opaque) > +{ > + PostcopyOutgoingState *s = opaque; > + bool written = false; > + int ret = 0; > + > + assert(s->state == PO_STATE_ACTIVE || > 
+ s->state == PO_STATE_ALL_PAGES_SENT); > + > + do { > + struct qemu_umem_req req = {.idstr = NULL, > + .pgoffs = NULL}; > + > + ret = postcopy_outgoing_recv_req(s->mig_read, &req); > + if (ret < 0) { > + if (ret == -EAGAIN) { > + ret = 0; > + } > + break; > + } > + if (s->state == PO_STATE_ACTIVE) { > + ret = postcopy_outgoing_handle_req(s, &req, &written); > + } > + postcopy_outgoing_free_req(&req); > + } while (ret == 0); > + > + /* > + * flush buffered_file. > + * Although mig_write is rate-limited buffered file, those written pages > + * are requested on demand by the destination. So forcibly push > + * those pages ignoring rate limiting > + */ > + if (written) { > + qemu_fflush(s->mig_buffered_write); > + /* qemu_buffered_file_drain(s->mig_buffered_write); */ > + } > + > + if (ret < 0) { > + switch (s->state) { > + case PO_STATE_ACTIVE: > + s->state = PO_STATE_ERROR_RECEIVE; > + DPRINTF("-> PO_STATE_ERROR_RECEIVE\n"); > + break; > + case PO_STATE_ALL_PAGES_SENT: > + s->state = PO_STATE_COMPLETED; > + DPRINTF("-> PO_STATE_ALL_PAGES_SENT\n"); > + break; > + default: > + abort(); > + } > + } > + if (s->state == PO_STATE_ERROR_RECEIVE || s->state == PO_STATE_COMPLETED) { > + postcopy_outgoing_close_mig_read(s); > + } > + if (s->state == PO_STATE_COMPLETED) { > + DPRINTF("PO_STATE_COMPLETED\n"); > + MigrationState *ms = s->ms; > + postcopy_outgoing_completed(s); > + migrate_fd_completed(ms); > + } > +} > + > +void *postcopy_outgoing_begin(MigrationState *ms) > +{ > + PostcopyOutgoingState *s = g_new(PostcopyOutgoingState, 1); > + DPRINTF("outgoing begin\n"); > + qemu_fflush(ms->file); > + > + s->ms = ms; > + s->state = PO_STATE_ACTIVE; > + s->fd_read = ms->fd_read; > + s->mig_read = ms->file_read; > + s->mig_buffered_write = ms->file; > + s->block = NULL; > + s->addr = 0; > + > + /* Make sure all dirty bits are set */ > + ram_save_memory_set_dirty(); > + > + qemu_set_fd_handler(s->fd_read, > + &postcopy_outgoing_recv_handler, NULL, s); > + return s; > +} > + > 
+static void postcopy_outgoing_ram_all_sent(QEMUFile *f, > + PostcopyOutgoingState *s) > +{ > + assert(s->state == PO_STATE_ACTIVE); > + > + s->state = PO_STATE_ALL_PAGES_SENT; > + /* tell incoming side that all pages are sent */ > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + qemu_fflush(f); > + qemu_buffered_file_drain(f); > + DPRINTF("sent RAM_SAVE_FLAG_EOS\n"); > + migrate_fd_cleanup(s->ms); > + > + /* Later migrate_fd_complete() will be called which calls > + * migrate_fd_cleanup() again. So dummy file is created > + * for qemu monitor to keep working. > + */ > + s->ms->file = qemu_fopen_ops(NULL, NULL, NULL, NULL, NULL, > + NULL, NULL); > +} > + > +static int postcopy_outgoing_check_all_ram_sent(PostcopyOutgoingState *s, > + RAMBlock *block, > + ram_addr_t addr) > +{ > + if (block == NULL) { > + block = QLIST_FIRST(&ram_list.blocks); > + addr = block->offset; > + } > + > + for (; block != NULL; > + s->block = QLIST_NEXT(s->block, next), addr = block->offset) { > + for (; addr < block->offset + block->length; > + addr += TARGET_PAGE_SIZE) { > + if (cpu_physical_memory_get_dirty(addr, MIGRATION_DIRTY_FLAG)) { > + s->block = block; > + s->addr = addr; > + return 0; > + } > + } > + } > + > + return 1; > +} > + > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy) > +{ > + PostcopyOutgoingState *s = postcopy; > + > + assert(s->state == PO_STATE_ACTIVE || > + s->state == PO_STATE_EOC_RECEIVED || > + s->state == PO_STATE_ERROR_RECEIVE); > + > + switch (s->state) { > + case PO_STATE_ACTIVE: > + /* nothing. 
processed below */ > + break; > + case PO_STATE_EOC_RECEIVED: > + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); > + s->state = PO_STATE_COMPLETED; > + postcopy_outgoing_completed(s); > + DPRINTF("PO_STATE_COMPLETED\n"); > + return 1; > + case PO_STATE_ERROR_RECEIVE: > + postcopy_outgoing_completed(s); > + DPRINTF("PO_STATE_ERROR_RECEIVE\n"); > + return -1; > + default: > + abort(); > + } > + > + if (s->ms->params.nobg) { > + /* See if all pages are sent. */ > + if (postcopy_outgoing_check_all_ram_sent(s, s->block, s->addr) == 0) { > + return 0; > + } > + /* ram_list can be reordered. (it doesn't seem so during migration, > + though) So the whole list needs to be checked again */ > + if (postcopy_outgoing_check_all_ram_sent(s, NULL, 0) == 0) { > + return 0; > + } > + > + postcopy_outgoing_ram_all_sent(f, s); > + return 0; > + } > + > + DPRINTF("outgoing background state: %d\n", s->state); > + > + while (qemu_file_rate_limit(f) == 0) { > + if (ram_save_block(f) == 0) { /* no more blocks */ > + assert(s->state == PO_STATE_ACTIVE); > + postcopy_outgoing_ram_all_sent(f, s); > + return 0; > + } > + } > + > + return 0; > +} > + > +/*************************************************************************** > + * incoming part > + */ > + > +/* flags for incoming mode to modify the behavior. 
> + This is for benchmark/debug purpose */ > +#define INCOMING_FLAGS_FAULT_REQUEST 0x01 > + > + > +static void postcopy_incoming_umemd(void); > + > +#define PIS_STATE_QUIT_RECEIVED 0x01 > +#define PIS_STATE_QUIT_QUEUED 0x02 > +#define PIS_STATE_QUIT_SENT 0x04 > + > +#define PIS_STATE_QUIT_MASK (PIS_STATE_QUIT_RECEIVED | \ > + PIS_STATE_QUIT_QUEUED | \ > + PIS_STATE_QUIT_SENT) > + > +struct PostcopyIncomingState { > + /* dest qemu state */ > + uint32_t state; > + > + UMemDev *dev; > + int host_page_size; > + int host_page_shift; > + > + /* qemu side */ > + int to_umemd_fd; > + QEMUFileNonblock *to_umemd; > +#define MAX_FAULTED_PAGES 256 > + struct umem_pages *faulted_pages; > + > + int from_umemd_fd; > + QEMUFile *from_umemd; > + int version_id; /* save/load format version id */ > +}; > +typedef struct PostcopyIncomingState PostcopyIncomingState; > + > + > +#define UMEM_STATE_EOS_RECEIVED 0x01 /* umem daemon <-> src qemu */ > +#define UMEM_STATE_EOC_SENT 0x02 /* umem daemon <-> src qemu */ > +#define UMEM_STATE_QUIT_RECEIVED 0x04 /* umem daemon <-> dst qemu */ > +#define UMEM_STATE_QUIT_QUEUED 0x08 /* umem daemon <-> dst qemu */ > +#define UMEM_STATE_QUIT_SENT 0x10 /* umem daemon <-> dst qemu */ > + > +#define UMEM_STATE_QUIT_MASK (UMEM_STATE_QUIT_QUEUED | \ > + UMEM_STATE_QUIT_SENT | \ > + UMEM_STATE_QUIT_RECEIVED) > +#define UMEM_STATE_END_MASK (UMEM_STATE_EOS_RECEIVED | \ > + UMEM_STATE_EOC_SENT | \ > + UMEM_STATE_QUIT_MASK) > + > +struct PostcopyIncomingUMemDaemon { > + /* umem daemon side */ > + uint32_t state; > + > + int host_page_size; > + int host_page_shift; > + int nr_host_pages_per_target_page; > + int host_to_target_page_shift; > + int nr_target_pages_per_host_page; > + int target_to_host_page_shift; > + int version_id; /* save/load format version id */ > + > + int to_qemu_fd; > + QEMUFileNonblock *to_qemu; > + int from_qemu_fd; > + QEMUFile *from_qemu; > + > + int mig_read_fd; > + QEMUFile *mig_read; /* qemu on source -> umem daemon */ > + > + int 
mig_write_fd; > + QEMUFileNonblock *mig_write; /* umem daemon -> qemu on source */ > + > + /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */ > +#define MAX_REQUESTS (512 * (64 + 1)) > + > + struct umem_page_request page_request; > + struct umem_page_cached page_cached; > + > +#define MAX_PRESENT_REQUESTS MAX_FAULTED_PAGES > + struct umem_pages *present_request; > + > + uint64_t *target_pgoffs; > + > + /* bitmap indexed by target page offset */ > + unsigned long *phys_requested; > + > + /* bitmap indexed by target page offset */ > + unsigned long *phys_received; > + > + RAMBlock *last_block_read; /* qemu on source -> umem daemon */ > + RAMBlock *last_block_write; /* umem daemon -> qemu on source */ > +}; > +typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon; > + > +static PostcopyIncomingState state = { > + .state = 0, > + .dev = NULL, > + .to_umemd_fd = -1, > + .to_umemd = NULL, > + .from_umemd_fd = -1, > + .from_umemd = NULL, > +}; > + > +static PostcopyIncomingUMemDaemon umemd = { > + .state = 0, > + .to_qemu_fd = -1, > + .to_qemu = NULL, > + .from_qemu_fd = -1, > + .from_qemu = NULL, > + .mig_read_fd = -1, > + .mig_read = NULL, > + .mig_write_fd = -1, > + .mig_write = NULL, > +}; > + > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy) > +{ > + /* incoming_postcopy makes sense only when incoming migration mode */ > + if (!incoming && incoming_postcopy) { > + return -EINVAL; > + } > + > + if (!incoming_postcopy) { > + return 0; > + } > + > + state.state = 0; > + state.dev = umem_dev_new(); > + state.host_page_size = getpagesize(); > + state.host_page_shift = ffs(state.host_page_size) - 1; > + state.version_id = RAM_SAVE_VERSION_ID; /* = save version of > + ram_save_live() */ > + return 0; > +} > + > +void postcopy_incoming_ram_alloc(const char *name, > + size_t size, uint8_t **hostp, UMem **umemp) > +{ > + UMem *umem; > + size = ALIGN_UP(size, state.host_page_size); > + umem = umem_dev_create(state.dev, size, name); > + > + 
*umemp = umem; > + *hostp = umem->umem; > +} > + > +void postcopy_incoming_ram_free(UMem *umem) > +{ > + umem_unmap(umem); > + umem_close(umem); > + umem_destroy(umem); > +} > + > +void postcopy_incoming_prepare(void) > +{ > + RAMBlock *block; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + umem_mmap(block->umem); > + } > + } > +} > + > +static int postcopy_incoming_ram_load_get64(QEMUFile *f, > + ram_addr_t *addr, int *flags) > +{ > + *addr = qemu_get_be64(f); > + *flags = *addr & ~TARGET_PAGE_MASK; > + *addr &= TARGET_PAGE_MASK; > + return qemu_file_get_error(f); > +} > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id) > +{ > + ram_addr_t addr; > + int flags; > + int error; > + > + DPRINTF("incoming ram load\n"); > + /* > + * RAM_SAVE_FLAGS_EOS or > + * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS > + * see postcopy_outgoing_ram_save_live() > + */ > + > + if (version_id != RAM_SAVE_VERSION_ID) { > + DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n", > + version_id, RAM_SAVE_VERSION_ID); > + return -EINVAL; > + } > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); > + DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags); > + if (error) { > + DPRINTF("error %d\n", error); > + return error; > + } > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { > + DPRINTF("EOS\n"); > + return 0; > + } > + > + if (flags != RAM_SAVE_FLAG_MEM_SIZE) { > + DPRINTF("-EINVAL flags 0x%x\n", flags); > + return -EINVAL; > + } > + error = ram_load_mem_size(f, addr); > + if (error) { > + DPRINTF("addr 0x%lx error %d\n", addr, error); > + return error; > + } > + > + error = postcopy_incoming_ram_load_get64(f, &addr, &flags); > + if (error) { > + DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error); > + return error; > + } > + if (flags == RAM_SAVE_FLAG_EOS && addr == 0) { > + DPRINTF("done\n"); > + return 0; > + } > + DPRINTF("-EINVAL\n"); > + return -EINVAL; > +} > + > +void 
postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read) > +{ > + int fds[2]; > + RAMBlock *block; > + > + DPRINTF("fork\n"); > + > + /* socketpair(AF_UNIX)? */ > + > + if (qemu_pipe(fds) == -1) { > + perror("qemu_pipe"); > + abort(); > + } > + state.from_umemd_fd = fds[0]; > + umemd.to_qemu_fd = fds[1]; > + > + if (qemu_pipe(fds) == -1) { > + perror("qemu_pipe"); > + abort(); > + } > + umemd.from_qemu_fd = fds[0]; > + state.to_umemd_fd = fds[1]; > + > + pid_t child = fork(); > + if (child < 0) { > + perror("fork"); > + abort(); > + } > + > + if (child == 0) { > + int mig_write_fd; > + > + fd_close(&state.to_umemd_fd); > + fd_close(&state.from_umemd_fd); > + umemd.host_page_size = state.host_page_size; > + umemd.host_page_shift = state.host_page_shift; > + > + umemd.nr_host_pages_per_target_page = > + TARGET_PAGE_SIZE / umemd.host_page_size; > + umemd.nr_target_pages_per_host_page = > + umemd.host_page_size / TARGET_PAGE_SIZE; > + > + umemd.target_to_host_page_shift = > + ffs(umemd.nr_host_pages_per_target_page) - 1; > + umemd.host_to_target_page_shift = > + ffs(umemd.nr_target_pages_per_host_page) - 1; > + > + umemd.state = 0; > + umemd.version_id = state.version_id; > + umemd.mig_read_fd = mig_read_fd; > + umemd.mig_read = mig_read; > + > + mig_write_fd = dup(mig_read_fd); > + if (mig_write_fd < 0) { > + perror("could not dup for writable socket \n"); > + abort(); > + } > + umemd.mig_write_fd = mig_write_fd; > + umemd.mig_write = qemu_fopen_nonblock(mig_write_fd); > + > + postcopy_incoming_umemd(); /* noreturn */ > + } > + > + DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child); > + fd_close(&umemd.to_qemu_fd); > + fd_close(&umemd.from_qemu_fd); > + state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES)); > + state.faulted_pages->nr = 0; > + > + /* close all UMem.shmem_fd */ > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + umem_close_shmem(block->umem); > + } > + umem_qemu_wait_for_daemon(state.from_umemd_fd); > +} > + > +static 
void postcopy_incoming_qemu_recv_quit(void) > +{ > + RAMBlock *block; > + if (state.state & PIS_STATE_QUIT_RECEIVED) { > + return; > + } > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + umem_destroy(block->umem); > + block->umem = NULL; > + block->flags &= ~RAM_POSTCOPY_UMEM_MASK; > + } > + } > + > + DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n"); > + state.state |= PIS_STATE_QUIT_RECEIVED; > + qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL); > + qemu_fclose(state.from_umemd); > + state.from_umemd = NULL; > + fd_close(&state.from_umemd_fd); > +} > + > +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque) > +{ > + assert(state.to_umemd != NULL); > + > + nonblock_fflush(state.to_umemd); > + if (nonblock_pending_size(state.to_umemd) > 0) { > + return; > + } > + > + qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL); > + if (state.state & PIS_STATE_QUIT_QUEUED) { > + DPRINTF("|= PIS_STATE_QUIT_SENT\n"); > + state.state |= PIS_STATE_QUIT_SENT; > + qemu_fclose(state.to_umemd->file); > + state.to_umemd = NULL; > + fd_close(&state.to_umemd_fd); > + g_free(state.faulted_pages); > + state.faulted_pages = NULL; > + } > +} > + > +static void postcopy_incoming_qemu_fflush_to_umemd(void) > +{ > + qemu_set_fd_handler(state.to_umemd->fd, NULL, > + postcopy_incoming_qemu_fflush_to_umemd_handler, NULL); > + postcopy_incoming_qemu_fflush_to_umemd_handler(NULL); > +} > + > +static void postcopy_incoming_qemu_queue_quit(void) > +{ > + if (state.state & PIS_STATE_QUIT_QUEUED) { > + return; > + } > + > + DPRINTF("|= PIS_STATE_QUIT_QUEUED\n"); > + umem_qemu_quit(state.to_umemd->file); > + state.state |= PIS_STATE_QUIT_QUEUED; > +} > + > +static void postcopy_incoming_qemu_send_pages_present(void) > +{ > + if (state.faulted_pages->nr > 0) { > + umem_qemu_send_pages_present(state.to_umemd->file, > + state.faulted_pages); > + state.faulted_pages->nr = 0; > + } > +} > + > +static void postcopy_incoming_qemu_faulted_pages( 
> + const struct umem_pages *pages) > +{ > + assert(pages->nr <= MAX_FAULTED_PAGES); > + assert(state.faulted_pages != NULL); > + > + if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) { > + postcopy_incoming_qemu_send_pages_present(); > + } > + memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr], > + &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr); > + state.faulted_pages->nr += pages->nr; > +} > + > +static void postcopy_incoming_qemu_cleanup_umem(void); > + > +static int postcopy_incoming_qemu_handle_req_one(void) > +{ > + int offset = 0; > + int ret; > + uint8_t cmd; > + > + ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset); > + offset += sizeof(cmd); > + if (ret != sizeof(cmd)) { > + return -EAGAIN; > + } > + DPRINTF("cmd %c\n", cmd); > + > + switch (cmd) { > + case UMEM_DAEMON_QUIT: > + postcopy_incoming_qemu_recv_quit(); > + postcopy_incoming_qemu_queue_quit(); > + postcopy_incoming_qemu_cleanup_umem(); > + break; > + case UMEM_DAEMON_TRIGGER_PAGE_FAULT: { > + struct umem_pages *pages = > + umem_qemu_trigger_page_fault(state.from_umemd, &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (state.to_umemd_fd >= 0 && !(state.state & PIS_STATE_QUIT_QUEUED)) { > + postcopy_incoming_qemu_faulted_pages(pages); > + g_free(pages); > + } > + break; > + } > + case UMEM_DAEMON_ERROR: > + /* umem daemon hit troubles, so it warned us to stop vm execution */ > + vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */ > + break; > + default: > + abort(); > + break; > + } > + > + if (state.from_umemd != NULL) { > + qemu_file_skip(state.from_umemd, offset); > + } > + return 0; > +} > + > +static void postcopy_incoming_qemu_handle_req(void *opaque) > +{ > + do { > + int ret = postcopy_incoming_qemu_handle_req_one(); > + if (ret == -EAGAIN) { > + break; > + } > + } while (state.from_umemd != NULL && > + qemu_pending_size(state.from_umemd) > 0); > + > + if (state.to_umemd != NULL) { > + if 
(state.faulted_pages->nr > 0) { > + postcopy_incoming_qemu_send_pages_present(); > + } > + postcopy_incoming_qemu_fflush_to_umemd(); > + } > +} > + > +void postcopy_incoming_qemu_ready(void) > +{ > + umem_qemu_ready(state.to_umemd_fd); > + > + state.from_umemd = qemu_fopen_pipe(state.from_umemd_fd); > + state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd); > + qemu_set_fd_handler(state.from_umemd_fd, > + postcopy_incoming_qemu_handle_req, NULL, NULL); > +} > + > +static void postcopy_incoming_qemu_cleanup_umem(void) > +{ > + /* when qemu will quit before completing postcopy, tell umem daemon > + to tear down umem device and exit. */ > + if (state.to_umemd_fd >= 0) { > + postcopy_incoming_qemu_queue_quit(); > + postcopy_incoming_qemu_fflush_to_umemd(); > + } > + > + if (state.dev) { > + umem_dev_destroy(state.dev); > + state.dev = NULL; > + } > +} > + > +void postcopy_incoming_qemu_cleanup(void) > +{ > + postcopy_incoming_qemu_cleanup_umem(); > + if (state.to_umemd != NULL) { > + nonblock_wait_for_flush(state.to_umemd); > + } > +} > + > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size) > +{ > + uint64_t nr = DIV_ROUND_UP(size, state.host_page_size); > + size_t len = umem_pages_size(nr); > + ram_addr_t end = addr + size; > + struct umem_pages *pages; > + int i; > + > + if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) { > + return; > + } > + pages = g_malloc(len); > + pages->nr = nr; > + for (i = 0; addr < end; addr += state.host_page_size, i++) { > + pages->pgoffs[i] = addr >> state.host_page_shift; > + } > + umem_qemu_send_pages_unmapped(state.to_umemd->file, pages); > + g_free(pages); > + assert(state.to_umemd != NULL); > + postcopy_incoming_qemu_fflush_to_umemd(); > +} > + > +/************************************************************************** > + * incoming umem daemon > + */ > + > +static void postcopy_incoming_umem_recv_quit(void) > +{ > + if (umemd.state & UMEM_STATE_QUIT_RECEIVED) { > + return; > + } > + 
DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n"); > + umemd.state |= UMEM_STATE_QUIT_RECEIVED; > + qemu_fclose(umemd.from_qemu); > + umemd.from_qemu = NULL; > + fd_close(&umemd.from_qemu_fd); > +} > + > +static void postcopy_incoming_umem_queue_quit(void) > +{ > + if (umemd.state & UMEM_STATE_QUIT_QUEUED) { > + return; > + } > + DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n"); > + umem_daemon_quit(umemd.to_qemu->file); > + umemd.state |= UMEM_STATE_QUIT_QUEUED; > +} > + > +static void postcopy_incoming_umem_send_eoc_req(void) > +{ > + struct qemu_umem_req req; > + > + if (umemd.state & UMEM_STATE_EOC_SENT) { > + return; > + } > + > + DPRINTF("|= UMEM_STATE_EOC_SENT\n"); > + req.cmd = QEMU_UMEM_REQ_EOC; > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + umemd.state |= UMEM_STATE_EOC_SENT; > + qemu_fclose(umemd.mig_write->file); > + umemd.mig_write = NULL; > + fd_close(&umemd.mig_write_fd); > +} > + > +static void postcopy_incoming_umem_send_page_req(RAMBlock *block) > +{ > + struct qemu_umem_req req; > + int bit; > + uint64_t target_pgoff; > + int i; > + > + umemd.page_request.nr = MAX_REQUESTS; > + umem_get_page_request(block->umem, &umemd.page_request); > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", > + block->idstr, umemd.page_request.nr, > + (uint64_t)umemd.page_request.pgoffs[0], > + (uint64_t)umemd.page_request.pgoffs[1]); > + > + if (umemd.last_block_write != block) { > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND; > + req.idstr = block->idstr; > + } else { > + req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT; > + } > + > + req.nr = 0; > + req.pgoffs = umemd.target_pgoffs; > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { > + for (i = 0; i < umemd.page_request.nr; i++) { > + target_pgoff = > + umemd.page_request.pgoffs[i] >> umemd.host_to_target_page_shift; > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; > + > + if (!test_and_set_bit(bit, umemd.phys_requested)) { > + req.pgoffs[req.nr] = target_pgoff; > + req.nr++; > + } > + } > + } else { > + for (i = 
0; i < umemd.page_request.nr; i++) { > + int j; > + target_pgoff = > + umemd.page_request.pgoffs[i] << umemd.host_to_target_page_shift; > + bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff; > + > + for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) { > + if (!test_and_set_bit(bit + j, umemd.phys_requested)) { > + req.pgoffs[req.nr] = target_pgoff + j; > + req.nr++; > + } > + } > + } > + } > + > + DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n", > + block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]); > + if (req.nr > 0 && umemd.mig_write != NULL) { > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + umemd.last_block_write = block; > + } > +} > + > +static void postcopy_incoming_umem_send_pages_present(void) > +{ > + if (umemd.present_request->nr > 0) { > + umem_daemon_send_pages_present(umemd.to_qemu->file, > + umemd.present_request); > + umemd.present_request->nr = 0; > + } > +} > + > +static void postcopy_incoming_umem_pages_present_one( > + uint32_t nr, const __u64 *pgoffs, uint64_t ramblock_pgoffset) > +{ > + uint32_t i; > + assert(nr <= MAX_PRESENT_REQUESTS); > + > + if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) { > + postcopy_incoming_umem_send_pages_present(); > + } > + > + for (i = 0; i < nr; i++) { > + umemd.present_request->pgoffs[umemd.present_request->nr + i] = > + pgoffs[i] + ramblock_pgoffset; > + } > + umemd.present_request->nr += nr; > +} > + > +static void postcopy_incoming_umem_pages_present( > + const struct umem_page_cached *page_cached, uint64_t ramblock_pgoffset) > +{ > + uint32_t left = page_cached->nr; > + uint32_t offset = 0; > + > + while (left > 0) { > + uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS); > + postcopy_incoming_umem_pages_present_one( > + nr, &page_cached->pgoffs[offset], ramblock_pgoffset); > + > + left -= nr; > + offset += nr; > + } > +} > + > +static int postcopy_incoming_umem_ram_load(void) > +{ > + ram_addr_t offset; > + int flags; > + int error; > + void *shmem; > + int i; > + 
int bit; > + > + if (umemd.version_id != RAM_SAVE_VERSION_ID) { > + return -EINVAL; > + } > + > + offset = qemu_get_be64(umemd.mig_read); > + > + flags = offset & ~TARGET_PAGE_MASK; > + offset &= TARGET_PAGE_MASK; > + > + assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE)); > + > + if (flags & RAM_SAVE_FLAG_EOS) { > + DPRINTF("RAM_SAVE_FLAG_EOS\n"); > + postcopy_incoming_umem_send_eoc_req(); > + > + qemu_fclose(umemd.mig_read); > + umemd.mig_read = NULL; > + fd_close(&umemd.mig_read_fd); > + umemd.state |= UMEM_STATE_EOS_RECEIVED; > + > + postcopy_incoming_umem_queue_quit(); > + DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n"); > + return 0; > + } > + > + shmem = ram_load_host_from_stream_offset(umemd.mig_read, offset, flags, > + &umemd.last_block_read); > + if (!shmem) { > + DPRINTF("shmem == NULL\n"); > + return -EINVAL; > + } > + > + if (flags & RAM_SAVE_FLAG_COMPRESS) { > + uint8_t ch = qemu_get_byte(umemd.mig_read); > + memset(shmem, ch, TARGET_PAGE_SIZE); > + } else if (flags & RAM_SAVE_FLAG_PAGE) { > + qemu_get_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE); > + } > + > + error = qemu_file_get_error(umemd.mig_read); > + if (error) { > + DPRINTF("error %d\n", error); > + return error; > + } > + > + umemd.page_cached.nr = 0; > + bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS; > + if (!test_and_set_bit(bit, umemd.phys_received)) { > + if (TARGET_PAGE_SIZE >= umemd.host_page_size) { > + __u64 pgoff = offset >> umemd.host_page_shift; > + for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) { > + umemd.page_cached.pgoffs[umemd.page_cached.nr] = pgoff + i; > + umemd.page_cached.nr++; > + } > + } else { > + bool mark_cache = true; > + for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) { > + if (!test_bit(bit + i, umemd.phys_received)) { > + mark_cache = false; > + break; > + } > + } > + if (mark_cache) { > + umemd.page_cached.pgoffs[0] = offset >> umemd.host_page_shift; > + umemd.page_cached.nr = 1; > + } > + } > + } > + > + if (umemd.page_cached.nr > 
0) { > + umem_mark_page_cached(umemd.last_block_read->umem, &umemd.page_cached); > + > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 && > + (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) { > + uint64_t ramblock_pgoffset; > + > + ramblock_pgoffset = > + umemd.last_block_read->offset >> umemd.host_page_shift; > + postcopy_incoming_umem_pages_present(&umemd.page_cached, > + ramblock_pgoffset); > + } > + } > + > + return 0; > +} > + > +static bool postcopy_incoming_umem_check_umem_done(void) > +{ > + bool all_done = true; > + RAMBlock *block; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + UMem *umem = block->umem; > + if (umem != NULL && umem->nsets == umem->nbits) { > + umem_unmap_shmem(umem); > + umem_destroy(umem); > + block->umem = NULL; > + } > + if (block->umem != NULL) { > + all_done = false; > + } > + } > + return all_done; > +} > + > +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages *pages) > +{ > + int i; > + > + for (i = 0; i < pages->nr; i++) { > + ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift; > + RAMBlock *block = qemu_get_ram_block(addr); > + addr -= block->offset; > + umem_remove_shmem(block->umem, addr, umemd.host_page_size); > + } > + return postcopy_incoming_umem_check_umem_done(); > +} > + > +static bool > +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages) > +{ > + RAMBlock *block; > + ram_addr_t addr; > + int i; > + > + struct qemu_umem_req req = { > + .cmd = QEMU_UMEM_REQ_REMOVE, > + .nr = 0, > + .pgoffs = (uint64_t*)pages->pgoffs, > + }; > + > + addr = pages->pgoffs[0] << umemd.host_page_shift; > + block = qemu_get_ram_block(addr); > + > + for (i = 0; i < pages->nr; i++) { > + int pgoff; > + > + addr = pages->pgoffs[i] << umemd.host_page_shift; > + pgoff = addr >> TARGET_PAGE_BITS; > + if (!test_bit(pgoff, umemd.phys_received) && > + !test_bit(pgoff, umemd.phys_requested)) { > + req.pgoffs[req.nr] = pgoff; > + req.nr++; > + } > + set_bit(pgoff, 
umemd.phys_received); > + set_bit(pgoff, umemd.phys_requested); > + > + umem_remove_shmem(block->umem, > + addr - block->offset, umemd.host_page_size); > + } > + if (req.nr > 0 && umemd.mig_write != NULL) { > + req.idstr = block->idstr; > + postcopy_incoming_send_req(umemd.mig_write->file, &req); > + } > + > + return postcopy_incoming_umem_check_umem_done(); > +} > + > +static void postcopy_incoming_umem_done(void) > +{ > + postcopy_incoming_umem_send_eoc_req(); > + postcopy_incoming_umem_queue_quit(); > +} > + > +static int postcopy_incoming_umem_handle_qemu(void) > +{ > + int ret; > + int offset = 0; > + uint8_t cmd; > + > + ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset); > + offset += sizeof(cmd); > + if (ret != sizeof(cmd)) { > + return -EAGAIN; > + } > + DPRINTF("cmd %c\n", cmd); > + switch (cmd) { > + case UMEM_QEMU_QUIT: > + postcopy_incoming_umem_recv_quit(); > + postcopy_incoming_umem_done(); > + break; > + case UMEM_QEMU_PAGE_FAULTED: { > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, > + &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (postcopy_incoming_umem_page_faulted(pages)){ > + postcopy_incoming_umem_done(); > + } > + g_free(pages); > + break; > + } > + case UMEM_QEMU_PAGE_UNMAPPED: { > + struct umem_pages *pages = umem_recv_pages(umemd.from_qemu, > + &offset); > + if (pages == NULL) { > + return -EAGAIN; > + } > + if (postcopy_incoming_umem_page_unmapped(pages)){ > + postcopy_incoming_umem_done(); > + } > + g_free(pages); > + break; > + } > + default: > + abort(); > + break; > + } > + if (umemd.from_qemu != NULL) { > + qemu_file_skip(umemd.from_qemu, offset); > + } > + return 0; > +} > + > +static void set_fd(int fd, fd_set *fds, int *nfds) > +{ > + FD_SET(fd, fds); > + if (fd > *nfds) { > + *nfds = fd; > + } > +} > + > +static int postcopy_incoming_umemd_main_loop(void) > +{ > + fd_set writefds; > + fd_set readfds; > + int nfds; > + RAMBlock *block; > + int ret; > + > + int pending_size; > + 
bool get_page_request; > + > + nfds = -1; > + FD_ZERO(&writefds); > + FD_ZERO(&readfds); > + > + if (umemd.mig_write != NULL) { > + pending_size = nonblock_pending_size(umemd.mig_write); > + if (pending_size > 0) { > + set_fd(umemd.mig_write_fd, &writefds, &nfds); > + } > + } else { > + pending_size = 0; > + } > + > +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2) > + /* If page request to the migration source is accumulated, > + suspend getting page fault request. */ > + get_page_request = (pending_size <= PENDING_SIZE_MAX); > + > + if (get_page_request) { > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL) { > + set_fd(block->umem->fd, &readfds, &nfds); > + } > + } > + } > + > + if (umemd.mig_read_fd >= 0) { > + set_fd(umemd.mig_read_fd, &readfds, &nfds); > + } > + > + if (umemd.to_qemu != NULL && > + nonblock_pending_size(umemd.to_qemu) > 0) { > + set_fd(umemd.to_qemu_fd, &writefds, &nfds); > + } > + if (umemd.from_qemu_fd >= 0) { > + set_fd(umemd.from_qemu_fd, &readfds, &nfds); > + } > + > + ret = select(nfds + 1, &readfds, &writefds, NULL, NULL); > + if (ret == -1) { > + if (errno == EINTR) { > + return 0; > + } > + return ret; > + } > + > + if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) { > + nonblock_fflush(umemd.mig_write); > + } > + if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) { > + nonblock_fflush(umemd.to_qemu); > + } > + if (get_page_request) { > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) { > + postcopy_incoming_umem_send_page_req(block); > + } > + } > + } > + if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) { > + do { > + ret = postcopy_incoming_umem_ram_load(); > + if (ret < 0) { > + return ret; > + } > + } while (umemd.mig_read != NULL && > + qemu_pending_size(umemd.mig_read) > 0); > + } > + if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) { > + 
do { > + ret = postcopy_incoming_umem_handle_qemu(); > + if (ret == -EAGAIN) { > + break; > + } > + } while (umemd.from_qemu != NULL && > + qemu_pending_size(umemd.from_qemu) > 0); > + } > + > + if (umemd.mig_write != NULL) { > + nonblock_fflush(umemd.mig_write); > + } > + if (umemd.to_qemu != NULL) { > + if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) { > + postcopy_incoming_umem_send_pages_present(); > + } > + nonblock_fflush(umemd.to_qemu); > + if ((umemd.state & UMEM_STATE_QUIT_QUEUED) && > + nonblock_pending_size(umemd.to_qemu) == 0) { > + DPRINTF("|= UMEM_STATE_QUIT_SENT\n"); > + qemu_fclose(umemd.to_qemu->file); > + umemd.to_qemu = NULL; > + fd_close(&umemd.to_qemu_fd); > + umemd.state |= UMEM_STATE_QUIT_SENT; > + } > + } > + > + return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK; > +} > + > +static void postcopy_incoming_umemd(void) > +{ > + ram_addr_t last_ram_offset; > + int nbits; > + RAMBlock *block; > + int ret; > + > + qemu_daemon(1, 1); > + signal(SIGPIPE, SIG_IGN); > + DPRINTF("daemon pid: %d\n", getpid()); > + > + umemd.page_request.pgoffs = g_new(__u64, MAX_REQUESTS); > + umemd.page_cached.pgoffs = > + g_new(__u64, MAX_REQUESTS * > + (TARGET_PAGE_SIZE >= umemd.host_page_size ? 
> + 1: umemd.nr_host_pages_per_target_page)); > + umemd.target_pgoffs = > + g_new(uint64_t, MAX_REQUESTS * > + MAX(umemd.nr_host_pages_per_target_page, > + umemd.nr_target_pages_per_host_page)); > + umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS)); > + umemd.present_request->nr = 0; > + > + last_ram_offset = qemu_last_ram_offset(); > + nbits = last_ram_offset >> TARGET_PAGE_BITS; > + umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits)); > + umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits)); > + umemd.last_block_read = NULL; > + umemd.last_block_write = NULL; > + > + QLIST_FOREACH(block, &ram_list.blocks, next) { > + UMem *umem = block->umem; > + umem->umem = NULL; /* umem mapping area has VM_DONT_COPY flag, > + so we lost those mappings by fork */ > + block->host = umem_map_shmem(umem); > + umem_close_shmem(umem); > + } > + umem_daemon_ready(umemd.to_qemu_fd); > + umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd); > + > + /* wait for qemu to disown migration_fd */ > + umem_daemon_wait_for_qemu(umemd.from_qemu_fd); > + umemd.from_qemu = qemu_fopen_pipe(umemd.from_qemu_fd); > + > + DPRINTF("entering umemd main loop\n"); > + for (;;) { > + ret = postcopy_incoming_umemd_main_loop(); > + if (ret != 0) { > + break; > + } > + } > + DPRINTF("exiting umemd main loop\n"); > + > + /* This daemon forked from qemu and the parent qemu is still running. > + * Cleanups of linked libraries like SDL should not be triggered, > + * otherwise the parent qemu may use resources which was already freed. > + */ > + fflush(stdout); > + fflush(stderr); > + _exit(ret < 0? 
EXIT_FAILURE: 0); > +} > diff --git a/migration-tcp.c b/migration-tcp.c > index cf6a9b8..aa35050 100644 > --- a/migration-tcp.c > +++ b/migration-tcp.c > @@ -63,18 +63,25 @@ static void tcp_wait_for_connect(void *opaque) > } while (ret == -1 && (socket_error()) == EINTR); > > if (ret < 0) { > - migrate_fd_error(s); > - return; > + goto error_out; > } > > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > > - if (val == 0) > + if (val == 0) { > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > + } > migrate_fd_connect(s); > - else { > + } else { > DPRINTF("error connecting %d\n", val); > - migrate_fd_error(s); > + goto error_out; > } > + return; > + > +error_out: > + migrate_fd_error(s); > } > > int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) > @@ -112,11 +119,19 @@ int tcp_start_outgoing_migration(MigrationState *s, const char *host_port) > > if (ret < 0) { > DPRINTF("connect failed\n"); > - migrate_fd_error(s); > - return ret; > + goto error_out; > + } > + > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > } > migrate_fd_connect(s); > return 0; > + > +error_out: > + migrate_fd_error(s); > + return ret; > } > > static void tcp_accept_incoming_migration(void *opaque) > @@ -145,7 +160,15 @@ static void tcp_accept_incoming_migration(void *opaque) > } > > process_incoming_migration(f); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(c, f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + /* now socket is disowned. 
> + So tell umem server that it's safe to use it */ > + postcopy_incoming_qemu_ready(); > + } > out: > close(c); > out2: > diff --git a/migration-unix.c b/migration-unix.c > index dfcf203..3707505 100644 > --- a/migration-unix.c > +++ b/migration-unix.c > @@ -69,12 +69,20 @@ static void unix_wait_for_connect(void *opaque) > > qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > > - if (val == 0) > + if (val == 0) { > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > + } > migrate_fd_connect(s); > - else { > + } else { > DPRINTF("error connecting %d\n", val); > - migrate_fd_error(s); > + goto error_out; > } > + return; > + > +error_out: > + migrate_fd_error(s); > } > > int unix_start_outgoing_migration(MigrationState *s, const char *path) > @@ -109,11 +117,19 @@ int unix_start_outgoing_migration(MigrationState *s, const char *path) > > if (ret < 0) { > DPRINTF("connect failed\n"); > - migrate_fd_error(s); > - return ret; > + goto error_out; > + } > + > + ret = postcopy_outgoing_create_read_socket(s); > + if (ret < 0) { > + goto error_out; > } > migrate_fd_connect(s); > return 0; > + > +error_out: > + migrate_fd_error(s); > + return ret; > } > > static void unix_accept_incoming_migration(void *opaque) > @@ -142,7 +158,13 @@ static void unix_accept_incoming_migration(void *opaque) > } > > process_incoming_migration(f); > + if (incoming_postcopy) { > + postcopy_incoming_fork_umemd(c, f); > + } > qemu_fclose(f); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_ready(); > + } > out: > close(c); > out2: > diff --git a/migration.c b/migration.c > index 0149ab3..51efe44 100644 > --- a/migration.c > +++ b/migration.c > @@ -39,6 +39,11 @@ enum { > MIG_STATE_COMPLETED, > }; > > +enum { > + MIG_SUBSTATE_PRECOPY, > + MIG_SUBSTATE_POSTCOPY, > +}; > + > #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ > > static NotifierList migration_state_notifiers = > @@ -255,6 +260,18 @@ static void migrate_fd_put_ready(void 
*opaque) > return; > } > > + if (s->substate == MIG_SUBSTATE_POSTCOPY) { > + /* PRINTF("postcopy background\n"); */ > + ret = postcopy_outgoing_ram_save_background(s->mon, s->file, > + s->postcopy); > + if (ret > 0) { > + migrate_fd_completed(s); > + } else if (ret < 0) { > + migrate_fd_error(s); > + } > + return; > + } > + > DPRINTF("iterate\n"); > ret = qemu_savevm_state_iterate(s->mon, s->file); > if (ret < 0) { > @@ -265,6 +282,19 @@ static void migrate_fd_put_ready(void *opaque) > DPRINTF("done iterating\n"); > vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); > > + if (s->params.postcopy) { > + if (qemu_savevm_state_complete(s->mon, s->file) < 0) { > + migrate_fd_error(s); > + if (old_vm_running) { > + vm_start(); > + } > + return; > + } > + s->substate = MIG_SUBSTATE_POSTCOPY; > + s->postcopy = postcopy_outgoing_begin(s); > + return; > + } > + > if (qemu_savevm_state_complete(s->mon, s->file) < 0) { > migrate_fd_error(s); > } else { > @@ -357,6 +387,7 @@ void migrate_fd_connect(MigrationState *s) > int ret; > > s->state = MIG_STATE_ACTIVE; > + s->substate = MIG_SUBSTATE_PRECOPY; > s->file = qemu_fopen_ops_buffered(s, > s->bandwidth_limit, > migrate_fd_put_buffer, > diff --git a/migration.h b/migration.h > index 90ae362..2809e99 100644 > --- a/migration.h > +++ b/migration.h > @@ -40,6 +40,12 @@ struct MigrationState > int (*write)(MigrationState *s, const void *buff, size_t size); > void *opaque; > MigrationParams params; > + > + /* for postcopy */ > + int substate; /* precopy or postcopy */ > + int fd_read; > + QEMUFile *file_read; /* connection from the detination */ > + void *postcopy; > }; > > void process_incoming_migration(QEMUFile *f); > @@ -86,6 +92,7 @@ uint64_t ram_bytes_remaining(void); > uint64_t ram_bytes_transferred(void); > uint64_t ram_bytes_total(void); > > +void ram_save_set_params(const MigrationParams *params, void *opaque); > void sort_ram_list(void); > int ram_save_block(QEMUFile *f); > void ram_save_memory_set_dirty(void); > @@ -107,7 
+114,30 @@ void migrate_add_blocker(Error *reason); > */ > void migrate_del_blocker(Error *reason); > > +/* For outgoing postcopy */ > +int postcopy_outgoing_create_read_socket(MigrationState *s); > +int postcopy_outgoing_ram_save_live(Monitor *mon, > + QEMUFile *f, int stage, void *opaque); > +void *postcopy_outgoing_begin(MigrationState *s); > +int postcopy_outgoing_ram_save_background(Monitor *mon, QEMUFile *f, > + void *postcopy); > + > +/* For incoming postcopy */ > extern bool incoming_postcopy; > extern unsigned long incoming_postcopy_flags; > > +int postcopy_incoming_init(const char *incoming, bool incoming_postcopy); > +void postcopy_incoming_ram_alloc(const char *name, > + size_t size, uint8_t **hostp, UMem **umemp); > +void postcopy_incoming_ram_free(UMem *umem); > +void postcopy_incoming_prepare(void); > + > +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id); > +void postcopy_incoming_fork_umemd(int mig_read_fd, QEMUFile *mig_read); > +void postcopy_incoming_qemu_ready(void); > +void postcopy_incoming_qemu_cleanup(void); > +#ifdef NEED_CPU_H > +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size); > +#endif > + > #endif > diff --git a/qemu-common.h b/qemu-common.h > index 725922b..d74a8c9 100644 > --- a/qemu-common.h > +++ b/qemu-common.h > @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState; > > struct Monitor; > typedef struct Monitor Monitor; > +typedef struct UMem UMem; > > /* we put basic includes here to avoid repeating them in device drivers */ > #include > diff --git a/qemu-options.hx b/qemu-options.hx > index 5c5b8f3..19e20f9 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -2510,7 +2510,10 @@ DEF("postcopy-flags", HAS_ARG, QEMU_OPTION_postcopy_flags, > "-postcopy-flags unsigned-int(flags)\n" > " flags for postcopy incoming migration\n" > " when -incoming and -postcopy are specified.\n" > - " This is for benchmark/debug purpose (default: 0)\n", > + " This is for benchmark/debug 
purpose (default: 0)\n" > + " Currently supprted flags are\n" > + " 1: enable fault request from umemd to qemu\n" > + " (default: disabled)\n", > QEMU_ARCH_ALL) > STEXI > @item -postcopy-flags int Can you move umem.c and umem.h to a separate patch, please? This patch is too long. > diff --git a/umem.c b/umem.c > new file mode 100644 > index 0000000..b7be006 > --- /dev/null > +++ b/umem.c > @@ -0,0 +1,379 @@ > +/* > + * umem.c: user process backed memory module for postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . > + */ > + > +#include > +#include > + > +#include > + > +#include "bitops.h" > +#include "sysemu.h" > +#include "hw/hw.h" > +#include "umem.h" > + > +//#define DEBUG_UMEM > +#ifdef DEBUG_UMEM > +#include > +#define DPRINTF(format, ...) \ > + do { \ > + printf("%d:%ld %s:%d "format, getpid(), syscall(SYS_gettid), \ > + __func__, __LINE__, ## __VA_ARGS__); \ > + } while (0) > +#else > +#define DPRINTF(format, ...)
do { } while (0) > +#endif > + > +#define DEV_UMEM "/dev/umem" > + > +struct UMemDev { > + int fd; > + int page_shift; > +}; > + > +UMemDev *umem_dev_new(void) > +{ > + UMemDev *umem_dev; > + int umem_dev_fd = open(DEV_UMEM, O_RDWR); > + if (umem_dev_fd < 0) { > + perror("can't open "DEV_UMEM); > + abort(); > + } > + > + umem_dev = g_new(UMemDev, 1); > + umem_dev->fd = umem_dev_fd; > + umem_dev->page_shift = ffs(getpagesize()) - 1; > + return umem_dev; > +} > + > +void umem_dev_destroy(UMemDev *dev) > +{ > + close(dev->fd); > + g_free(dev); > +} > + > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name) > +{ > + struct umem_create create = { > + .size = size, > + .async_req_max = 0, > + .sync_req_max = 0, > + }; > + UMem *umem; > + > + snprintf(create.name.id, sizeof(create.name.id), > + "pid-%"PRId64, (uint64_t)getpid()); > + create.name.id[UMEM_ID_MAX - 1] = 0; > + strncpy(create.name.name, name, sizeof(create.name.name)); > + create.name.name[UMEM_NAME_MAX - 1] = 0; > + > + assert((size % getpagesize()) == 0); > + if (ioctl(dev->fd, UMEM_DEV_CREATE_UMEM, &create) < 0) { > + perror("UMEM_DEV_CREATE_UMEM"); > + abort(); > + } > + if (ftruncate(create.shmem_fd, create.size) < 0) { > + perror("truncate(\"shmem_fd\")"); > + abort(); > + } > + > + umem = g_new(UMem, 1); > + umem->nbits = 0; > + umem->nsets = 0; > + umem->faulted = NULL; > + umem->page_shift = dev->page_shift; > + umem->fd = create.umem_fd; > + umem->shmem_fd = create.shmem_fd; > + umem->size = create.size; > + umem->umem = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (umem->umem == MAP_FAILED) { > + perror("mmap(UMem) failed"); > + abort(); > + } > + return umem; > +} > + > +void umem_mmap(UMem *umem) > +{ > + void *ret = mmap(umem->umem, umem->size, > + PROT_EXEC | PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_FIXED, umem->fd, 0); > + if (ret == MAP_FAILED) { > + perror("umem_mmap(UMem) failed"); > + abort(); > + } > +} > + > 
+void umem_destroy(UMem *umem) > +{ > + if (umem->fd != -1) { > + close(umem->fd); > + } > + if (umem->shmem_fd != -1) { > + close(umem->shmem_fd); > + } > + g_free(umem->faulted); > + g_free(umem); > +} > + > +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request) > +{ > + if (ioctl(umem->fd, UMEM_GET_PAGE_REQUEST, page_request)) { > + perror("daemon: UMEM_GET_PAGE_REQUEST"); > + abort(); > + } > +} > + > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached) > +{ > + if (ioctl(umem->fd, UMEM_MARK_PAGE_CACHED, page_cached)) { > + perror("daemon: UMEM_MARK_PAGE_CACHED"); > + abort(); > + } > +} > + > +void umem_unmap(UMem *umem) > +{ > + munmap(umem->umem, umem->size); > + umem->umem = NULL; > +} > + > +void umem_close(UMem *umem) > +{ > + close(umem->fd); > + umem->fd = -1; > +} > + > +void *umem_map_shmem(UMem *umem) > +{ > + umem->nbits = umem->size >> umem->page_shift; > + umem->nsets = 0; > + umem->faulted = g_new0(unsigned long, BITS_TO_LONGS(umem->nbits)); > + > + umem->shmem = mmap(NULL, umem->size, PROT_READ | PROT_WRITE, MAP_SHARED, > + umem->shmem_fd, 0); > + if (umem->shmem == MAP_FAILED) { > + perror("daemon: mmap(\"shmem\")"); > + abort(); > + } > + return umem->shmem; > +} > + > +void umem_unmap_shmem(UMem *umem) > +{ > + munmap(umem->shmem, umem->size); > + umem->shmem = NULL; > +} > + > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size) > +{ > + int s = offset >> umem->page_shift; > + int e = (offset + size) >> umem->page_shift; > + int i; > + > + for (i = s; i < e; i++) { > + if (!test_and_set_bit(i, umem->faulted)) { > + umem->nsets++; > +#if defined(CONFIG_MADVISE) && defined(MADV_REMOVE) > + madvise(umem->shmem + offset, size, MADV_REMOVE); > +#endif > + } > + } > +} > + > +void umem_close_shmem(UMem *umem) > +{ > + close(umem->shmem_fd); > + umem->shmem_fd = -1; > +} > + > +/***************************************************************************/ > +/* qemu <-> umem daemon 
communication */ > + > +size_t umem_pages_size(uint64_t nr) > +{ > + return sizeof(struct umem_pages) + nr * sizeof(uint64_t); > +} > + > +static void umem_write_cmd(int fd, uint8_t cmd) > +{ > + DPRINTF("write cmd %c\n", cmd); > + > + for (;;) { > + ssize_t ret = write(fd, &cmd, 1); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } else if (errno == EPIPE) { > + perror("pipe"); > + DPRINTF("write cmd %c %zd %d: pipe is closed\n", > + cmd, ret, errno); > + break; > + } > + > + perror("pipe"); > + DPRINTF("write cmd %c %zd %d\n", cmd, ret, errno); > + abort(); > + } > + > + break; > + } > +} > + > +static void umem_read_cmd(int fd, uint8_t expect) > +{ > + uint8_t cmd; > + for (;;) { > + ssize_t ret = read(fd, &cmd, 1); > + if (ret == -1) { > + if (errno == EINTR) { > + continue; > + } > + perror("pipe"); > + DPRINTF("read error cmd %c %zd %d\n", cmd, ret, errno); > + abort(); > + } > + > + if (ret == 0) { > + DPRINTF("read cmd %c %zd: pipe is closed\n", cmd, ret); > + abort(); > + } > + > + break; > + } > + > + DPRINTF("read cmd %c\n", cmd); > + if (cmd != expect) { > + DPRINTF("cmd %c expect %d\n", cmd, expect); > + abort(); > + } > +} > + > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset) > +{ > + int ret; > + uint64_t nr; > + size_t size; > + struct umem_pages *pages; > + > + ret = qemu_peek_buffer(f, (uint8_t*)&nr, sizeof(nr), *offset); > + *offset += sizeof(nr); > + DPRINTF("ret %d nr %ld\n", ret, nr); > + if (ret != sizeof(nr) || nr == 0) { > + return NULL; > + } > + > + size = umem_pages_size(nr); > + pages = g_malloc(size); > + pages->nr = nr; > + size -= sizeof(pages->nr); > + > + ret = qemu_peek_buffer(f, (uint8_t*)pages->pgoffs, size, *offset); > + *offset += size; > + if (ret != size) { > + g_free(pages); > + return NULL; > + } > + return pages; > +} > + > +static void umem_send_pages(QEMUFile *f, const struct umem_pages *pages) > +{ > + size_t len = umem_pages_size(pages->nr); > + qemu_put_buffer(f, (const uint8_t*)pages, 
len); > +} > + > +/* umem daemon -> qemu */ > +void umem_daemon_ready(int to_qemu_fd) > +{ > + umem_write_cmd(to_qemu_fd, UMEM_DAEMON_READY); > +} > + > +void umem_daemon_quit(QEMUFile *to_qemu) > +{ > + qemu_put_byte(to_qemu, UMEM_DAEMON_QUIT); > +} > + > +void umem_daemon_send_pages_present(QEMUFile *to_qemu, > + struct umem_pages *pages) > +{ > + qemu_put_byte(to_qemu, UMEM_DAEMON_TRIGGER_PAGE_FAULT); > + umem_send_pages(to_qemu, pages); > +} > + > +void umem_daemon_wait_for_qemu(int from_qemu_fd) > +{ > + umem_read_cmd(from_qemu_fd, UMEM_QEMU_READY); > +} > + > +/* qemu -> umem daemon */ > +void umem_qemu_wait_for_daemon(int from_umemd_fd) > +{ > + umem_read_cmd(from_umemd_fd, UMEM_DAEMON_READY); > +} > + > +void umem_qemu_ready(int to_umemd_fd) > +{ > + umem_write_cmd(to_umemd_fd, UMEM_QEMU_READY); > +} > + > +void umem_qemu_quit(QEMUFile *to_umemd) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_QUIT); > +} > + > +/* qemu side handler */ > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd, > + int *offset) > +{ > + uint64_t i; > + int page_shift = ffs(getpagesize()) - 1; > + struct umem_pages *pages = umem_recv_pages(from_umemd, offset); > + if (pages == NULL) { > + return NULL; > + } > + > + for (i = 0; i < pages->nr; i++) { > + ram_addr_t addr = pages->pgoffs[i] << page_shift; > + > + /* make pages present by forcibly triggering page fault. 
*/ > + volatile uint8_t *ram = qemu_get_ram_ptr(addr); > + uint8_t dummy_read = ram[0]; > + (void)dummy_read; /* suppress unused variable warning */ > + } > + > + return pages; > +} > + > +void umem_qemu_send_pages_present(QEMUFile *to_umemd, > + const struct umem_pages *pages) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_FAULTED); > + umem_send_pages(to_umemd, pages); > +} > + > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd, > + const struct umem_pages *pages) > +{ > + qemu_put_byte(to_umemd, UMEM_QEMU_PAGE_UNMAPPED); > + umem_send_pages(to_umemd, pages); > +} > diff --git a/umem.h b/umem.h > new file mode 100644 > index 0000000..5ca19ef > --- /dev/null > +++ b/umem.h > @@ -0,0 +1,105 @@ > +/* > + * umem.h: user process backed memory module for postcopy livemigration > + * > + * Copyright (c) 2011 > + * National Institute of Advanced Industrial Science and Technology > + * > + * https://sites.google.com/site/grivonhome/quick-kvm-migration > + * Author: Isaku Yamahata > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, see . 
> + */ > + > +#ifndef QEMU_UMEM_H > +#define QEMU_UMEM_H > + > +#include > + > +#include "qemu-common.h" > + > +typedef struct UMemDev UMemDev; > + > +struct UMem { > + void *umem; > + int fd; > + void *shmem; > + int shmem_fd; > + uint64_t size; > + > + /* indexed by host page size */ > + int page_shift; > + int nbits; > + int nsets; > + unsigned long *faulted; > +}; > + > +UMemDev *umem_dev_new(void); > +void umem_dev_destroy(UMemDev *dev); > +UMem *umem_dev_create(UMemDev *dev, size_t size, const char *name); > +void umem_mmap(UMem *umem); > + > +void umem_destroy(UMem *umem); > + > +/* umem device operations */ > +void umem_get_page_request(UMem *umem, struct umem_page_request *page_request); > +void umem_mark_page_cached(UMem *umem, struct umem_page_cached *page_cached); > +void umem_unmap(UMem *umem); > +void umem_close(UMem *umem); > + > +/* umem shmem operations */ > +void *umem_map_shmem(UMem *umem); > +void umem_unmap_shmem(UMem *umem); > +void umem_remove_shmem(UMem *umem, size_t offset, size_t size); > +void umem_close_shmem(UMem *umem); > + > +/* qemu on source <-> umem daemon communication */ > + > +struct umem_pages { > + uint64_t nr; /* nr = 0 means completed */ > + uint64_t pgoffs[0]; > +}; > + > +/* daemon -> qemu */ > +#define UMEM_DAEMON_READY 'R' > +#define UMEM_DAEMON_QUIT 'Q' > +#define UMEM_DAEMON_TRIGGER_PAGE_FAULT 'T' > +#define UMEM_DAEMON_ERROR 'E' > + > +/* qemu -> daemon */ > +#define UMEM_QEMU_READY 'r' > +#define UMEM_QEMU_QUIT 'q' > +#define UMEM_QEMU_PAGE_FAULTED 't' > +#define UMEM_QEMU_PAGE_UNMAPPED 'u' > + > +struct umem_pages *umem_recv_pages(QEMUFile *f, int *offset); > +size_t umem_pages_size(uint64_t nr); > + > +/* for umem daemon */ > +void umem_daemon_ready(int to_qemu_fd); > +void umem_daemon_wait_for_qemu(int from_qemu_fd); > +void umem_daemon_quit(QEMUFile *to_qemu); > +void umem_daemon_send_pages_present(QEMUFile *to_qemu, > + struct umem_pages *pages); > + > +/* for qemu */ > +void umem_qemu_wait_for_daemon(int 
from_umemd_fd); > +void umem_qemu_ready(int to_umemd_fd); > +void umem_qemu_quit(QEMUFile *to_umemd); > +struct umem_pages *umem_qemu_trigger_page_fault(QEMUFile *from_umemd, > + int *offset); > +void umem_qemu_send_pages_present(QEMUFile *to_umemd, > + const struct umem_pages *pages); > +void umem_qemu_send_pages_unmapped(QEMUFile *to_umemd, > + const struct umem_pages *pages); > + > +#endif /* QEMU_UMEM_H */ > diff --git a/vl.c b/vl.c > index 5430b8c..17427a0 100644 > --- a/vl.c > +++ b/vl.c > @@ -3274,8 +3274,12 @@ int main(int argc, char **argv, char **envp) > default_drive(default_sdcard, snapshot, machine->use_scsi, > IF_SD, 0, SD_OPTS); > > - register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, NULL, > - ram_save_live, NULL, ram_load, NULL); > + if (postcopy_incoming_init(incoming, incoming_postcopy) < 0) { > + exit(1); > + } > + register_savevm_live(NULL, "ram", 0, RAM_SAVE_VERSION_ID, > + ram_save_set_params, ram_save_live, NULL, > + ram_load, NULL); > > if (nb_numa_nodes > 0) { > int i; > @@ -3471,6 +3475,9 @@ int main(int argc, char **argv, char **envp) > > if (incoming) { > runstate_set(RUN_STATE_INMIGRATE); > + if (incoming_postcopy) { > + postcopy_incoming_prepare(); > + } How about moving postcopy_incoming_prepare() into qemu_start_incoming_migration()? > int ret = qemu_start_incoming_migration(incoming); > if (ret < 0) { > fprintf(stderr, "Migration failed. Exit code %s(%d), exiting.\n", > @@ -3488,6 +3495,9 @@ int main(int argc, char **argv, char **envp) > bdrv_close_all(); > pause_all_vcpus(); > net_cleanup(); > + if (incoming_postcopy) { > + postcopy_incoming_qemu_cleanup(); > + } > res_free(); > > return 0; Orit