From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:43329) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YwoXq-00015l-2H for qemu-devel@nongnu.org; Mon, 25 May 2015 05:19:42 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1YwoXi-0003if-BH for qemu-devel@nongnu.org; Mon, 25 May 2015 05:19:35 -0400 Received: from szxga02-in.huawei.com ([119.145.14.65]:53418) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YwoXg-0003g6-Ih for qemu-devel@nongnu.org; Mon, 25 May 2015 05:19:30 -0400 Message-ID: <5562E8F4.4010302@huawei.com> Date: Mon, 25 May 2015 17:18:44 +0800 From: zhanghailiang MIME-Version: 1.0 References: <1429031053-4454-1-git-send-email-dgilbert@redhat.com> <1429031053-4454-43-git-send-email-dgilbert@redhat.com> In-Reply-To: <1429031053-4454-43-git-send-email-dgilbert@redhat.com> Content-Type: text/plain; charset="windows-1252"; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH v6 42/47] Postcopy; Handle userfault requests List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: "Dr. David Alan Gilbert (git)" , qemu-devel@nongnu.org Cc: aarcange@redhat.com, yamahata@private.email.ne.jp, quintela@redhat.com, peter.huangpeng@huawei.com, amit.shah@redhat.com, pbonzini@redhat.com, yayanghy@cn.fujitsu.com, david@gibson.dropbear.id.au On 2015/4/15 1:04, Dr. David Alan Gilbert (git) wrote: > From: "Dr. David Alan Gilbert" > > userfaultfd is a Linux syscall that gives an fd that receives a stream > of notifications of accesses to pages registered with it and allows > the program to acknowledge those stalls and tell the accessing > thread to carry on. > > Signed-off-by: Dr. David Alan Gilbert > --- > include/migration/migration.h | 4 + > migration/postcopy-ram.c | 165 +++++++++++++++++++++++++++++++++++++++--- > trace-events | 9 +++ > 3 files changed, 169 insertions(+), 9 deletions(-) > > diff --git a/include/migration/migration.h b/include/migration/migration.h > index db06fd2..4d6f33a 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -75,11 +75,15 @@ struct MigrationIncomingState { > */ > QemuEvent main_thread_load_event; > > + bool have_fault_thread; > QemuThread fault_thread; > QemuSemaphore fault_thread_sem; > > /* For the kernel to send us notifications */ > int userfault_fd; > + /* To tell the fault_thread to quit */ > + int userfault_quit_fd; > + > QEMUFile *return_path; > QemuMutex rp_mutex; /* We send replies from multiple threads */ > void *postcopy_tmp_page; > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 33aadbc..b2dc3b7 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -49,6 +49,8 @@ struct PostcopyDiscardState { > */ > #if defined(__linux__) > > +#include > +#include > #include > #include > #include > @@ -273,15 +275,41 @@ int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) > */ > int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) > { > - /* TODO: Join the fault thread once we're sure it will exit */ > - if (qemu_ram_foreach_block(cleanup_area, mis)) { > - return -1; > + trace_postcopy_ram_incoming_cleanup_entry(); > + > + if (mis->have_fault_thread) { > + uint64_t tmp64; > + > + if (qemu_ram_foreach_block(cleanup_area, mis)) { > + return -1; > + } > + /* > + * Tell the fault_thread to exit, it's an eventfd that should > + * currently be at 0, we're going to inc it to 1 > + */ > + tmp64 = 1; > + if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) { > + trace_postcopy_ram_incoming_cleanup_join(); > + qemu_thread_join(&mis->fault_thread); > + } else { > + /* Not much we can do here, but may as well report it */ > + error_report("%s: incing userfault_quit_fd: %s", __func__, > + strerror(errno)); > + } > + trace_postcopy_ram_incoming_cleanup_closeuf(); > + close(mis->userfault_fd); > + close(mis->userfault_quit_fd); > + mis->have_fault_thread = false; > } > > + postcopy_state_set(mis, POSTCOPY_INCOMING_END); > + migrate_send_rp_shut(mis, qemu_file_get_error(mis->file) != 0); > + > if (mis->postcopy_tmp_page) { > munmap(mis->postcopy_tmp_page, getpagesize()); > mis->postcopy_tmp_page = NULL; > } > + trace_postcopy_ram_incoming_cleanup_exit(); > return 0; > } > > @@ -320,31 +348,150 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, > static void *postcopy_ram_fault_thread(void *opaque) > { > MigrationIncomingState *mis = (MigrationIncomingState *)opaque; > - > - fprintf(stderr, "postcopy_ram_fault_thread\n"); > - /* TODO: In later patch */ > + uint64_t hostaddr; /* The kernel always gives us 64 bit, not a pointer */ > + int ret; > + size_t hostpagesize = getpagesize(); > + RAMBlock *rb = NULL; > + RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ > + uint8_t *local_tmp_page; > + > + trace_postcopy_ram_fault_thread_entry(); > qemu_sem_post(&mis->fault_thread_sem); > - while (1) { > - /* TODO: In later patch */ > + > + local_tmp_page = mmap(NULL, getpagesize(), > + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, > + -1, 0); > + if (!local_tmp_page) { > + error_report("%s mapping local tmp page: %s", __func__, > + strerror(errno)); > + return NULL; > } > + if (madvise(local_tmp_page, getpagesize(), MADV_DONTFORK)) { What's this 'local_tmp_page' used for ? I don't find where it is used in this function. Besides, there is a helper function qemu_madvise() in qemu, maybe you should use it :) Thanks. > + munmap(local_tmp_page, getpagesize()); > + error_report("%s postcopy local page DONTFORK: %s", __func__, > + strerror(errno)); > + return NULL; > + } > + > + while (true) { > + ram_addr_t rb_offset; > + ram_addr_t in_raspace; > + struct pollfd pfd[2]; > + > + /* > + * We're mainly waiting for the kernel to give us a faulting HVA, > + * however we can be told to quit via userfault_quit_fd which is > + * an eventfd > + */ > + pfd[0].fd = mis->userfault_fd; > + pfd[0].events = POLLIN; > + pfd[0].revents = 0; > + pfd[1].fd = mis->userfault_quit_fd; > + pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ > + pfd[1].revents = 0; > + > + if (poll(pfd, 2, -1 /* Wait forever */) == -1) { > + error_report("%s: userfault poll: %s", __func__, strerror(errno)); > + break; > + } > + > + if (pfd[1].revents) { > + trace_postcopy_ram_fault_thread_quit(); > + break; > + } > + > + ret = read(mis->userfault_fd, &hostaddr, sizeof(hostaddr)); > + if (ret != sizeof(hostaddr)) { > + if (errno == EAGAIN) { > + /* > + * if a wake up happens on the other thread just after > + * the poll, there is nothing to read. > + */ > + continue; > + } > + if (ret < 0) { > + error_report("%s: Failed to read full userfault hostaddr: %s", > + __func__, strerror(errno)); > + break; > + } else { > + error_report("%s: Read %d bytes from userfaultfd expected %zd", > + __func__, ret, sizeof(hostaddr)); > + break; /* Lost alignment, don't know what we'd read next */ > + } > + } > > + rb = qemu_ram_block_from_host((void *)(uintptr_t)hostaddr, true, > + &in_raspace, &rb_offset); > + if (!rb) { > + error_report("postcopy_ram_fault_thread: Fault outside guest: %" > + PRIx64, hostaddr); > + break; > + } > + > + trace_postcopy_ram_fault_thread_request(hostaddr, > + qemu_ram_get_idstr(rb), > + rb_offset); > + > + /* > + * Send the request to the source - we want to request one > + * of our host page sizes (which is >= TPS) > + */ > + if (rb != last_rb) { > + last_rb = rb; > + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > + rb_offset, hostpagesize); > + } else { > + /* Save some space */ > + migrate_send_rp_req_pages(mis, NULL, > + rb_offset, hostpagesize); > + } > + } > + munmap(local_tmp_page, getpagesize()); > + trace_postcopy_ram_fault_thread_exit(); > return NULL; > } > > int postcopy_ram_enable_notify(MigrationIncomingState *mis) > { > - /* Create the fault handler thread and wait for it to be ready */ > + /* Open the fd for the kernel to give us userfaults */ > + mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); > + if (mis->userfault_fd == -1) { > + error_report("%s: Failed to open userfault fd: %s", __func__, > + strerror(errno)); > + return -1; > + } > + > + /* > + * Although the host check already tested the API, we need to > + * do the check again as an ABI handshake on the new fd. > + */ > + if (!ufd_version_check(mis->userfault_fd)) { > + return -1; > + } > + > + /* Now an eventfd we use to tell the fault-thread to quit */ > + mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC); > + if (mis->userfault_quit_fd == -1) { > + error_report("%s: Opening userfault_quit_fd: %s", __func__, > + strerror(errno)); > + close(mis->userfault_fd); > + return -1; > + } > + > qemu_sem_init(&mis->fault_thread_sem, 0); > qemu_thread_create(&mis->fault_thread, "postcopy/fault", > postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE); > qemu_sem_wait(&mis->fault_thread_sem); > qemu_sem_destroy(&mis->fault_thread_sem); > + mis->have_fault_thread = true; > > /* Mark so that we get notified of accesses to unwritten areas */ > if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) { > return -1; > } > > + trace_postcopy_ram_enable_notify(); > + > return 0; > } > > diff --git a/trace-events b/trace-events > index b65740c..72a65fa 100644 > --- a/trace-events > +++ b/trace-events > @@ -1500,6 +1500,15 @@ postcopy_cleanup_area(const char *ramblock, void *host_addr, size_t offset, size > postcopy_ram_discard_range(void *start, void *end) "%p,%p" > postcopy_init_area(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx" > postcopy_place_page(void *host_addr, bool all_zero) "host=%p all_zero=%d" > +postcopy_ram_enable_notify(void) "" > +postcopy_ram_fault_thread_entry(void) "" > +postcopy_ram_fault_thread_exit(void) "" > +postcopy_ram_fault_thread_quit(void) "" > +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx" > +postcopy_ram_incoming_cleanup_closeuf(void) "" > +postcopy_ram_incoming_cleanup_entry(void) "" > +postcopy_ram_incoming_cleanup_exit(void) "" > +postcopy_ram_incoming_cleanup_join(void) "" > > # kvm-all.c > kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" >