From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:38753) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1UwGwk-0004Ua-1p for qemu-devel@nongnu.org; Mon, 08 Jul 2013 15:18:04 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1UwGwi-0007wa-3u for qemu-devel@nongnu.org; Mon, 08 Jul 2013 15:18:01 -0400 Received: from e39.co.us.ibm.com ([32.97.110.160]:44863) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1UwGwh-0007wV-Qg for qemu-devel@nongnu.org; Mon, 08 Jul 2013 15:18:00 -0400 Received: from /spool/local by e39.co.us.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 8 Jul 2013 12:45:26 -0600 From: Anthony Liguori In-Reply-To: <1372315560-5478-12-git-send-email-aik@ozlabs.ru> References: <1372315560-5478-1-git-send-email-aik@ozlabs.ru> <1372315560-5478-12-git-send-email-aik@ozlabs.ru> Date: Mon, 08 Jul 2013 13:45:05 -0500 Message-ID: <87mwpwzya6.fsf@codemonkey.ws> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Subject: Re: [Qemu-devel] [PATCH 11/17] pseries: savevm support for pseries machine List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Alexey Kardashevskiy , qemu-devel@nongnu.org Cc: Alexander Graf , qemu-ppc@nongnu.org, Paolo Bonzini , Paul Mackerras , David Gibson Alexey Kardashevskiy writes: > From: David Gibson > > This adds the necessary pieces to implement savevm / migration for the > pseries machine. The most complex part here is migrating the hash > table - for the paravirtualized pseries machine the guest's hash page > table is not stored within guest memory, but externally and the guest > accesses it via hypercalls. > > This patch uses a hypervisor reserved bit of the HPTE as a dirty bit > (tracking changes to the HPTE itself, not the page it references). > This is used to implement a live migration style incremental save and > restore of the hash table contents. > > In addition it adds VMStateDescription information to save and restore > the (few) remaining pieces of state information needed by the pseries > machine. > > Signed-off-by: David Gibson > Signed-off-by: Alexey Kardashevskiy I vaguely recall making the suggestion to use a live section like this. How large is the HTAB typically? Regards, Anthony Liguori > --- > hw/ppc/spapr.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++- > hw/ppc/spapr_hcall.c | 8 +- > include/hw/ppc/spapr.h | 12 ++- > 3 files changed, 281 insertions(+), 8 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index def3505..f989a22 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -32,6 +32,7 @@ > #include "sysemu/cpus.h" > #include "sysemu/kvm.h" > #include "kvm_ppc.h" > +#include "mmu-hash64.h" > > #include "hw/boards.h" > #include "hw/ppc/ppc.h" > @@ -667,7 +668,7 @@ static void spapr_cpu_reset(void *opaque) > > env->spr[SPR_HIOR] = 0; > > - env->external_htab = spapr->htab; > + env->external_htab = (uint8_t *)spapr->htab; > env->htab_base = -1; > env->htab_mask = HTAB_SIZE(spapr) - 1; > env->spr[SPR_SDR1] = (target_ulong)spapr->htab | > @@ -719,6 +720,268 @@ static int spapr_vga_init(PCIBus *pci_bus) > } > } > > +static const VMStateDescription vmstate_spapr = { > + .name = "spapr", > + .version_id = 1, > + .minimum_version_id = 1, > + .minimum_version_id_old = 1, > + .fields = (VMStateField []) { > + VMSTATE_UINT32(next_irq, sPAPREnvironment), > + > + /* RTC offset */ > + VMSTATE_UINT64(rtc_offset, sPAPREnvironment), > + > + VMSTATE_END_OF_LIST() > + }, > +}; > + > +#define HPTE(_table, _i) (void *)(((uint64_t *)(_table)) + ((_i) * 2)) > +#define HPTE_VALID(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID) > +#define HPTE_DIRTY(_hpte) (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY) > +#define CLEAN_HPTE(_hpte) ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY)) > + > +static int htab_save_setup(QEMUFile *f, void *opaque) > +{ > + sPAPREnvironment *spapr = opaque; > + > + spapr->htab_save_index = 0; > + spapr->htab_first_pass = true; > + > + /* "Iteration" header */ > + qemu_put_be32(f, spapr->htab_shift); > + > + return 0; > +} > + > +#define MAX_ITERATION_NS 5000000 /* 5 ms */ > + > +static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr, > + int64_t max_ns) > +{ > + int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; > + int index = spapr->htab_save_index; > + int64_t starttime = qemu_get_clock_ns(rt_clock); > + > + assert(spapr->htab_first_pass); > + > + do { > + int chunkstart; > + > + /* Consume invalid HPTEs */ > + while ((index < htabslots) > + && !HPTE_VALID(HPTE(spapr->htab, index))) { > + index++; > + CLEAN_HPTE(HPTE(spapr->htab, index)); > + } > + > + /* Consume valid HPTEs */ > + chunkstart = index; > + while ((index < htabslots) > + && HPTE_VALID(HPTE(spapr->htab, index))) { > + index++; > + CLEAN_HPTE(HPTE(spapr->htab, index)); > + } > + > + if (index > chunkstart) { > + int n_valid = index - chunkstart; > + > + qemu_put_be32(f, chunkstart); > + qemu_put_be16(f, n_valid); > + qemu_put_be16(f, 0); > + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart), > + HASH_PTE_SIZE_64 * n_valid); > + > + if ((qemu_get_clock_ns(rt_clock) - starttime) > max_ns) { > + break; > + } > + } > + } while ((index < htabslots) && !qemu_file_rate_limit(f)); > + > + if (index >= htabslots) { > + assert(index == htabslots); > + index = 0; > + spapr->htab_first_pass = false; > + } > + spapr->htab_save_index = index; > +} > + > +static bool htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr, > + int64_t max_ns) > +{ > + bool final = max_ns < 0; > + int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; > + int examined = 0, sent = 0; > + int index = spapr->htab_save_index; > + int64_t starttime = qemu_get_clock_ns(rt_clock); > + > + assert(!spapr->htab_first_pass); > + > + do { > + int chunkstart, invalidstart; > + > + /* Consume non-dirty HPTEs */ > + while ((index < htabslots) > + && !HPTE_DIRTY(HPTE(spapr->htab, index))) { > + index++; > + examined++; > + } > + > + chunkstart = index; > + /* Consume valid dirty HPTEs */ > + while ((index < htabslots) > + && HPTE_DIRTY(HPTE(spapr->htab, index)) > + && HPTE_VALID(HPTE(spapr->htab, index))) { > + CLEAN_HPTE(HPTE(spapr->htab, index)); > + index++; > + examined++; > + } > + > + invalidstart = index; > + /* Consume invalid dirty HPTEs */ > + while ((index < htabslots) > + && HPTE_DIRTY(HPTE(spapr->htab, index)) > + && !HPTE_VALID(HPTE(spapr->htab, index))) { > + CLEAN_HPTE(HPTE(spapr->htab, index)); > + index++; > + examined++; > + } > + > + if (index > chunkstart) { > + int n_valid = invalidstart - chunkstart; > + int n_invalid = index - invalidstart; > + > + qemu_put_be32(f, chunkstart); > + qemu_put_be16(f, n_valid); > + qemu_put_be16(f, n_invalid); > + qemu_put_buffer(f, HPTE(spapr->htab, chunkstart), > + HASH_PTE_SIZE_64 * n_valid); > + sent += index - chunkstart; > + > + if (!final && (qemu_get_clock_ns(rt_clock) - starttime) > max_ns) { > + break; > + } > + } > + > + if (examined >= htabslots) { > + break; > + } > + > + if (index >= htabslots) { > + assert(index == htabslots); > + index = 0; > + } > + } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final)); > + > + if (index >= htabslots) { > + assert(index == htabslots); > + index = 0; > + } > + > + spapr->htab_save_index = index; > + > + return (examined >= htabslots) && (sent == 0); > +} > + > +static int htab_save_iterate(QEMUFile *f, void *opaque) > +{ > + sPAPREnvironment *spapr = opaque; > + bool nothingleft = false;; > + > + /* Iteration header */ > + qemu_put_be32(f, 0); > + > + if (spapr->htab_first_pass) { > + htab_save_first_pass(f, spapr, MAX_ITERATION_NS); > + } else { > + nothingleft = htab_save_later_pass(f, spapr, MAX_ITERATION_NS); > + } > + > + /* End marker */ > + qemu_put_be32(f, 0); > + qemu_put_be16(f, 0); > + qemu_put_be16(f, 0); > + > + return nothingleft ? 1 : 0; > +} > + > +static int htab_save_complete(QEMUFile *f, void *opaque) > +{ > + sPAPREnvironment *spapr = opaque; > + > + /* Iteration header */ > + qemu_put_be32(f, 0); > + > + htab_save_later_pass(f, spapr, -1); > + > + /* End marker */ > + qemu_put_be32(f, 0); > + qemu_put_be16(f, 0); > + qemu_put_be16(f, 0); > + > + return 0; > +} > + > +static int htab_load(QEMUFile *f, void *opaque, int version_id) > +{ > + sPAPREnvironment *spapr = opaque; > + uint32_t section_hdr; > + > + if (version_id < 1 || version_id > 1) { > + fprintf(stderr, "htab_load() bad version\n"); > + return -EINVAL; > + } > + > + section_hdr = qemu_get_be32(f); > + > + if (section_hdr) { > + /* First section, just the hash shift */ > + if (spapr->htab_shift != section_hdr) { > + return -EINVAL; > + } > + return 0; > + } > + > + while (true) { > + uint32_t index; > + uint16_t n_valid, n_invalid; > + > + index = qemu_get_be32(f); > + n_valid = qemu_get_be16(f); > + n_invalid = qemu_get_be16(f); > + > + if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) { > + /* End of Stream */ > + break; > + } > + > + if ((index + n_valid + n_invalid) >= > + (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) { > + /* Bad index in stream */ > + fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) " > + "in htab stream\n", index, n_valid, n_invalid); > + return -EINVAL; > + } > + > + if (n_valid) { > + qemu_get_buffer(f, HPTE(spapr->htab, index), > + HASH_PTE_SIZE_64 * n_valid); > + } > + if (n_invalid) { > + memset(HPTE(spapr->htab, index + n_valid), 0, > + HASH_PTE_SIZE_64 * n_invalid); > + } > + } > + > + return 0; > +} > + > +static SaveVMHandlers savevm_htab_handlers = { > + .save_live_setup = htab_save_setup, > + .save_live_iterate = htab_save_iterate, > + .save_live_complete = htab_save_complete, > + .load_state = htab_load, > +}; > + > static struct icp_state *try_create_xics(const char *type, int nr_servers, > int nr_irqs) > { > @@ -987,6 +1250,10 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args) > > spapr->entry_point = 0x100; > > + vmstate_register(NULL, 0, &vmstate_spapr, spapr); > + register_savevm_live(NULL, "spapr/htab", -1, 1, > + &savevm_htab_handlers, spapr); > + > /* Prepare the device tree */ > spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, > initrd_base, initrd_size, > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index e6f321d..7ca984e 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -115,7 +115,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr, > } > ppc_hash64_store_hpte1(env, hpte, ptel); > /* eieio(); FIXME: need some sort of barrier for smp? */ > - ppc_hash64_store_hpte0(env, hpte, pteh); > + ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY); > > args[0] = pte_index + i; > return H_SUCCESS; > @@ -152,7 +152,7 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex, > } > *vp = v; > *rp = r; > - ppc_hash64_store_hpte0(env, hpte, 0); > + ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY); > rb = compute_tlbie_rb(v, r, ptex); > ppc_tlb_invalidate_one(env, rb); > return REMOVE_SUCCESS; > @@ -282,11 +282,11 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr, > r |= (flags << 48) & HPTE64_R_KEY_HI; > r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO); > rb = compute_tlbie_rb(v, r, pte_index); > - ppc_hash64_store_hpte0(env, hpte, v & ~HPTE64_V_VALID); > + ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY); > ppc_tlb_invalidate_one(env, rb); > ppc_hash64_store_hpte1(env, hpte, r); > /* Don't need a memory barrier, due to qemu's global lock */ > - ppc_hash64_store_hpte0(env, hpte, v); > + ppc_hash64_store_hpte0(env, hpte, v | HPTE64_V_HPTE_DIRTY); > return H_SUCCESS; > } > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 09c4570..4cfe449 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -9,6 +9,8 @@ struct sPAPRPHBState; > struct sPAPRNVRAM; > struct icp_state; > > +#define HPTE64_V_HPTE_DIRTY 0x0000000000000040ULL > + > typedef struct sPAPREnvironment { > struct VIOsPAPRBus *vio_bus; > QLIST_HEAD(, sPAPRPHBState) phbs; > @@ -17,20 +19,24 @@ typedef struct sPAPREnvironment { > > hwaddr ram_limit; > void *htab; > - long htab_shift; > + uint32_t htab_shift; > hwaddr rma_size; > int vrma_adjust; > hwaddr fdt_addr, rtas_addr; > long rtas_size; > void *fdt_skel; > target_ulong entry_point; > - int next_irq; > - int rtc_offset; > + uint32_t next_irq; > + uint64_t rtc_offset; > char *cpu_model; > bool has_graphics; > > uint32_t epow_irq; > Notifier epow_notifier; > + > + /* Migration state */ > + int htab_save_index; > + bool htab_first_pass; > } sPAPREnvironment; > > #define H_SUCCESS 0 > -- > 1.7.10.4