All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side)
@ 2016-03-21  4:42 David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 1/6] pseries: Stubs for HPT resizing David Gibson
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

This is my latest draft of a PAPR extension allowing a guest's Hash
Page Table (HPT) to be resized at runtime (to better respond to memory
hotplug events).

This series contains the qemu portions of the implementation: a full
implementation for TCG and KVM PR guests, and hooks to enable and
advertise the KVM HV implementation when available.

Using this requires a guest which is aware of HPT resizing, and, if
using KVM HV, a host KVM which implements it as well.  The latest
guest and host kernel implementations can be found here:
    https://lists.ozlabs.org/pipermail/linuxppc-dev/2016-March/140896.html

David Gibson (6):
  pseries: Stubs for HPT resizing
  pseries: Implement HPT resizing
  pseries: Enable HPT resizing for 2.6
  pseries: Use smaller default hash page tables when guest can resize
  pseries: Allow HPT resizing on PR KVM
  pseries: Allow KVM HV implementation of HPT resizing to be used

 hw/ppc/spapr.c            |  84 +++++++++-
 hw/ppc/spapr_hcall.c      | 405 +++++++++++++++++++++++++++++++++++++++++++++-
 include/hw/ppc/spapr.h    |  21 ++-
 linux-headers/linux/kvm.h |   1 +
 target-ppc/kvm.c          |  54 +++++++
 target-ppc/kvm_ppc.h      |   6 +
 target-ppc/mmu-hash64.h   |   4 +
 trace-events              |   2 +
 8 files changed, 567 insertions(+), 10 deletions(-)

-- 
2.5.0

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 1/6] pseries: Stubs for HPT resizing
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
@ 2016-03-21  4:42 ` David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 2/6] pseries: Implement " David Gibson
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

This introduces stub implementations of the H_RESIZE_HPT_PREPARE and
H_RESIZE_HPT_COMMIT hypercalls which we hope to add in a PAPR
extension to allow run time resizing of a guest's hash page table.  It
also adds a new machine property for controlling whether this new
facility is available.

Finally, it adds a new string to the hypertas property in the device
tree, advertising to the guest the availability of the HPT resizing
hypercalls.  This is a tentative suggested value, and would need to be
standardized by PAPR before being merged.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/spapr.c         | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_hcall.c   | 37 ++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h | 13 +++++++++-
 target-ppc/kvm.c       | 12 ++++++++++
 target-ppc/kvm_ppc.h   |  6 +++++
 trace-events           |  2 ++
 6 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d43d6d9..a027a80 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -318,6 +318,7 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
                                    const char *kernel_cmdline,
                                    uint32_t epow_irq)
 {
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     void *fdt;
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
@@ -337,6 +338,9 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
     add_str(hypertas, "hcall-splpar");
     add_str(hypertas, "hcall-bulk");
     add_str(hypertas, "hcall-set-mode");
+    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
+        add_str(hypertas, "hcall-hpt-resize");
+    }
     add_str(qemu_hypertas, "hcall-memop1");
 
     fdt = g_malloc0(FDT_MAX_SIZE);
@@ -1733,6 +1737,7 @@ static void ppc_spapr_init(MachineState *machine)
     long load_limit, fw_size;
     bool kernel_le = false;
     char *filename;
+    Error *resize_hpt_err = NULL;
 
     msi_nonbroken = true;
 
@@ -1740,6 +1745,25 @@ static void ppc_spapr_init(MachineState *machine)
 
     cpu_ppc_hypercall = emulate_spapr_hypercall;
 
+    /* Check HPT resizing availability */
+    kvmppc_check_papr_resize_hpt(&resize_hpt_err);
+    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
+        if (resize_hpt_err) {
+            spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
+            error_free(resize_hpt_err);
+            resize_hpt_err = NULL;
+        } else {
+            spapr->resize_hpt = smc->resize_hpt_default;
+        }
+    }
+
+    assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
+
+    if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
+        error_report_err(resize_hpt_err);
+        exit(1);
+    }
+
     /* Allocate RMA if necessary */
     rma_alloc_size = kvmppc_alloc_rma(&rma);
 
@@ -2093,6 +2117,40 @@ static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
     spapr->kvm_type = g_strdup(value);
 }
 
+static char *spapr_get_resize_hpt(Object *obj, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    switch (spapr->resize_hpt) {
+    case SPAPR_RESIZE_HPT_DEFAULT:
+        return g_strdup("default");
+    case SPAPR_RESIZE_HPT_DISABLED:
+        return g_strdup("disabled");
+    case SPAPR_RESIZE_HPT_ENABLED:
+        return g_strdup("enabled");
+    case SPAPR_RESIZE_HPT_REQUIRED:
+        return g_strdup("required");
+    }
+    assert(0);
+}
+
+static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+
+    if (strcmp(value, "default") == 0) {
+        spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
+    } else if (strcmp(value, "disabled") == 0) {
+        spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
+    } else if (strcmp(value, "enabled") == 0) {
+        spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
+    } else if (strcmp(value, "required") == 0) {
+        spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
+    } else {
+        error_setg(errp, "Bad value for \"resize-hpt\" property");
+    }
+}
+
 static void spapr_machine_initfn(Object *obj)
 {
     sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
@@ -2103,6 +2161,12 @@ static void spapr_machine_initfn(Object *obj)
     object_property_set_description(obj, "kvm-type",
                                     "Specifies the KVM virtualization mode (HV, PR)",
                                     NULL);
+
+    object_property_add_str(obj, "resize-hpt",
+                            spapr_get_resize_hpt, spapr_set_resize_hpt, NULL);
+    object_property_set_description(obj, "resize-hpt",
+                                    "Resizing of the Hash Page Table (enabled, disabled, required)",
+                                    NULL);
 }
 
 static void spapr_machine_finalizefn(Object *obj)
@@ -2296,6 +2360,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
     mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
 
     smc->dr_lmb_enabled = true;
+    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
     fwc->get_dev_path = spapr_get_fw_dev_path;
     nc->nmi_monitor_handler = spapr_nmi;
 }
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index b2b1b93..0f0675e 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -338,6 +338,38 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     return H_SUCCESS;
 }
 
+static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         target_ulong opcode,
+                                         target_ulong *args)
+{
+    target_ulong flags = args[0];
+    target_ulong shift = args[1];
+
+    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
+        return H_AUTHORITY;
+    }
+
+    trace_spapr_h_resize_hpt_prepare(flags, shift);
+    return H_HARDWARE;
+}
+
+static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
+                                        sPAPRMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    target_ulong flags = args[0];
+    target_ulong shift = args[1];
+
+    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
+        return H_AUTHORITY;
+    }
+
+    trace_spapr_h_resize_hpt_commit(flags, shift);
+    return H_HARDWARE;
+}
+
 static target_ulong h_set_sprg0(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                                 target_ulong opcode, target_ulong *args)
 {
@@ -1096,6 +1128,11 @@ static void hypercall_register_types(void)
     /* hcall-bulk */
     spapr_register_hypercall(H_BULK_REMOVE, h_bulk_remove);
 
+    /* hcall-hpt-resize */
+    spapr_register_hypercall(KVMPPC_H_RESIZE_HPT_PREPARE,
+                             h_resize_hpt_prepare);
+    spapr_register_hypercall(KVMPPC_H_RESIZE_HPT_COMMIT, h_resize_hpt_commit);
+
     /* hcall-splpar */
     spapr_register_hypercall(H_REGISTER_VPA, h_register_vpa);
     spapr_register_hypercall(H_CEDE, h_cede);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 098d85d..6de5135 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -27,6 +27,13 @@ typedef struct sPAPRMachineState sPAPRMachineState;
 #define SPAPR_MACHINE_CLASS(klass) \
     OBJECT_CLASS_CHECK(sPAPRMachineClass, klass, TYPE_SPAPR_MACHINE)
 
+typedef enum {
+    SPAPR_RESIZE_HPT_DEFAULT = 0,
+    SPAPR_RESIZE_HPT_DISABLED,
+    SPAPR_RESIZE_HPT_ENABLED,
+    SPAPR_RESIZE_HPT_REQUIRED,
+} sPAPRResizeHPT;
+
 /**
  * sPAPRMachineClass:
  */
@@ -37,6 +44,7 @@ struct sPAPRMachineClass {
     /*< public >*/
     bool dr_lmb_enabled;       /* enable dynamic-reconfig/hotplug of LMBs */
     bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
+    sPAPRResizeHPT resize_hpt_default;
 };
 
 /**
@@ -52,6 +60,7 @@ struct sPAPRMachineState {
     XICSState *icp;
     DeviceState *rtc;
 
+    sPAPRResizeHPT resize_hpt;
     void *htab;
     uint32_t htab_shift;
     hwaddr rma_size;
@@ -351,7 +360,9 @@ struct sPAPRMachineState {
 #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
 /* Client Architecture support */
 #define KVMPPC_H_CAS            (KVMPPC_HCALL_BASE + 0x2)
-#define KVMPPC_HCALL_MAX        KVMPPC_H_CAS
+#define KVMPPC_H_RESIZE_HPT_PREPARE (KVMPPC_HCALL_BASE + 0x3)
+#define KVMPPC_H_RESIZE_HPT_COMMIT  (KVMPPC_HCALL_BASE + 0x4)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_RESIZE_HPT_COMMIT
 
 typedef struct sPAPRDeviceTreeUpdateHeader {
     uint32_t version_id;
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index e5183db..989e1d1 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2572,3 +2572,15 @@ int kvmppc_enable_hwrng(void)
 
     return kvmppc_enable_hcall(kvm_state, H_RANDOM);
 }
+
+void kvmppc_check_papr_resize_hpt(Error **errp)
+{
+    if (!kvm_enabled()) {
+        return;
+    }
+
+    /* KVM will need to advertise capability for HPT resizing once
+     * implemented, for now we assume that it's not possible with
+     * KVM */
+    error_setg(errp, "Hash page table resizing not available with KVM");
+}
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index fc79312..5446d6f 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -56,6 +56,7 @@ void kvmppc_hash64_write_pte(CPUPPCState *env, target_ulong pte_index,
 bool kvmppc_has_cap_fixup_hcalls(void);
 int kvmppc_enable_hwrng(void);
 int kvmppc_put_books_sregs(PowerPCCPU *cpu);
+void kvmppc_check_papr_resize_hpt(Error **errp);
 
 #else
 
@@ -252,6 +253,11 @@ static inline int kvmppc_put_books_sregs(PowerPCCPU *cpu)
 {
     abort();
 }
+
+static inline void kvmppc_check_papr_resize_hpt(Error **errp)
+{
+    return;
+}
 #endif
 
 #ifndef CONFIG_KVM
diff --git a/trace-events b/trace-events
index d494de1..dfc263f 100644
--- a/trace-events
+++ b/trace-events
@@ -1419,6 +1419,8 @@ spapr_cas_continue(unsigned long n) "Copy changes to the guest: %ld bytes"
 # hw/ppc/spapr_hcall.c
 spapr_cas_pvr_try(uint32_t pvr) "%x"
 spapr_cas_pvr(uint32_t cur_pvr, bool cpu_match, uint32_t new_pvr, uint64_t pcr) "current=%x, cpu_match=%u, new=%x, compat flags=%"PRIx64
+spapr_h_resize_hpt_prepare(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
+spapr_h_resize_hpt_commit(uint64_t flags, uint64_t shift) "flags=0x%"PRIx64", shift=%"PRIu64
 
 # hw/ppc/spapr_iommu.c
 spapr_iommu_put(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tce=0x%"PRIx64" ret=%"PRId64
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 2/6] pseries: Implement HPT resizing
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 1/6] pseries: Stubs for HPT resizing David Gibson
@ 2016-03-21  4:42 ` David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 3/6] pseries: Enable HPT resizing for 2.6 David Gibson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

This patch implements hypercalls allowing a PAPR guest to resize its own
hash page table.  This will eventually allow for more flexible memory
hotplug.

The implementation is partially asynchronous, handled in a special thread
running the hpt_prepare_thread() function.  The state of a pending resize
is stored in SPAPR_MACHINE->pending_hpt.

The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
if one is already in progress, monitor it for completion.  If there is an
existing HPT resize in progress that doesn't match the size specified in
the call, it will cancel it, replacing it with a new one matching the
given size.

The H_RESIZE_HPT_COMMIT hypercall completes the transition to a resized
HPT, and can only be called successfully once H_RESIZE_HPT_PREPARE has
successfully completed initialization of a new HPT.  The guest must ensure
that there are no concurrent accesses to the existing HPT while this is
called (this effectively means stop_machine() for Linux guests).

For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
HPTE into the new HPT.  This can have quite high latency, but it seems to
be of the order of typical migration downtime latencies for HPTs of size
up to ~2GiB (which would be used in a 256GiB guest).

In future we probably want to move more of the rehashing to the "prepare"
phase, by having H_ENTER and other hcalls update both current and
pending HPTs.  That's a project for another day, but should be possible
without any changes to the guest interface.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/spapr.c          |   4 +-
 hw/ppc/spapr_hcall.c    | 346 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/hw/ppc/spapr.h  |   6 +
 target-ppc/mmu-hash64.h |   4 +
 4 files changed, 354 insertions(+), 6 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index a027a80..53bd8c4 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -91,8 +91,6 @@
 
 #define PHANDLE_XICP            0x00001111
 
-#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
-
 static XICSState *try_create_xics(const char *type, int nr_servers,
                                   int nr_irqs, Error **errp)
 {
@@ -1055,7 +1053,7 @@ static void close_htab_fd(sPAPRMachineState *spapr)
     spapr->htab_fd = -1;
 }
 
-static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
+int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
 {
     int shift;
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 0f0675e..d56b259 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1,5 +1,6 @@
 #include "qemu/osdep.h"
 #include "sysemu/sysemu.h"
+#include "qemu/error-report.h"
 #include "cpu.h"
 #include "helper_regs.h"
 #include "hw/ppc/spapr.h"
@@ -338,20 +339,320 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     return H_SUCCESS;
 }
 
+struct sPAPRPendingHPT {
+    /* These fields are read-only after initialization */
+    int shift;
+    QemuThread thread;
+
+    /* These fields are protected by the BQL */
+    bool complete;
+
+    /* These fields are private to the preparation thread if
+     * !complete, otherwise protected by the BQL */
+    int ret;
+    void *hpt;
+};
+
+static void free_pending_hpt(sPAPRPendingHPT *pending)
+{
+    if (pending->hpt) {
+        qemu_vfree(pending->hpt);
+    }
+
+    g_free(pending);
+}
+
+static void *hpt_prepare_thread(void *opaque)
+{
+    sPAPRPendingHPT *pending = opaque;
+    size_t size = 1ULL << pending->shift;
+
+    pending->hpt = qemu_memalign(size, size);
+    if (pending->hpt) {
+        memset(pending->hpt, 0, size);
+        pending->ret = H_SUCCESS;
+    } else {
+        pending->ret = H_NO_MEM;
+    }
+
+    qemu_mutex_lock_iothread();
+
+    if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt != pending) {
+        /* We've been cancelled, clean ourselves up */
+        free_pending_hpt(pending);
+        goto out;
+    }
+
+    pending->complete = true;
+
+out:
+    qemu_mutex_unlock_iothread();
+    return NULL;
+}
+
+/* Must be called with BQL held */
+static void cancel_hpt_prepare(sPAPRMachineState *spapr)
+{
+    sPAPRPendingHPT *pending = spapr->pending_hpt;
+
+    /* Let the thread know it's cancelled */
+    spapr->pending_hpt = NULL;
+
+    if (!pending) {
+        /* Nothing to do */
+        return;
+    }
+
+    if (!pending->complete) {
+        /* thread will clean itself up */
+        return;
+    }
+
+    free_pending_hpt(pending);
+}
+
+static int build_dimm_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+        DeviceState *dev = DEVICE(obj);
+        if (dev->realized) { /* only realized DIMMs matter */
+            *list = g_slist_prepend(*list, dev);
+        }
+    }
+
+    object_child_foreach(obj, build_dimm_list, opaque);
+    return 0;
+}
+
+static ram_addr_t get_current_ram_size(void)
+{
+    GSList *list = NULL, *item;
+    ram_addr_t size = ram_size;
+
+    build_dimm_list(qdev_get_machine(), &list);
+    for (item = list; item; item = g_slist_next(item)) {
+        Object *obj = OBJECT(item->data);
+        if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
+            size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+                                            &error_abort);
+        }
+    }
+    g_slist_free(list);
+
+    return size;
+}
+
 static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu,
                                          sPAPRMachineState *spapr,
                                          target_ulong opcode,
                                          target_ulong *args)
 {
     target_ulong flags = args[0];
-    target_ulong shift = args[1];
+    int shift = args[1];
+    sPAPRPendingHPT *pending = spapr->pending_hpt;
 
     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
         return H_AUTHORITY;
     }
 
     trace_spapr_h_resize_hpt_prepare(flags, shift);
-    return H_HARDWARE;
+
+    if (flags != 0) {
+        return H_PARAMETER;
+    }
+
+    if (shift && ((shift < 18) || (shift > 46))) {
+        return H_PARAMETER;
+    }
+
+    if (pending) {
+        /* something already in progress */
+        if (pending->shift == shift) {
+            /* and it's suitable */
+            if (pending->complete) {
+                return pending->ret;
+            } else {
+                return H_LONG_BUSY_ORDER_100_MSEC;
+            }
+        }
+
+        /* not suitable, cancel and replace */
+        cancel_hpt_prepare(spapr);
+    }
+
+    if (!shift) {
+        /* nothing to do */
+        return H_SUCCESS;
+    }
+
+    /* start new prepare */
+
+    /* We only allow the guest to allocate an HPT one order above what
+     * we'd normally give them (to stop a small guest claiming a huge
+     * chunk of resources in the HPT) */
+    if (shift > (spapr_hpt_shift_for_ramsize(get_current_ram_size()) + 1)) {
+        return H_RESOURCE;
+    }
+
+    pending = g_malloc0(sizeof(*pending));
+    pending->shift = shift;
+    pending->ret = H_HARDWARE;
+
+    qemu_thread_create(&pending->thread, "sPAPR HPT prepare",
+                       hpt_prepare_thread, pending, QEMU_THREAD_DETACHED);
+
+    spapr->pending_hpt = pending;
+
+    /* In theory we could estimate the time more accurately based on
+     * the new size, but there's not much point */
+    return H_LONG_BUSY_ORDER_100_MSEC;
+}
+
+static uint64_t new_hpte_load0(void *htab, uint64_t pteg, int slot)
+{
+    uint8_t *addr = htab;
+
+    addr += pteg * HASH_PTEG_SIZE_64;
+    addr += slot * HASH_PTE_SIZE_64;
+    return  ldq_p(addr);
+}
+
+static void new_hpte_store(void *htab, uint64_t pteg, int slot,
+                           uint64_t pte0, uint64_t pte1)
+{
+    uint8_t *addr = htab;
+
+    addr += pteg * HASH_PTEG_SIZE_64;
+    addr += slot * HASH_PTE_SIZE_64;
+
+    stq_p(addr, pte0);
+    stq_p(addr + HASH_PTE_SIZE_64/2, pte1);
+}
+
+static int rehash_hpte(PowerPCCPU *cpu, uint64_t token,
+                       void *old, uint64_t oldsize,
+                       void *new, uint64_t newsize,
+                       uint64_t pteg, int slot)
+{
+    uint64_t old_hash_mask = (oldsize >> 7) - 1;
+    uint64_t new_hash_mask = (newsize >> 7) - 1;
+    target_ulong pte0 = ppc_hash64_load_hpte0(cpu, token, slot);
+    target_ulong pte1;
+    uint64_t avpn;
+    unsigned shift, spshift;
+    uint64_t hash, new_pteg, replace_pte0;
+
+    if (!(pte0 & HPTE64_V_VALID) || !(pte0 & HPTE64_V_BOLTED)) {
+        return H_SUCCESS;
+    }
+
+    pte1 = ppc_hash64_load_hpte1(cpu, token, slot);
+
+    shift = ppc_hash64_hpte_page_shift_noslb(cpu, pte0, pte1, &spshift);
+    assert(shift); /* H_ENTER should never have allowed a bad encoding */
+    avpn = HPTE64_V_AVPN_VAL(pte0) & ~(((1ULL << shift) - 1) >> 23);
+
+    if (pte0 & HPTE64_V_SECONDARY) {
+        pteg = ~pteg;
+    }
+
+    if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_256M) {
+        uint64_t offset, vsid;
+
+        /* We only have 28 - 23 bits of offset in avpn */
+        offset = (avpn & 0x1f) << 23;
+        vsid = avpn >> 5;
+        /* We can find more bits from the pteg value */
+        if (shift < 23) {
+            offset |= ((vsid ^ pteg) & old_hash_mask) << shift;
+        }
+
+        hash = vsid ^ (offset >> shift);
+    } else if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_1T) {
+        uint64_t offset, vsid;
+
+        /* We only have 40 - 23 bits of seg_off in avpn */
+        offset = (avpn & 0x1ffff) << 23;
+        vsid = avpn >> 17;
+        if (shift < 23) {
+            offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << shift;
+        }
+
+        hash = vsid ^ (vsid << 25) ^ (offset >> shift);
+    } else {
+        error_report("rehash_pte: Bad segment size in HPTE");
+        return H_HARDWARE;
+    }
+
+    new_pteg = hash & new_hash_mask;
+    if (pte0 & HPTE64_V_SECONDARY) {
+        assert(~pteg == (hash & old_hash_mask));
+        new_pteg = ~new_pteg;
+    } else {
+        assert(pteg == (hash & old_hash_mask));
+    }
+    assert((oldsize != newsize) || (pteg == new_pteg));
+    replace_pte0 = new_hpte_load0(new, new_pteg, slot);
+    if (replace_pte0 & HPTE64_V_VALID) {
+        assert(newsize < oldsize);
+        if (replace_pte0 & HPTE64_V_BOLTED) {
+            if (pte0 & HPTE64_V_BOLTED) {
+                /* Bolted collision, nothing we can do */
+                return H_PTEG_FULL;
+            } else {
+                /* Discard this hpte */
+                return H_SUCCESS;
+            }
+        }
+    }
+
+    new_hpte_store(new, new_pteg, slot, pte0, pte1);
+    return H_SUCCESS;
+}
+
+static int rehash_hpt(PowerPCCPU *cpu,
+                      void *old, uint64_t oldsize,
+                      void *new, uint64_t newsize)
+{
+    CPUPPCState *env = &cpu->env;
+    uint64_t n_ptegs = oldsize >> 7;
+    uint64_t pteg;
+    int slot;
+    int rc;
+
+    assert(env->external_htab == old);
+
+    for (pteg = 0; pteg < n_ptegs; pteg++) {
+        uint64_t token = ppc_hash64_start_access(cpu, pteg * HPTES_PER_GROUP);
+
+        if (!token) {
+            return H_HARDWARE;
+        }
+
+        for (slot = 0; slot < HPTES_PER_GROUP; slot++) {
+            rc = rehash_hpte(cpu, token, old, oldsize, new, newsize,
+                             pteg, slot);
+            if (rc != H_SUCCESS) {
+                ppc_hash64_stop_access(cpu, token);
+                return rc;
+            }
+        }
+        ppc_hash64_stop_access(cpu, token);
+    }
+
+    return H_SUCCESS;
+}
+
+static void pivot_hpt(void *arg)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUState *cs = arg;
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+    ppc_hash64_set_external_hpt(cpu, spapr->htab, spapr->htab_shift,
+                                &error_fatal);
 }
 
 static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
@@ -361,13 +662,52 @@ static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
 {
     target_ulong flags = args[0];
     target_ulong shift = args[1];
+    sPAPRPendingHPT *pending = spapr->pending_hpt;
+    int rc;
+    size_t newsize;
 
     if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
         return H_AUTHORITY;
     }
 
     trace_spapr_h_resize_hpt_commit(flags, shift);
-    return H_HARDWARE;
+
+    if (flags != 0) {
+        return H_PARAMETER;
+    }
+
+    if (!pending || (pending->shift != shift)) {
+        /* no matching prepare */
+        return H_CLOSED;
+    }
+
+    if (!pending->complete) {
+        /* prepare has not completed */
+        return H_BUSY;
+    }
+
+    newsize = 1ULL << pending->shift;
+    rc = rehash_hpt(cpu, spapr->htab, HTAB_SIZE(spapr),
+                    pending->hpt, newsize);
+    if (rc == H_SUCCESS) {
+        CPUState *cs;
+
+        qemu_vfree(spapr->htab);
+        spapr->htab = pending->hpt;
+        spapr->htab_shift = pending->shift;
+
+        CPU_FOREACH(cs) {
+            run_on_cpu(cs, pivot_hpt, cs);
+        }
+
+        pending->hpt = NULL; /* so it's not free()d */
+    }
+
+    /* Clean up */
+    spapr->pending_hpt = NULL;
+    free_pending_hpt(pending);
+
+    return rc;
 }
 
 static target_ulong h_set_sprg0(PowerPCCPU *cpu, sPAPRMachineState *spapr,
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 6de5135..345e633 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -12,6 +12,7 @@ struct sPAPRPHBState;
 struct sPAPRNVRAM;
 typedef struct sPAPRConfigureConnectorState sPAPRConfigureConnectorState;
 typedef struct sPAPREventLogEntry sPAPREventLogEntry;
+typedef struct sPAPRPendingHPT sPAPRPendingHPT;
 
 #define HPTE64_V_HPTE_DIRTY     0x0000000000000040ULL
 #define SPAPR_ENTRY_POINT       0x100
@@ -63,6 +64,8 @@ struct sPAPRMachineState {
     sPAPRResizeHPT resize_hpt;
     void *htab;
     uint32_t htab_shift;
+    sPAPRPendingHPT *pending_hpt; /* in-progress resize */
+
     hwaddr rma_size;
     int vrma_adjust;
     hwaddr fdt_addr, rtas_addr;
@@ -596,6 +599,7 @@ void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
                                        uint32_t count);
 void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
                                           uint32_t count);
+int spapr_hpt_shift_for_ramsize(uint64_t ramsize);
 
 /* rtas-configure-connector state */
 struct sPAPRConfigureConnectorState {
@@ -639,4 +643,6 @@ int spapr_rng_populate_dt(void *fdt);
  */
 #define SPAPR_LMB_FLAGS_ASSIGNED 0x00000008
 
+#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
index 9bf8b9b..1f0d239 100644
--- a/target-ppc/mmu-hash64.h
+++ b/target-ppc/mmu-hash64.h
@@ -59,11 +59,15 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
 #define HASH_PTE_SIZE_64        16
 #define HASH_PTEG_SIZE_64       (HASH_PTE_SIZE_64 * HPTES_PER_GROUP)
 
+#define HPTE64_V_SSIZE          SLB_VSID_B
+#define HPTE64_V_SSIZE_256M     SLB_VSID_B_256M
+#define HPTE64_V_SSIZE_1T       SLB_VSID_B_1T
 #define HPTE64_V_SSIZE_SHIFT    62
 #define HPTE64_V_AVPN_SHIFT     7
 #define HPTE64_V_AVPN           0x3fffffffffffff80ULL
 #define HPTE64_V_AVPN_VAL(x)    (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT)
 #define HPTE64_V_COMPARE(x, y)  (!(((x) ^ (y)) & 0xffffffffffffff80ULL))
+#define HPTE64_V_BOLTED         0x0000000000000010ULL
 #define HPTE64_V_LARGE          0x0000000000000004ULL
 #define HPTE64_V_SECONDARY      0x0000000000000002ULL
 #define HPTE64_V_VALID          0x0000000000000001ULL
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 3/6] pseries: Enable HPT resizing for 2.6
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 1/6] pseries: Stubs for HPT resizing David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 2/6] pseries: Implement " David Gibson
@ 2016-03-21  4:42 ` David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 4/6] pseries: Use smaller default hash page tables when guest can resize David Gibson
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

We've now implemented a PAPR extension which allows PAPR guests (i.e.
"pseries" machine type) to resize their hash page table during runtime.

However, that extension is only enabled if explicitly chosen on the
command line.  This patch enables it by default for qemu-2.6, but leaves it
disabled (by default) for older machine types.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/spapr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 53bd8c4..1d831ac 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2358,7 +2358,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
     mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
 
     smc->dr_lmb_enabled = true;
-    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
+    smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
     fwc->get_dev_path = spapr_get_fw_dev_path;
     nc->nmi_monitor_handler = spapr_nmi;
 }
@@ -2438,6 +2438,7 @@ static void spapr_machine_2_5_class_options(MachineClass *mc)
 
     spapr_machine_2_6_class_options(mc);
     smc->use_ohci_by_default = true;
+    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
     SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_5);
 }
 
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 4/6] pseries: Use smaller default hash page tables when guest can resize
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
                   ` (2 preceding siblings ...)
  2016-03-21  4:42 ` [Qemu-devel] [RFC 3/6] pseries: Enable HPT resizing for 2.6 David Gibson
@ 2016-03-21  4:42 ` David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 5/6] pseries: Allow HPT resizing on PR KVM David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 6/6] pseries: Allow KVM HV implementation of HPT resizing to be used David Gibson
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

We've now implemented a PAPR extension allowing PAPR guests to resize
their hash page table (HPT) during runtime.

This patch makes use of that facility to allocate smaller HPTs by default.
Specifically when a guest is aware of the HPT resize facility, qemu sizes
the HPT to the initial memory size, rather than the maximum memory size on
the assumption that the guest will resize its HPT if necessary for hot
plugged memory.

When the initial memory size is much smaller than the maximum memory size
(a common configuration with e.g. oVirt / RHEV) then this can save
significant memory on the HPT.

If the guest does *not* advertise HPT resize awareness when it makes the
ibm,client-architecture-support call, qemu resizes the HPT for maximum
memory size (unless it's been configured not to allow such guests at all).

For now we make that reallocation assuming the guest has not yet used the
HPT at all.  That's true in practice, but not, strictly, an architectural
or PAPR requirement.  If we need to in future we can fix this by having
the client-architecture-support call reboot the guest with the revised
HPT size (the client-architecture-support call is explicitly permitted to
trigger a reboot in this way).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/spapr.c         | 14 +++++++++-----
 hw/ppc/spapr_hcall.c   | 28 +++++++++++++++++++++++++++-
 include/hw/ppc/spapr.h |  2 ++
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 1d831ac..0536f86 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1066,8 +1066,8 @@ int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
     return shift;
 }
 
-static void spapr_reallocate_hpt(sPAPRMachineState *spapr, int shift,
-                                 Error **errp)
+void spapr_reallocate_hpt(sPAPRMachineState *spapr, int shift,
+                          Error **errp)
 {
     long rc;
 
@@ -1139,14 +1139,18 @@ static void ppc_spapr_reset(void)
     sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
     PowerPCCPU *first_ppc_cpu;
     uint32_t rtas_limit;
+    int hpt_shift;
 
     /* Check for unknown sysbus devices */
     foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);
 
     /* Allocate and/or reset the hash page table */
-    spapr_reallocate_hpt(spapr,
-                         spapr_hpt_shift_for_ramsize(machine->maxram_size),
-                         &error_fatal);
+    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
+        hpt_shift = spapr_hpt_shift_for_ramsize(machine->maxram_size);
+    } else {
+        hpt_shift = spapr_hpt_shift_for_ramsize(machine->ram_size);
+    }
+    spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
 
     /* Update the RMA size if necessary */
     if (spapr->vrma_adjust) {
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index d56b259..3c2e59f 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1297,12 +1297,14 @@ static void do_set_compat(void *arg)
     ((cpuver) == CPU_POWERPC_LOGICAL_2_07) ? 2070 : 0)
 
 #define OV5_DRCONF_MEMORY 0x20
+#define OV5_HPT_RESIZE    0x80
 
 static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
                                                   sPAPRMachineState *spapr,
                                                   target_ulong opcode,
                                                   target_ulong *args)
 {
+    MachineState *machine = MACHINE(spapr);
     target_ulong list = ppc64_phys_to_real(args[0]);
     target_ulong ov_table, ov5;
     PowerPCCPUClass *pcc_ = POWERPC_CPU_GET_CLASS(cpu_);
@@ -1312,7 +1314,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
     unsigned compat_lvl = 0, cpu_version = 0;
     unsigned max_lvl = get_compat_level(cpu_->max_compat);
     int counter;
-    char ov5_byte2;
+    char ov5_byte2, ov5_byte8;
 
     /* Parse PVR list */
     for (counter = 0; counter < 512; ++counter) {
@@ -1402,6 +1404,30 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
         memory_update = true;
     }
 
+    ov5_byte8 = ldub_phys(&address_space_memory, ov5 + 8);
+    if (!(ov5_byte8 & OV5_HPT_RESIZE)) {
+        int maxshift = spapr_hpt_shift_for_ramsize(machine->maxram_size);
+
+        if (spapr->resize_hpt == SPAPR_RESIZE_HPT_REQUIRED) {
+            error_report(
+                "h_client_architecture_support: Guest doesn't support HPT resizing with resize-hpt=required");
+            exit(1);
+        }
+
+        if (spapr->htab_shift < maxshift) {
+            CPUState *cs;
+            /* Guest doesn't know about HPT resizing, so we
+             * pre-emptively resize for the maximum permitted RAM.  At
+             * the point this is called, nothing should have been
+             * entered into the existing HPT */
+            spapr_reallocate_hpt(spapr, maxshift, &error_fatal);
+            CPU_FOREACH(cs) {
+                run_on_cpu(cs, pivot_hpt, cs);
+            }
+            cpu_update = true;
+        }
+    }
+
     if (spapr_h_cas_compose_response(spapr, args[1], args[2],
                                      cpu_update, memory_update)) {
         qemu_system_reset_request();
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 345e633..7ffe0ea 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -600,6 +600,8 @@ void spapr_hotplug_req_add_by_count(sPAPRDRConnectorType drc_type,
 void spapr_hotplug_req_remove_by_count(sPAPRDRConnectorType drc_type,
                                           uint32_t count);
 int spapr_hpt_shift_for_ramsize(uint64_t ramsize);
+void spapr_reallocate_hpt(sPAPRMachineState *spapr, int shift,
+                          Error **errp);
 
 /* rtas-configure-connector state */
 struct sPAPRConfigureConnectorState {
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 5/6] pseries: Allow HPT resizing on PR KVM
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
                   ` (3 preceding siblings ...)
  2016-03-21  4:42 ` [Qemu-devel] [RFC 4/6] pseries: Use smaller default hash page tables when guest can resize David Gibson
@ 2016-03-21  4:42 ` David Gibson
  2016-03-21  4:42 ` [Qemu-devel] [RFC 6/6] pseries: Allow KVM HV implementation of HPT resizing to be used David Gibson
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

The initial implementation of PAPR hash page table (HPT) resizing is based
on TCG and will not work with KVM.  Eventually, of course, we want to
implement this in KVM as well.

Long term that will require a new kernel capability flag to let qemu know
if it's capable of handling HPT resizing.  However, the "PR" KVM
implementation already supports HPT resizing, since it has the HPT
managed by QEMU - the TCG implementation will work for PR KVM as well.

This patch adds code to detect PR KVM and permit HPT resizing in this case.
Explicitly detecting PR KVM, rather than using a capability relevant to
the feature at hand, is frowned upon, but this should be adequate as a
fallback until that flag is added to the kernel.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 target-ppc/kvm.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 989e1d1..562e9fa 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2208,6 +2208,7 @@ int kvmppc_reset_htab(int shift_hint)
         /* Full emulation, tell caller to allocate htab itself */
         return 0;
     }
+
     if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
         int ret;
         ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
@@ -2579,8 +2580,12 @@ void kvmppc_check_papr_resize_hpt(Error **errp)
         return;
     }
 
-    /* KVM will need to advertise capability for HPT resizing once
-     * implemented, for now we assume that it's not possible with
-     * KVM */
-    error_setg(errp, "Hash page table resizing not available with KVM");
+    /* TODO: Check specific capabilities for HPT resize aware host kernels */
+
+    /* Fall back to checking if we have PR or HV KVM */
+    if (!kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
+        error_setg(errp, "Hash page table resizing not available with HV KVM");
+    }
+
+    /* PR KVM, we should be ok */
 }
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [RFC 6/6] pseries: Allow KVM HV implementation of HPT resizing to be used
  2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
                   ` (4 preceding siblings ...)
  2016-03-21  4:42 ` [Qemu-devel] [RFC 5/6] pseries: Allow HPT resizing on PR KVM David Gibson
@ 2016-03-21  4:42 ` David Gibson
  5 siblings, 0 replies; 7+ messages in thread
From: David Gibson @ 2016-03-21  4:42 UTC (permalink / raw)
  To: paulus, aik; +Cc: agraf, David Gibson, qemu-ppc, qemu-devel, bharata

So far, qemu implements the PAPR Hash Page Table (HPT) resizing extension
with TCG or KVM PR, since in both cases the guest's HPT is managed by
qemu.  For now we don't allow HPT resizing with KVM HV, since the HPT is
managed by the KVM host kernel in that case.

Instead of using a hack to detect KVM HV versus KVM PR, this patch uses
a special capability in newer kernels to directly see if HPT resizing is
supported by KVM.  The capability can advertise either qemu-managed HPT
resizing (KVM PR) or KVM-managed HPT resizing (KVM HV).

If the kernel doesn't advertise the capability at all, we fall back on the
old PR vs. HV test.

NOTE: This patch updates the linux-headers tree with the define for the
new capability.  Since the corresponding kernel changes aren't yet upstream
this is a temporary hack to be replaced by a proper headers update before
merge.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 linux-headers/linux/kvm.h |  1 +
 target-ppc/kvm.c          | 49 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4a56b9e..96c5cfb 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -856,6 +856,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
+#define KVM_CAP_SPAPR_RESIZE_HPT 128
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 562e9fa..0e1c487 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2576,16 +2576,53 @@ int kvmppc_enable_hwrng(void)
 
 void kvmppc_check_papr_resize_hpt(Error **errp)
 {
+    int cap_resize_hpt;
+    int rc;
+
     if (!kvm_enabled()) {
         return;
     }
 
-    /* TODO: Check specific capabilities for HPT resize aware host kernels */
+    cap_resize_hpt = kvm_vm_check_extension(kvm_state,
+                                            KVM_CAP_SPAPR_RESIZE_HPT);
 
-    /* Fall back to checking if we have PR or HV KVM */
-    if (!kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
-        error_setg(errp, "Hash page table resizing not available with HV KVM");
-    }
+    switch (cap_resize_hpt) {
+    case 0:
+        /* Fall back to checking if we have PR or HV KVM */
+        if (!kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
+            error_setg(errp,
+                       "Hash page table resizing not available with HV KVM");
+        }
+        /* PR KVM, we should be ok */
+        return;
 
-    /* PR KVM, we should be ok */
+    case 1:
+        /* Resizing only allowed with a qemu managed HPT */
+        if (kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
+            error_setg(errp,
+                       "Hash page table resizing not available with kernel managed HPT");
+        }
+        /* qemu managed HPT, we should be ok */
+        return;
+
+    case 2:
+        /* HPT resize allowed with kernel managed HPT too */
+        rc = kvmppc_enable_hcall(kvm_state, KVMPPC_H_RESIZE_HPT_PREPARE);
+        if (rc < 0) {
+            error_setg_errno(errp, -rc,
+                             "Unable to enable H_RESIZE_HPT_PREPARE hypercall");
+            return;
+        }
+        rc = kvmppc_enable_hcall(kvm_state, KVMPPC_H_RESIZE_HPT_COMMIT);
+        if (rc < 0) {
+            error_setg_errno(errp, -rc,
+                             "Unable to enable H_RESIZE_HPT_COMMIT hypercall");
+        }
+        return;
+
+    default:
+        error_setg(errp, "Unknown KVM_CAP_SPAPR_RESIZE_HPT value %d",
+                   cap_resize_hpt);
+        return;
+    }
 }
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-03-21  4:41 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-21  4:42 [Qemu-devel] [RFC 0/6] PAPR HPT resizing (qemu host side) David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 1/6] pseries: Stubs for HPT resizing David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 2/6] pseries: Implement " David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 3/6] pseries: Enable HPT resizing for 2.6 David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 4/6] pseries: Use smaller default hash page tables when guest can resize David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 5/6] pseries: Allow HPT resizing on PR KVM David Gibson
2016-03-21  4:42 ` [Qemu-devel] [RFC 6/6] pseries: Allow KVM HV implementation of HPT resizing to be used David Gibson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.