All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
@ 2015-11-04 23:06 Sukadev Bhattiprolu
  2015-11-09  1:57 ` Alexey Kardashevskiy
  2015-11-09  4:58 ` David Gibson
  0 siblings, 2 replies; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-04 23:06 UTC (permalink / raw)
  To: aik, agraf, benh, david, paulus, stewart; +Cc: nacc, qemu-ppc, qemu-devel

Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
call in qemu. This call returns the processor module (socket), chip and core
information as specified in section 7.3.16.18 of PAPR v2.7.

We walk the /proc/device-tree to determine the number of chips, cores and
modules in the _host_ system and return this info to the guest application
that makes the rtas_get_sysparm() call.

We currently hard code the number of module_types to 1, but we should determine
that dynamically somehow later.

Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
---
Changelog[v2]:
        [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(),
                stw_be_phys(), g_hash_table_new_full(), error_report() rather
                than re-inventing; fix indentation, function prottypes etc;
                Drop the fts() interface (which doesn't seem to be available
                on mingw32/mingw64) and use opendir() to walk specific
                directories in the directory tree.
---
 hw/ppc/Makefile.objs        |   1 +
 hw/ppc/spapr_rtas.c         |  35 +++++++
 hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_rtas_modinfo.h |  12 +++
 include/hw/ppc/spapr.h      |   1 +
 5 files changed, 279 insertions(+)
 create mode 100644 hw/ppc/spapr_rtas_modinfo.c
 create mode 100644 hw/ppc/spapr_rtas_modinfo.h

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c1ffc77..57c6b02 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
 obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
+obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
 obj-y += spapr_pci_vfio.o
 endif
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 34b12a3..41fd8a6 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -34,6 +34,8 @@
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_vio.h"
 #include "qapi-event.h"
+
+#include "spapr_rtas_modinfo.h"
 #include "hw/boards.h"
 
 #include <libfdt.h>
@@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
     target_ulong ret = RTAS_OUT_SUCCESS;
 
     switch (parameter) {
+    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
+        int i;
+        int offset = 0;
+        int size;
+        struct rtas_module_info modinfo;
+
+        if (rtas_get_module_info(&modinfo)) {
+            break;
+        }
+
+        size = sizeof(modinfo);
+        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
+
+        stw_be_phys(&address_space_memory, buffer+offset, size);
+        offset += 2;
+
+        stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types);
+        offset += 2;
+
+        for (i = 0; i < modinfo.module_types; i++) {
+            stw_be_phys(&address_space_memory, buffer+offset,
+                                    modinfo.si[i].sockets);
+            offset += 2;
+            stw_be_phys(&address_space_memory, buffer+offset,
+                                    modinfo.si[i].chips);
+            offset += 2;
+            stw_be_phys(&address_space_memory, buffer+offset,
+                                    modinfo.si[i].cores_per_chip);
+            offset += 2;
+        }
+        break;
+    }
+
     case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
         char *param_val = g_strdup_printf("MaxEntCap=%d,"
                                           "DesMem=%llu,"
diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
new file mode 100644
index 0000000..068fc2c
--- /dev/null
+++ b/hw/ppc/spapr_rtas_modinfo.c
@@ -0,0 +1,230 @@
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * Hypercall based emulated RTAS
+ *
+ * Copyright (c) 2015 Sukadev Bhattiprolu, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#include <stdio.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <errno.h>
+
+#include <glib.h>
+#include "spapr_rtas_modinfo.h"
+#include "qemu/error-report.h"
+#include "qemu/bswap.h"
+
+static int file_read_buf(char *file_name, char *buf, int len)
+{
+    int rc;
+    FILE *fp;
+
+    fp = fopen(file_name, "r");
+    if (!fp) {
+        error_report("%s: Error opening %s\n", __func__, file_name);
+        return -1;
+    }
+
+    rc = fread(buf, 1, len, fp);
+    fclose(fp);
+
+    if (rc != len) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Read an identifier from the file @path and add the identifier
+ * to the hash table @gt unless its already in the table.
+ */
+static int hash_table_add_contents(GHashTable *gt, char *path)
+{
+    int idx;
+    char buf[16];
+    int *key;
+
+    if (file_read_buf(path, buf, sizeof(int))) {
+        return -1;
+    }
+
+    idx = ldl_be_p(buf);
+
+    if (g_hash_table_contains(gt, &idx)) {
+        return 0;
+    }
+
+    key = g_malloc(sizeof(int));
+
+    *key = idx;
+    if (!g_hash_table_insert(gt, key, NULL)) {
+        error_report("Unable to insert key %d into chips hash table\n", idx);
+        g_free(key);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Each core in the system is represented by a directory with the
+ * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
+ * Process that directory and count the number of cores in the system.
+ */
+static int count_cores(int *num_cores)
+{
+    int rc;
+    DIR *dir;
+    struct dirent *ent;
+    const char *cpus_dir = "/proc/device-tree/cpus";
+    const char *ppc_prefix = "PowerPC,POWER";
+
+    dir = opendir(cpus_dir);
+    if (!dir) {
+        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
+        return -1;
+    }
+
+    *num_cores = 0;
+    while (1) {
+        errno = 0;
+        ent = readdir(dir);
+        if (!ent) {
+            break;
+        }
+
+        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
+            (*num_cores)++;
+        }
+    }
+
+    rc = 0;
+    if (errno) {
+        rc = -1;
+    }
+
+    closedir(dir);
+    return rc;
+}
+
+/*
+ * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
+ * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
+ * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
+ *
+ * A module can contain more than one chip and a chip can contain more
+ * than one core. So there are likely to be duplicates in the module
+ * and chip identifiers (i.e more than one xscom directory can contain
+ * the same module/chip id).
+ *
+ * Search the xscom directories and count the number of _UNIQUE_ module
+ * and chip identifiers in the system.
+ */
+static int count_modules_chips(int *num_modules, int *num_chips)
+{
+    int rc;
+    DIR *dir;
+    struct dirent *ent;
+    char path[PATH_MAX];
+    const char *xscom_dir = "/proc/device-tree";
+    const char *xscom_prefix = "xscom@";
+    GHashTable *chips_tbl, *modules_tbl;
+
+    dir = opendir(xscom_dir);
+    if (!dir) {
+        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
+        return -1;
+    }
+
+    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
+    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
+
+    rc = -1;
+    while (1) {
+        errno = 0;
+        ent = readdir(dir);
+        if (!ent) {
+            break;
+        }
+
+        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
+            continue;
+        }
+
+        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
+        if (hash_table_add_contents(chips_tbl, path)) {
+            goto cleanup;
+        }
+
+        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
+        if (hash_table_add_contents(modules_tbl, path)) {
+            goto cleanup;
+        }
+    }
+
+    if (errno) {
+        goto cleanup;
+    }
+
+    *num_modules = g_hash_table_size(modules_tbl);
+    *num_chips = g_hash_table_size(chips_tbl);
+    rc = 0;
+
+cleanup:
+    g_hash_table_remove_all(modules_tbl);
+    g_hash_table_destroy(modules_tbl);
+
+    g_hash_table_remove_all(chips_tbl);
+    g_hash_table_destroy(chips_tbl);
+
+    closedir(dir);
+
+    return rc;
+}
+
+int rtas_get_module_info(struct rtas_module_info *modinfo)
+{
+    int cores;
+    int chips;
+    int modules;
+
+    memset(modinfo, 0, sizeof(struct rtas_module_info));
+
+    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
+        return -1;
+    }
+
+    /*
+     * TODO: Hard code number of module_types for now till
+     *       we figure out how to determine it dynamically.
+     */
+    modinfo->module_types = 1;
+    modinfo->si[0].sockets = modules;
+    modinfo->si[0].chips = chips;
+    modinfo->si[0].cores_per_chip = cores / chips;
+
+    return 0;
+}
diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
new file mode 100644
index 0000000..356a2b5
--- /dev/null
+++ b/hw/ppc/spapr_rtas_modinfo.h
@@ -0,0 +1,12 @@
+
+struct rtas_socket_info {
+    unsigned short sockets;
+    unsigned short chips;
+    unsigned short cores_per_chip;
+};
+struct rtas_module_info {
+    unsigned short module_types;
+    struct rtas_socket_info si[1];
+};
+
+extern int rtas_get_module_info(struct rtas_module_info *topo);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 5baa906..cfe7fa2 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -463,6 +463,7 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
 /* RTAS ibm,get-system-parameter token values */
 #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
 #define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE        42
+#define RTAS_SYSPARM_PROCESSOR_MODULE_INFO       43
 #define RTAS_SYSPARM_UUID                        48
 
 /* RTAS indicator/sensor types
-- 
2.5.3

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-04 23:06 [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO) Sukadev Bhattiprolu
@ 2015-11-09  1:57 ` Alexey Kardashevskiy
  2015-11-09  5:01   ` David Gibson
  2015-11-10  3:57   ` Sukadev Bhattiprolu
  2015-11-09  4:58 ` David Gibson
  1 sibling, 2 replies; 22+ messages in thread
From: Alexey Kardashevskiy @ 2015-11-09  1:57 UTC (permalink / raw)
  To: Sukadev Bhattiprolu, agraf, benh, david, paulus, stewart
  Cc: nacc, qemu-ppc, qemu-devel

On 11/05/2015 10:06 AM, Sukadev Bhattiprolu wrote:
> Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> call in qemu. This call returns the processor module (socket), chip and core
> information as specified in section 7.3.16.18 of PAPR v2.7.
>
> We walk the /proc/device-tree to determine the number of chips, cores and
> modules in the _host_ system and return this info to the guest application
> that makes the rtas_get_sysparm() call.
>
> We currently hard code the number of module_types to 1, but we should determine
> that dynamically somehow later.
>
> Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.

"iy" is missing :)


>
> Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> ---
> Changelog[v2]:
>          [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(),
>                  stw_be_phys(), g_hash_table_new_full(), error_report() rather
>                  than re-inventing; fix indentation, function prottypes etc;
>                  Drop the fts() interface (which doesn't seem to be available
>                  on mingw32/mingw64) and use opendir() to walk specific
>                  directories in the directory tree.
> ---
>   hw/ppc/Makefile.objs        |   1 +
>   hw/ppc/spapr_rtas.c         |  35 +++++++
>   hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
>   hw/ppc/spapr_rtas_modinfo.h |  12 +++
>   include/hw/ppc/spapr.h      |   1 +
>   5 files changed, 279 insertions(+)
>   create mode 100644 hw/ppc/spapr_rtas_modinfo.c
>   create mode 100644 hw/ppc/spapr_rtas_modinfo.h
>
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index c1ffc77..57c6b02 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
>   obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
>   obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>   obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> +obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o
>   ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>   obj-y += spapr_pci_vfio.o
>   endif
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 34b12a3..41fd8a6 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -34,6 +34,8 @@
>   #include "hw/ppc/spapr.h"
>   #include "hw/ppc/spapr_vio.h"
>   #include "qapi-event.h"
> +
> +#include "spapr_rtas_modinfo.h"
>   #include "hw/boards.h"
>
>   #include <libfdt.h>
> @@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
>       target_ulong ret = RTAS_OUT_SUCCESS;
>
>       switch (parameter) {
> +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> +        int i;
> +        int offset = 0;
> +        int size;

Nit - could be one line.


> +        struct rtas_module_info modinfo;
> +
> +        if (rtas_get_module_info(&modinfo)) {
> +            break;


@ret will be still 0 in this case, set @ret to RTAS_OUT_HW_ERROR here.

Also, rtas_ibm_set_system_parameter() must return RTAS_OUT_NOT_AUTHORIZED 
on RTAS_SYSPARM_PROCESSOR_MODULE_INFO, as PAPR says.


> +        }
> +
> +        size = sizeof(modinfo);
> +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
> +
> +        stw_be_phys(&address_space_memory, buffer+offset, size);
> +        offset += 2;
> +
> +        stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types);
> +        offset += 2;
> +
> +        for (i = 0; i < modinfo.module_types; i++) {
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].sockets);


checkpatch.pl does not warn on this but new lines start under opening brace 
in the previous line.

In terms on vim, it would be:
set expandtab
set tabstop=4
set shiftwidth=4
set cino=:0,(0



> +            offset += 2;
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].chips);
> +            offset += 2;
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].cores_per_chip);
> +            offset += 2;
> +        }
> +        break;
> +    }
> +
>       case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
>           char *param_val = g_strdup_printf("MaxEntCap=%d,"
>                                             "DesMem=%llu,"
> diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
> new file mode 100644
> index 0000000..068fc2c
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_modinfo.c
> @@ -0,0 +1,230 @@
> +/*
> + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> + *
> + * Hypercall based emulated RTAS


This is a description of hw/ppc/spapr_rtas.c, not of the new file.

Not sure the new code deserves a separate file, I'd either:
- add it into hw/ppc/spapr_rtas.c OR
- move rtas_ibm_get_system_parameter() + rtas_ibm_set_system_parameter() to 
a separate file (let's call it  hw/ppc/spapr_rtas_syspar.c) and add this 
new parameter there as there will be new parameters in the future anyway.

But I'll leave to the maintainer (David, hi :) ).



> + *
> + * Copyright (c) 2015 Sukadev Bhattiprolu, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + *
> + */
> +#include <stdio.h>
> +#include <dirent.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <glib.h>
> +#include "spapr_rtas_modinfo.h"
> +#include "qemu/error-report.h"
> +#include "qemu/bswap.h"
> +
> +static int file_read_buf(char *file_name, char *buf, int len)
> +{
> +    int rc;
> +    FILE *fp;
> +
> +    fp = fopen(file_name, "r");
> +    if (!fp) {
> +        error_report("%s: Error opening %s\n", __func__, file_name);


error_report() does not need "\n", remote it here and below.

Another thing - you passed __func__ here but you did not in other places, 
please make this consistent and pass __func__ everywhere OR make error 
messages more descriptive, the latter seems to be preferable way in QEMU, 
either way this should be easily grep'able, for example - "Unable to open 
%s, error %d" is too generic.


> +        return -1;
> +    }
> +
> +    rc = fread(buf, 1, len, fp);
> +    fclose(fp);
> +
> +    if (rc != len) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Read an identifier from the file @path and add the identifier
> + * to the hash table @gt unless its already in the table.
> + */
> +static int hash_table_add_contents(GHashTable *gt, char *path)

Move this helper before count_modules_chips(). I thought for a sec that 
count_cores() uses it too :)


> +{
> +    int idx;
> +    char buf[16];
> +    int *key;
> +
> +    if (file_read_buf(path, buf, sizeof(int))) {
> +        return -1;
> +    }
> +
> +    idx = ldl_be_p(buf);
> +
> +    if (g_hash_table_contains(gt, &idx)) {
> +        return 0;
> +    }
> +
> +    key = g_malloc(sizeof(int));


I kind of hoped that g_direct_hash() (and probably GINT_TO_POINTER() but I 
do not see kvm-all.c using it) will let you avoid g_malloc()'ing the keys.



> +
> +    *key = idx;
> +    if (!g_hash_table_insert(gt, key, NULL)) {
> +        error_report("Unable to insert key %d into chips hash table\n", idx);
> +        g_free(key);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Each core in the system is represented by a directory with the
> + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
> + * Process that directory and count the number of cores in the system.
> + */
> +static int count_cores(int *num_cores)
> +{
> +    int rc;
> +    DIR *dir;
> +    struct dirent *ent;
> +    const char *cpus_dir = "/proc/device-tree/cpus";
> +    const char *ppc_prefix = "PowerPC,POWER";
> +
> +    dir = opendir(cpus_dir);
> +    if (!dir) {
> +        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
> +        return -1;
> +    }
> +
> +    *num_cores = 0;
> +    while (1) {
> +        errno = 0;
> +        ent = readdir(dir);
> +        if (!ent) {

rc = -errno; ....


> +            break;
> +        }
> +
> +        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
> +            (*num_cores)++;
> +        }
> +    }
> +
> +    rc = 0;
> +    if (errno) {
> +        rc = -1;
> +    }


... and remove these 4 lines above?



> +
> +    closedir(dir);
> +    return rc;
> +}
> +
> +/*
> + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> + *
> + * A module can contain more than one chip and a chip can contain more
> + * than one core. So there are likely to be duplicates in the module
> + * and chip identifiers (i.e more than one xscom directory can contain
> + * the same module/chip id).
> + *
> + * Search the xscom directories and count the number of _UNIQUE_ module
> + * and chip identifiers in the system.
> + */
> +static int count_modules_chips(int *num_modules, int *num_chips)
> +{
> +    int rc;

int rc = -1;


> +    DIR *dir;
> +    struct dirent *ent;
> +    char path[PATH_MAX];
> +    const char *xscom_dir = "/proc/device-tree";
> +    const char *xscom_prefix = "xscom@";
> +    GHashTable *chips_tbl, *modules_tbl;
> +
> +    dir = opendir(xscom_dir);
> +    if (!dir) {
> +        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
> +        return -1;
> +    }
> +
> +    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> +    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> +
> +    rc = -1;


Remove this.

> +    while (1) {
> +        errno = 0;
> +        ent = readdir(dir);
> +        if (!ent) {

Add this:
rc = -errno;
goto cleanup;

> +            break;
> +        }
> +
> +        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
> +            continue;
> +        }
> +
> +        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
> +        if (hash_table_add_contents(chips_tbl, path)) {
> +            goto cleanup;
> +        }
> +
> +        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
> +        if (hash_table_add_contents(modules_tbl, path)) {
> +            goto cleanup;
> +        }
> +    }
> +
> +    if (errno) {
> +        goto cleanup;
> +    }

Do not need these 3 lines.


> +
> +    *num_modules = g_hash_table_size(modules_tbl);
> +    *num_chips = g_hash_table_size(chips_tbl);
> +    rc = 0;


Remove this.

count_modules_chips() and count_cores() do pretty similar things, it would 
improve the code if error handling was done similar ways.


> +
> +cleanup:
> +    g_hash_table_remove_all(modules_tbl);
> +    g_hash_table_destroy(modules_tbl);
> +
> +    g_hash_table_remove_all(chips_tbl);
> +    g_hash_table_destroy(chips_tbl);
> +
> +    closedir(dir);
> +
> +    return rc;
> +}
> +
> +int rtas_get_module_info(struct rtas_module_info *modinfo)
> +{
> +    int cores;
> +    int chips;
> +    int modules;

Nit - could be one line.


> +
> +    memset(modinfo, 0, sizeof(struct rtas_module_info));
> +
> +    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
> +        return -1;
> +    }
> +
> +    /*
> +     * TODO: Hard code number of module_types for now till
> +     *       we figure out how to determine it dynamically.
> +     */
> +    modinfo->module_types = 1;
> +    modinfo->si[0].sockets = modules;


A "module" here is what modinfo describes so I'd expect @modules to be "1" 
in the existing code and count_modules_chips() to be named 
count_sockets_chips() and return @sockets, not @modules.

When exactly does a socket become a module? The SPAPR spec uses "sockets" here.


> +    modinfo->si[0].chips = chips;
> +    modinfo->si[0].cores_per_chip = cores / chips;


What if no "ibm,chip-id" was found and chips == 0?


> +
> +    return 0;
> +}
> diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
> new file mode 100644
> index 0000000..356a2b5
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_modinfo.h


This file is missing a license and
#ifndef SPAPR_RTAS_MODINFO_H
#define SPAPR_RTAS_MODINFO_H
#endif

But I'd rather put the content to include/hw/ppc/spapr.h and avoid creating 
new files.


> @@ -0,0 +1,12 @@
> +
> +struct rtas_socket_info {
> +    unsigned short sockets;
> +    unsigned short chips;
> +    unsigned short cores_per_chip;
> +};
> +struct rtas_module_info {
> +    unsigned short module_types;
> +    struct rtas_socket_info si[1];
> +};


Will the number of rtas_socket_info be always a constant or (sure, in the 
future) it may take different values?

Normally when you invent API like rtas_get_module_info(), you tell the 
helper how much memory is allocated in the rtas_module_info struct (in our 
case it is ARRAY_SIZE(rtas_module_info.si) which is fine) and then the 
helper signals somehow how many module types it has found (which is missing 
here).


> +
> +extern int rtas_get_module_info(struct rtas_module_info *topo);
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 5baa906..cfe7fa2 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -463,6 +463,7 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>   /* RTAS ibm,get-system-parameter token values */
>   #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>   #define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE        42
> +#define RTAS_SYSPARM_PROCESSOR_MODULE_INFO       43
>   #define RTAS_SYSPARM_UUID                        48
>
>   /* RTAS indicator/sensor types
>


-- 
Alexey

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-04 23:06 [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO) Sukadev Bhattiprolu
  2015-11-09  1:57 ` Alexey Kardashevskiy
@ 2015-11-09  4:58 ` David Gibson
  2015-11-10  4:22   ` Sukadev Bhattiprolu
  1 sibling, 1 reply; 22+ messages in thread
From: David Gibson @ 2015-11-09  4:58 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus

[-- Attachment #1: Type: text/plain, Size: 14062 bytes --]

On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
> Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> call in qemu. This call returns the processor module (socket), chip and core
> information as specified in section 7.3.16.18 of PAPR v2.7.

PAPR v2.7 isn't available publically.  For upstream patches, please
reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).

> We walk the /proc/device-tree to determine the number of chips, cores and
> modules in the _host_ system and return this info to the guest application
> that makes the rtas_get_sysparm() call.
> 
> We currently hard code the number of module_types to 1, but we should determine
> that dynamically somehow later.
> 
> Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>

This isn't ready to go yet - you need to put some more consideration
into the uncommon cases: PR KVM, TCG and non-Power hosts.

> ---
> Changelog[v2]:
>         [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(),
>                 stw_be_phys(), g_hash_table_new_full(), error_report() rather
>                 than re-inventing; fix indentation, function prottypes etc;
>                 Drop the fts() interface (which doesn't seem to be available
>                 on mingw32/mingw64) and use opendir() to walk specific
>                 directories in the directory tree.
> ---
>  hw/ppc/Makefile.objs        |   1 +
>  hw/ppc/spapr_rtas.c         |  35 +++++++
>  hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_rtas_modinfo.h |  12 +++
>  include/hw/ppc/spapr.h      |   1 +
>  5 files changed, 279 insertions(+)
>  create mode 100644 hw/ppc/spapr_rtas_modinfo.c
>  create mode 100644 hw/ppc/spapr_rtas_modinfo.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index c1ffc77..57c6b02 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
>  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> +obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 34b12a3..41fd8a6 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -34,6 +34,8 @@
>  #include "hw/ppc/spapr.h"
>  #include "hw/ppc/spapr_vio.h"
>  #include "qapi-event.h"
> +
> +#include "spapr_rtas_modinfo.h"
>  #include "hw/boards.h"
>  
>  #include <libfdt.h>
> @@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
>      target_ulong ret = RTAS_OUT_SUCCESS;
>  
>      switch (parameter) {
> +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> +        int i;
> +        int offset = 0;
> +        int size;
> +        struct rtas_module_info modinfo;
> +
> +        if (rtas_get_module_info(&modinfo)) {
> +            break;
> +        }

So, you handle the variable size of this structure before sending it
to the guest, but you don't handle it in allocation of the structure
right here.  You'll get away with that because for now you only ever
have one entry in the sockets array, but it's a bit icky.

> +
> +        size = sizeof(modinfo);
> +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);

More seriously, this calculation will break horribly if you change the
size of the array in struct rtas_module_info.

> +        stw_be_phys(&address_space_memory, buffer+offset, size);
> +        offset += 2;
> +
> +        stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types);
> +        offset += 2;
> +
> +        for (i = 0; i < modinfo.module_types; i++) {
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].sockets);
> +            offset += 2;
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].chips);
> +            offset += 2;
> +            stw_be_phys(&address_space_memory, buffer+offset,
> +                                    modinfo.si[i].cores_per_chip);
> +            offset += 2;
> +        }
> +        break;
> +    }
> +
>      case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
>          char *param_val = g_strdup_printf("MaxEntCap=%d,"
>                                            "DesMem=%llu,"
> diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
> new file mode 100644
> index 0000000..068fc2c
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_modinfo.c
> @@ -0,0 +1,230 @@
> +/*
> + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> + *
> + * Hypercall based emulated RTAS
> + *
> + * Copyright (c) 2015 Sukadev Bhattiprolu, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + *
> + */
> +#include <stdio.h>
> +#include <dirent.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <glib.h>
> +#include "spapr_rtas_modinfo.h"
> +#include "qemu/error-report.h"
> +#include "qemu/bswap.h"

This entirely file assumes that (a) you have a ppc64 Linux host, which
may not be true, and (b) that it's ok to expose host details to the guest.

> +static int file_read_buf(char *file_name, char *buf, int len)
> +{
> +    int rc;
> +    FILE *fp;
> +
> +    fp = fopen(file_name, "r");
> +    if (!fp) {
> +        error_report("%s: Error opening %s\n", __func__, file_name);
> +        return -1;
> +    }
> +
> +    rc = fread(buf, 1, len, fp);
> +    fclose(fp);
> +
> +    if (rc != len) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Read an identifier from the file @path and add the identifier
> + * to the hash table @gt unless its already in the table.
> + */
> +static int hash_table_add_contents(GHashTable *gt, char *path)
> +{
> +    int idx;
> +    char buf[16];
> +    int *key;
> +
> +    if (file_read_buf(path, buf, sizeof(int))) {

If you're just reading sizeof(int), why is the buf 16 bytes?  You
should allocate buf to be the right size and use sizeof(buf).

Also since this is a value you're getting directly from externally,
you should use a fixed width type, rather than 'int'.

> +        return -1;
> +    }
> +
> +    idx = ldl_be_p(buf);

Easier to use the 'beNN_to_cpu' functions than ldl_be here.

> +    if (g_hash_table_contains(gt, &idx)) {
> +        return 0;
> +    }
> +
> +    key = g_malloc(sizeof(int));
> +
> +    *key = idx;
> +    if (!g_hash_table_insert(gt, key, NULL)) {
> +        error_report("Unable to insert key %d into chips hash table\n", idx);
> +        g_free(key);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Each core in the system is represented by a directory with the
> + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
> + * Process that directory and count the number of cores in the system.

True on IBM POWER systems, but not necessarily everywhere - e.g. PR
KVM on an embedded PowerPC host.

> + */
> +static int count_cores(int *num_cores)
> +{
> +    int rc;
> +    DIR *dir;
> +    struct dirent *ent;
> +    const char *cpus_dir = "/proc/device-tree/cpus";
> +    const char *ppc_prefix = "PowerPC,POWER";
> +
> +    dir = opendir(cpus_dir);
> +    if (!dir) {
> +        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
> +        return -1;
> +    }

You could probably do this more simply with glob(3).

> +    *num_cores = 0;
> +    while (1) {
> +        errno = 0;
> +        ent = readdir(dir);
> +        if (!ent) {
> +            break;
> +        }
> +
> +        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
> +            (*num_cores)++;
> +        }
> +    }
> +
> +    rc = 0;
> +    if (errno) {
> +        rc = -1;
> +    }
> +
> +    closedir(dir);
> +    return rc;
> +}
> +
> +/*
> + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> + *
> + * A module can contain more than one chip and a chip can contain more
> + * than one core. So there are likely to be duplicates in the module
> + * and chip identifiers (i.e more than one xscom directory can contain
> + * the same module/chip id).
> + *
> + * Search the xscom directories and count the number of _UNIQUE_ module
> + * and chip identifiers in the system.

There's no direct way to go from a core
(i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
module?

> + */
> +static int count_modules_chips(int *num_modules, int *num_chips)
> +{
> +    int rc;
> +    DIR *dir;
> +    struct dirent *ent;
> +    char path[PATH_MAX];
> +    const char *xscom_dir = "/proc/device-tree";
> +    const char *xscom_prefix = "xscom@";
> +    GHashTable *chips_tbl, *modules_tbl;
> +
> +    dir = opendir(xscom_dir);
> +    if (!dir) {
> +        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
> +        return -1;
> +    }
> +
> +    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> +    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> +
> +    rc = -1;
> +    while (1) {
> +        errno = 0;
> +        ent = readdir(dir);
> +        if (!ent) {
> +            break;
> +        }

Again glob(3) could simplify this.

> +
> +        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
> +            continue;
> +        }
> +
> +        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
> +        if (hash_table_add_contents(chips_tbl, path)) {
> +            goto cleanup;
> +        }
> +
> +        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
> +        if (hash_table_add_contents(modules_tbl, path)) {
> +            goto cleanup;
> +        }
> +    }
> +
> +    if (errno) {
> +        goto cleanup;
> +    }
> +
> +    *num_modules = g_hash_table_size(modules_tbl);
> +    *num_chips = g_hash_table_size(chips_tbl);
> +    rc = 0;
> +
> +cleanup:
> +    g_hash_table_remove_all(modules_tbl);
> +    g_hash_table_destroy(modules_tbl);
> +
> +    g_hash_table_remove_all(chips_tbl);
> +    g_hash_table_destroy(chips_tbl);
> +
> +    closedir(dir);
> +
> +    return rc;
> +}
> +
> +int rtas_get_module_info(struct rtas_module_info *modinfo)
> +{
> +    int cores;
> +    int chips;
> +    int modules;
> +
> +    memset(modinfo, 0, sizeof(struct rtas_module_info));
> +
> +    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
> +        return -1;
> +    }
> +
> +    /*
> +     * TODO: Hard code number of module_types for now till
> +     *       we figure out how to determine it dynamically.
> +     */
> +    modinfo->module_types = 1;
> +    modinfo->si[0].sockets = modules;
> +    modinfo->si[0].chips = chips;
> +    modinfo->si[0].cores_per_chip = cores / chips;
> +
> +    return 0;
> +}
> diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
> new file mode 100644
> index 0000000..356a2b5
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_modinfo.h
> @@ -0,0 +1,12 @@
> +
> +struct rtas_socket_info {
> +    unsigned short sockets;
> +    unsigned short chips;
> +    unsigned short cores_per_chip;
> +};
> +struct rtas_module_info {
> +    unsigned short module_types;
> +    struct rtas_socket_info si[1];

You probably want a "MAX_MODULE_TYPES" #define or similar instead of
harcoding this to 1.

> +};
> +
> +extern int rtas_get_module_info(struct rtas_module_info *topo);
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 5baa906..cfe7fa2 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -463,6 +463,7 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
>  /* RTAS ibm,get-system-parameter token values */
>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>  #define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE        42
> +#define RTAS_SYSPARM_PROCESSOR_MODULE_INFO       43
>  #define RTAS_SYSPARM_UUID                        48
>  
>  /* RTAS indicator/sensor types

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-09  1:57 ` Alexey Kardashevskiy
@ 2015-11-09  5:01   ` David Gibson
  2015-11-10  3:57   ` Sukadev Bhattiprolu
  1 sibling, 0 replies; 22+ messages in thread
From: David Gibson @ 2015-11-09  5:01 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

[-- Attachment #1: Type: text/plain, Size: 6000 bytes --]

On Mon, Nov 09, 2015 at 12:57:48PM +1100, Alexey Kardashevskiy wrote:
> On 11/05/2015 10:06 AM, Sukadev Bhattiprolu wrote:
> >Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> >call in qemu. This call returns the processor module (socket), chip and core
> >information as specified in section 7.3.16.18 of PAPR v2.7.
> >
> >We walk the /proc/device-tree to determine the number of chips, cores and
> >modules in the _host_ system and return this info to the guest application
> >that makes the rtas_get_sysparm() call.
> >
> >We currently hard code the number of module_types to 1, but we should determine
> >that dynamically somehow later.
> >
> >Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> 
> "iy" is missing :)
> 
> 
> >
> >Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> >---
> >Changelog[v2]:
> >         [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(),
> >                 stw_be_phys(), g_hash_table_new_full(), error_report() rather
> >                 than re-inventing; fix indentation, function prottypes etc;
> >                 Drop the fts() interface (which doesn't seem to be available
> >                 on mingw32/mingw64) and use opendir() to walk specific
> >                 directories in the directory tree.
> >---
> >  hw/ppc/Makefile.objs        |   1 +
> >  hw/ppc/spapr_rtas.c         |  35 +++++++
> >  hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
> >  hw/ppc/spapr_rtas_modinfo.h |  12 +++
> >  include/hw/ppc/spapr.h      |   1 +
> >  5 files changed, 279 insertions(+)
> >  create mode 100644 hw/ppc/spapr_rtas_modinfo.c
> >  create mode 100644 hw/ppc/spapr_rtas_modinfo.h
> >
> >diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >index c1ffc77..57c6b02 100644
> >--- a/hw/ppc/Makefile.objs
> >+++ b/hw/ppc/Makefile.objs
> >@@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
> >  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
> >  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
> >  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
> >+obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o
> >  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
> >  obj-y += spapr_pci_vfio.o
> >  endif
> >diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> >index 34b12a3..41fd8a6 100644
> >--- a/hw/ppc/spapr_rtas.c
> >+++ b/hw/ppc/spapr_rtas.c
> >@@ -34,6 +34,8 @@
> >  #include "hw/ppc/spapr.h"
> >  #include "hw/ppc/spapr_vio.h"
> >  #include "qapi-event.h"
> >+
> >+#include "spapr_rtas_modinfo.h"
> >  #include "hw/boards.h"
> >
> >  #include <libfdt.h>
> >@@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
> >      target_ulong ret = RTAS_OUT_SUCCESS;
> >
> >      switch (parameter) {
> >+    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> >+        int i;
> >+        int offset = 0;
> >+        int size;
> 
> Nit - could be one line.
> 
> 
> >+        struct rtas_module_info modinfo;
> >+
> >+        if (rtas_get_module_info(&modinfo)) {
> >+            break;
> 
> 
> @ret will be still 0 in this case, set @ret to RTAS_OUT_HW_ERROR here.
> 
> Also, rtas_ibm_set_system_parameter() must return RTAS_OUT_NOT_AUTHORIZED on
> RTAS_SYSPARM_PROCESSOR_MODULE_INFO, as PAPR says.
> 
> 
> >+        }
> >+
> >+        size = sizeof(modinfo);
> >+        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
> >+
> >+        stw_be_phys(&address_space_memory, buffer+offset, size);
> >+        offset += 2;
> >+
> >+        stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types);
> >+        offset += 2;
> >+
> >+        for (i = 0; i < modinfo.module_types; i++) {
> >+            stw_be_phys(&address_space_memory, buffer+offset,
> >+                                    modinfo.si[i].sockets);
> 
> 
> checkpatch.pl does not warn on this but new lines start under opening brace
> in the previous line.
> 
> In terms on vim, it would be:
> set expandtab
> set tabstop=4
> set shiftwidth=4
> set cino=:0,(0
> 
> 
> 
> >+            offset += 2;
> >+            stw_be_phys(&address_space_memory, buffer+offset,
> >+                                    modinfo.si[i].chips);
> >+            offset += 2;
> >+            stw_be_phys(&address_space_memory, buffer+offset,
> >+                                    modinfo.si[i].cores_per_chip);
> >+            offset += 2;
> >+        }
> >+        break;
> >+    }
> >+
> >      case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
> >          char *param_val = g_strdup_printf("MaxEntCap=%d,"
> >                                            "DesMem=%llu,"
> >diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
> >new file mode 100644
> >index 0000000..068fc2c
> >--- /dev/null
> >+++ b/hw/ppc/spapr_rtas_modinfo.c
> >@@ -0,0 +1,230 @@
> >+/*
> >+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> >+ *
> >+ * Hypercall based emulated RTAS
> 
> 
> This is a description of hw/ppc/spapr_rtas.c, not of the new file.
> 
> Not sure the new code deserves a separate file, I'd either:
> - add it into hw/ppc/spapr_rtas.c OR
> - move rtas_ibm_get_system_parameter() + rtas_ibm_set_system_parameter() to
> a separate file (let's call it  hw/ppc/spapr_rtas_syspar.c) and add this new
> parameter there as there will be new parameters in the future anyway.
> 
> But I'll leave to the maintainer (David, hi :) ).

Actually it probably should go in target-ppc/kvm.c.  Looking up things
on the host in this way only makes sense for KVM, and we have other
code in there that looks in the host /proc/device-tree.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-09  1:57 ` Alexey Kardashevskiy
  2015-11-09  5:01   ` David Gibson
@ 2015-11-10  3:57   ` Sukadev Bhattiprolu
  2015-11-10  4:25     ` Alexey Kardashevskiy
  1 sibling, 1 reply; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-10  3:57 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus, david

Alexey Kardashevskiy [aik@ozlabs.ru] wrote:
| On 11/05/2015 10:06 AM, Sukadev Bhattiprolu wrote:
| >Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
| >call in qemu. This call returns the processor module (socket), chip and core
| >information as specified in section 7.3.16.18 of PAPR v2.7.
| >
| >We walk the /proc/device-tree to determine the number of chips, cores and
| >modules in the _host_ system and return this info to the guest application
| >that makes the rtas_get_sysparm() call.
| >
| >We currently hard code the number of module_types to 1, but we should determine
| >that dynamically somehow later.
| >
| >Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
| 
| "iy" is missing :)

Argh. Sorry :-)

Agree with your other comments. Couple of questions/comments below.

| 
| checkpatch.pl does not warn on this but new lines start under
| opening brace in the previous line.
| 
| In terms on vim, it would be:
| set expandtab
| set tabstop=4
| set shiftwidth=4
| set cino=:0,(0
| 
| 
| 
| >+            offset += 2;
| >+            stw_be_phys(&address_space_memory, buffer+offset,
| >+                                    modinfo.si[i].chips);
| >+            offset += 2;
| >+            stw_be_phys(&address_space_memory, buffer+offset,
| >+                                    modinfo.si[i].cores_per_chip);
| >+            offset += 2;
| >+        }
| >+        break;
| >+    }
| >+
| >      case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
| >          char *param_val = g_strdup_printf("MaxEntCap=%d,"
| >                                            "DesMem=%llu,"
| >diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
| >new file mode 100644
| >index 0000000..068fc2c
| >--- /dev/null
| >+++ b/hw/ppc/spapr_rtas_modinfo.c
| >@@ -0,0 +1,230 @@
| >+/*
| >+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
| >+ *
| >+ * Hypercall based emulated RTAS
| 
| 
| This is a description of hw/ppc/spapr_rtas.c, not of the new file.

Agree.

| 
| Not sure the new code deserves a separate file, I'd either:
| - add it into hw/ppc/spapr_rtas.c OR
| - move rtas_ibm_get_system_parameter() +
| rtas_ibm_set_system_parameter() to a separate file (let's call it
| hw/ppc/spapr_rtas_syspar.c) and add this new parameter there as
| there will be new parameters in the future anyway.
| 
| But I'll leave to the maintainer (David, hi :) ).

Will discuss this on the other thread from David (maybe move to
target-ppc/kvm.c?) 

| 
| 
<snip>

| >+
| >+static int file_read_buf(char *file_name, char *buf, int len)
| >+{
| >+    int rc;
| >+    FILE *fp;
| >+
| >+    fp = fopen(file_name, "r");
| >+    if (!fp) {
| >+        error_report("%s: Error opening %s\n", __func__, file_name);
| 
| 
| error_report() does not need "\n", remote it here and below.
| 
| Another thing - you passed __func__ here but you did not in other
| places, please make this consistent and pass __func__ everywhere OR
| make error messages more descriptive, the latter seems to be
| preferable way in QEMU, either way this should be easily grep'able,
| for example - "Unable to open %s, error %d" is too generic.

will change to "Error opening device tree file %s" and drop the
__func__.

| 
| 
| >+        return -1;
| >+    }
| >+
| >+    rc = fread(buf, 1, len, fp);
| >+    fclose(fp);
| >+
| >+    if (rc != len) {
| >+        return -1;
| >+    }
| >+
| >+    return 0;
| >+}
| >+
| >+/*
| >+ * Read an identifier from the file @path and add the identifier
| >+ * to the hash table @gt unless its already in the table.
| >+ */
| >+static int hash_table_add_contents(GHashTable *gt, char *path)
| 
| Move this helper before count_modules_chips(). I thought for a sec
| that count_cores() uses it too :)

Ok.

| 
| 
| >+{
| >+    int idx;
| >+    char buf[16];
| >+    int *key;
| >+
| >+    if (file_read_buf(path, buf, sizeof(int))) {
| >+        return -1;
| >+    }
| >+
| >+    idx = ldl_be_p(buf);
| >+
| >+    if (g_hash_table_contains(gt, &idx)) {
| >+        return 0;
| >+    }
| >+
| >+    key = g_malloc(sizeof(int));
| 
| 
| I kind of hoped that g_direct_hash() (and probably GINT_TO_POINTER()
| but I do not see kvm-all.c using it) will let you avoid
| g_malloc()'ing the keys.

I think g_direct_hash() and GINT_TO_POINTER will work. Will try them
out.

| 
| 
| 
| >+
| >+    *key = idx;
| >+    if (!g_hash_table_insert(gt, key, NULL)) {
| >+        error_report("Unable to insert key %d into chips hash table\n", idx);
| >+        g_free(key);
| >+        return -1;
| >+    }
| >+
| >+    return 0;
| >+}
| >+
| >+/*
| >+ * Each core in the system is represented by a directory with the
| >+ * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
| >+ * Process that directory and count the number of cores in the system.
| >+ */
| >+static int count_cores(int *num_cores)
| >+{
| >+    int rc;
| >+    DIR *dir;
| >+    struct dirent *ent;
| >+    const char *cpus_dir = "/proc/device-tree/cpus";
| >+    const char *ppc_prefix = "PowerPC,POWER";
| >+
| >+    dir = opendir(cpus_dir);
| >+    if (!dir) {
| >+        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
| >+        return -1;
| >+    }
| >+
| >+    *num_cores = 0;
| >+    while (1) {
| >+        errno = 0;
| >+        ent = readdir(dir);
| >+        if (!ent) {
| 
| rc = -errno; ....

Yes. Thanks,
| 
| 
| >+            break;
| >+        }
| >+
| >+        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
| >+            (*num_cores)++;
| >+        }
| >+    }
| >+
| >+    rc = 0;
| >+    if (errno) {
| >+        rc = -1;
| >+    }
| 
| 
| ... and remove these 4 lines above?

Ok.

| 
| 
| 
| >+
| >+    closedir(dir);
| >+    return rc;
| >+}
| >+
| >+/*
| >+ * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
| >+ * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
| >+ * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
| >+ *
| >+ * A module can contain more than one chip and a chip can contain more
| >+ * than one core. So there are likely to be duplicates in the module
| >+ * and chip identifiers (i.e more than one xscom directory can contain
| >+ * the same module/chip id).
| >+ *
| >+ * Search the xscom directories and count the number of _UNIQUE_ module
| >+ * and chip identifiers in the system.
| >+ */
| >+static int count_modules_chips(int *num_modules, int *num_chips)
| >+{
| >+    int rc;
| 
| int rc = -1;
| 
| 
| >+    DIR *dir;
| >+    struct dirent *ent;
| >+    char path[PATH_MAX];
| >+    const char *xscom_dir = "/proc/device-tree";
| >+    const char *xscom_prefix = "xscom@";
| >+    GHashTable *chips_tbl, *modules_tbl;
| >+
| >+    dir = opendir(xscom_dir);
| >+    if (!dir) {
| >+        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
| >+        return -1;
| >+    }
| >+
| >+    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
| >+    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
| >+
| >+    rc = -1;
| 
| 
| Remove this.
| 
| >+    while (1) {
| >+        errno = 0;
| >+        ent = readdir(dir);
| >+        if (!ent) {
| 
| Add this:
| rc = -errno;
| goto cleanup;
| 
| >+            break;
| >+        }
| >+
| >+        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
| >+            continue;
| >+        }
| >+
| >+        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
| >+        if (hash_table_add_contents(chips_tbl, path)) {
| >+            goto cleanup;
| >+        }
| >+
| >+        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
| >+        if (hash_table_add_contents(modules_tbl, path)) {
| >+            goto cleanup;
| >+        }
| >+    }
| >+
| >+    if (errno) {
| >+        goto cleanup;
| >+    }
| 
| Do not need these 3 lines.
| 
| 
| >+
| >+    *num_modules = g_hash_table_size(modules_tbl);
| >+    *num_chips = g_hash_table_size(chips_tbl);
| >+    rc = 0;
| 
| 
| Remove this.
| 
| count_modules_chips() and count_cores() do pretty similar things, it
| would improve the code if error handling was done similar ways.

Agree.

| 
| 
| >+
| >+cleanup:
| >+    g_hash_table_remove_all(modules_tbl);
| >+    g_hash_table_destroy(modules_tbl);
| >+
| >+    g_hash_table_remove_all(chips_tbl);
| >+    g_hash_table_destroy(chips_tbl);
| >+
| >+    closedir(dir);
| >+
| >+    return rc;
| >+}
| >+
| >+int rtas_get_module_info(struct rtas_module_info *modinfo)
| >+{
| >+    int cores;
| >+    int chips;
| >+    int modules;
| 
| Nit - could be one line.
| 
| 
| >+
| >+    memset(modinfo, 0, sizeof(struct rtas_module_info));
| >+
| >+    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
| >+        return -1;
| >+    }
| >+
| >+    /*
| >+     * TODO: Hard code number of module_types for now till
| >+     *       we figure out how to determine it dynamically.
| >+     */
| >+    modinfo->module_types = 1;
| >+    modinfo->si[0].sockets = modules;
| 
| 
| A "module" here is what modinfo describes so I'd expect @modules to
| be "1" in the existing code and count_modules_chips() to be named
| count_sockets_chips() and return @sockets, not @modules.
| 
| When exactly does a socket become a module? The SPAPR spec uses "sockets" here.

I am trying to get the terminology too :-) Is socket a slot where a
module is attached?

I will change the variable name 'modules' to 'sockets'.
| 
| 
| >+    modinfo->si[0].chips = chips;
| >+    modinfo->si[0].cores_per_chip = cores / chips;
| 
| 
| What if no "ibm,chip-id" was found and chips == 0?

If we fail to readdir(xscom) or fail to read the 'ibm,chip-id',
we return an error which we check above.

| 
| 
| >+
| >+    return 0;
| >+}
| >diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
| >new file mode 100644
| >index 0000000..356a2b5
| >--- /dev/null
| >+++ b/hw/ppc/spapr_rtas_modinfo.h
| 
| 
| This file is missing a license and
| #ifndef SPAPR_RTAS_MODINFO_H
| #define SPAPR_RTAS_MODINFO_H
| #endif
| 
| But I'd rather put the content to include/hw/ppc/spapr.h and avoid
| creating new files.

Ok.

| 
| 
| >@@ -0,0 +1,12 @@
| >+
| >+struct rtas_socket_info {
| >+    unsigned short sockets;
| >+    unsigned short chips;
| >+    unsigned short cores_per_chip;
| >+};
| >+struct rtas_module_info {
| >+    unsigned short module_types;
| >+    struct rtas_socket_info si[1];
| >+};
| 
| 
| Will the number of rtas_socket_info be always a constant or (sure,
| in the future) it may take different values?

TBH, I am not sure. One thing though, array size of rtas_socket_info
depends on number of module _types_. Like Nishanth Aravamudan mentioned
offline, we are not sure if both a DCM and SCM types can be on a system
at once (or even if they can be added dynamically).

Another thing I have on my list to check is how this works with
hotplug CPU (in the host). If the device-tree is properly updated
then these calls will return the updated info? The values in the
array will change but the number of entries (types) in the array is
still 1.

| 
| Normally when you invent API like rtas_get_module_info(), you tell
| the helper how much memory is allocated in the rtas_module_info
| struct (in our case it is ARRAY_SIZE(rtas_module_info.si) which is
| fine) and then the helper signals somehow how many module types it
| has found (which is missing here).

Can we assume that the number of types is static for now i.e both
caller and callee know the size (although I should fix the computation
of size in spapr_rtas.c and use MAX_MODULE_TYPES that David pointed out).

Thanks

Sukadev

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-09  4:58 ` David Gibson
@ 2015-11-10  4:22   ` Sukadev Bhattiprolu
  2015-11-10  9:53     ` Thomas Huth
  2015-11-11  0:17     ` David Gibson
  0 siblings, 2 replies; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-10  4:22 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus

David Gibson [david@gibson.dropbear.id.au] wrote:
| On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
| > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
| > call in qemu. This call returns the processor module (socket), chip and core
| > information as specified in section 7.3.16.18 of PAPR v2.7.
| 
| PAPR v2.7 isn't available publically.  For upstream patches, please
| reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).

Ok.

| 
| > We walk the /proc/device-tree to determine the number of chips, cores and
| > modules in the _host_ system and return this info to the guest application
| > that makes the rtas_get_sysparm() call.
| > 
| > We currently hard code the number of module_types to 1, but we should determine
| > that dynamically somehow later.
| > 
| > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
| > 
| > Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
| 
| This isn't ready to go yet - you need to put some more consideration
| into the uncommon cases: PR KVM, TCG and non-Power hosts.

Ok. Is there a we can make this code applicable only a Powerpc host?
(would moving this code to target-ppc/kvm.c do that?)

| 
| > ---
| > Changelog[v2]:
| >         [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(),
| >                 stw_be_phys(), g_hash_table_new_full(), error_report() rather
| >                 than re-inventing; fix indentation, function prottypes etc;
| >                 Drop the fts() interface (which doesn't seem to be available
| >                 on mingw32/mingw64) and use opendir() to walk specific
| >                 directories in the directory tree.
| > ---
| >  hw/ppc/Makefile.objs        |   1 +
| >  hw/ppc/spapr_rtas.c         |  35 +++++++
| >  hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++
| >  hw/ppc/spapr_rtas_modinfo.h |  12 +++
| >  include/hw/ppc/spapr.h      |   1 +
| >  5 files changed, 279 insertions(+)
| >  create mode 100644 hw/ppc/spapr_rtas_modinfo.c
| >  create mode 100644 hw/ppc/spapr_rtas_modinfo.h
| > 
| > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
| > index c1ffc77..57c6b02 100644
| > --- a/hw/ppc/Makefile.objs
| > +++ b/hw/ppc/Makefile.objs
| > @@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
| >  obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
| >  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
| >  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
| > +obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o
| >  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
| >  obj-y += spapr_pci_vfio.o
| >  endif
| > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
| > index 34b12a3..41fd8a6 100644
| > --- a/hw/ppc/spapr_rtas.c
| > +++ b/hw/ppc/spapr_rtas.c
| > @@ -34,6 +34,8 @@
| >  #include "hw/ppc/spapr.h"
| >  #include "hw/ppc/spapr_vio.h"
| >  #include "qapi-event.h"
| > +
| > +#include "spapr_rtas_modinfo.h"
| >  #include "hw/boards.h"
| >  
| >  #include <libfdt.h>
| > @@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu,
| >      target_ulong ret = RTAS_OUT_SUCCESS;
| >  
| >      switch (parameter) {
| > +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
| > +        int i;
| > +        int offset = 0;
| > +        int size;
| > +        struct rtas_module_info modinfo;
| > +
| > +        if (rtas_get_module_info(&modinfo)) {
| > +            break;
| > +        }
| 
| So, you handle the variable size of this structure before sending it
| to the guest, but you don't handle it in allocation of the structure
| right here.  You'll get away with that because for now you only ever
| have one entry in the sockets array, but it's a bit icky.

Can we assume that the size is static for now...
| 
| > +
| > +        size = sizeof(modinfo);
| > +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
| 
| More seriously, this calculation will break horribly if you change the
| size of the array in struct rtas_module_info.

and just set 'size' to sizeof(modinfo)?.

| 
| > +        stw_be_phys(&address_space_memory, buffer+offset, size);
| > +        offset += 2;
| > +
| > +        stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types);
| > +        offset += 2;
| > +
| > +        for (i = 0; i < modinfo.module_types; i++) {
| > +            stw_be_phys(&address_space_memory, buffer+offset,
| > +                                    modinfo.si[i].sockets);
| > +            offset += 2;
| > +            stw_be_phys(&address_space_memory, buffer+offset,
| > +                                    modinfo.si[i].chips);
| > +            offset += 2;
| > +            stw_be_phys(&address_space_memory, buffer+offset,
| > +                                    modinfo.si[i].cores_per_chip);
| > +            offset += 2;
| > +        }
| > +        break;
| > +    }
| > +
| >      case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
| >          char *param_val = g_strdup_printf("MaxEntCap=%d,"
| >                                            "DesMem=%llu,"
| > diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
| > new file mode 100644
| > index 0000000..068fc2c
| > --- /dev/null
| > +++ b/hw/ppc/spapr_rtas_modinfo.c
| > @@ -0,0 +1,230 @@
| > +/*
| > + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
| > + *
| > + * Hypercall based emulated RTAS
| > + *
| > + * Copyright (c) 2015 Sukadev Bhattiprolu, IBM Corporation.
| > + *
| > + * Permission is hereby granted, free of charge, to any person obtaining a copy
| > + * of this software and associated documentation files (the "Software"), to deal
| > + * in the Software without restriction, including without limitation the rights
| > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
| > + * copies of the Software, and to permit persons to whom the Software is
| > + * furnished to do so, subject to the following conditions:
| > + *
| > + * The above copyright notice and this permission notice shall be included in
| > + * all copies or substantial portions of the Software.
| > + *
| > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
| > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
| > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
| > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
| > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
| > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
| > + * THE SOFTWARE.
| > + *
| > + */
| > +#include <stdio.h>
| > +#include <dirent.h>
| > +#include <sys/types.h>
| > +#include <sys/stat.h>
| > +#include <string.h>
| > +#include <errno.h>
| > +
| > +#include <glib.h>
| > +#include "spapr_rtas_modinfo.h"
| > +#include "qemu/error-report.h"
| > +#include "qemu/bswap.h"
| 
| This entirely file assumes that (a) you have a ppc64 Linux host, which
| may not be true, and (b) that it's ok to expose host details to the guest.

Is there a way we can skip/ignore this code if not on a ppc64 Linux host?

Are there steps I should take on other hosts to not expose host details
to the guests?

| 
| > +static int file_read_buf(char *file_name, char *buf, int len)
| > +{
| > +    int rc;
| > +    FILE *fp;
| > +
| > +    fp = fopen(file_name, "r");
| > +    if (!fp) {
| > +        error_report("%s: Error opening %s\n", __func__, file_name);
| > +        return -1;
| > +    }
| > +
| > +    rc = fread(buf, 1, len, fp);
| > +    fclose(fp);
| > +
| > +    if (rc != len) {
| > +        return -1;
| > +    }
| > +
| > +    return 0;
| > +}
| > +
| > +/*
| > + * Read an identifier from the file @path and add the identifier
| > + * to the hash table @gt unless its already in the table.
| > + */
| > +static int hash_table_add_contents(GHashTable *gt, char *path)
| > +{
| > +    int idx;
| > +    char buf[16];
| > +    int *key;
| > +
| > +    if (file_read_buf(path, buf, sizeof(int))) {
| 
| If you're just reading sizeof(int), why is the buf 16 bytes?  You
| should allocate buf to be the right size and use sizeof(buf).
| 
| Also since this is a value you're getting directly from externally,
| you should use a fixed width type, rather than 'int'.

Maybe uint32_t?

| 
| > +        return -1;
| > +    }
| > +
| > +    idx = ldl_be_p(buf);
| 
| Easier to use the 'beNN_to_cpu' functions than ldl_be here.

Ok.

| 
| > +    if (g_hash_table_contains(gt, &idx)) {
| > +        return 0;
| > +    }
| > +
| > +    key = g_malloc(sizeof(int));
| > +
| > +    *key = idx;
| > +    if (!g_hash_table_insert(gt, key, NULL)) {
| > +        error_report("Unable to insert key %d into chips hash table\n", idx);
| > +        g_free(key);
| > +        return -1;
| > +    }
| > +
| > +    return 0;
| > +}
| > +
| > +/*
| > + * Each core in the system is represented by a directory with the
| > + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
| > + * Process that directory and count the number of cores in the system.
| 
| True on IBM POWER systems, but not necessarily everywhere - e.g. PR
| KVM on an embedded PowerPC host.

What is PR KVM?

| 
| > + */
| > +static int count_cores(int *num_cores)
| > +{
| > +    int rc;
| > +    DIR *dir;
| > +    struct dirent *ent;
| > +    const char *cpus_dir = "/proc/device-tree/cpus";
| > +    const char *ppc_prefix = "PowerPC,POWER";
| > +
| > +    dir = opendir(cpus_dir);
| > +    if (!dir) {
| > +        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
| > +        return -1;
| > +    }
| 
| You could probably do this more simply with glob(3).

Agree, it simplifiies code a lot. Thanks.

| 
| > +    *num_cores = 0;
| > +    while (1) {
| > +        errno = 0;
| > +        ent = readdir(dir);
| > +        if (!ent) {
| > +            break;
| > +        }
| > +
| > +        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
| > +            (*num_cores)++;
| > +        }
| > +    }
| > +
| > +    rc = 0;
| > +    if (errno) {
| > +        rc = -1;
| > +    }
| > +
| > +    closedir(dir);
| > +    return rc;
| > +}
| > +
| > +/*
| > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
| > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
| > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
| > + *
| > + * A module can contain more than one chip and a chip can contain more
| > + * than one core. So there are likely to be duplicates in the module
| > + * and chip identifiers (i.e more than one xscom directory can contain
| > + * the same module/chip id).
| > + *
| > + * Search the xscom directories and count the number of _UNIQUE_ module
| > + * and chip identifiers in the system.
| 
| There's no direct way to go from a core
| (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
| module?

Yes, it would logical to find the chip and module from the core :-)

While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), 
the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the
'ibm,hw-module-id' will be added in the future?

I am using the xscom node to be consistent in counting chips and modules.

| 
| > + */
| > +static int count_modules_chips(int *num_modules, int *num_chips)
| > +{
| > +    int rc;
| > +    DIR *dir;
| > +    struct dirent *ent;
| > +    char path[PATH_MAX];
| > +    const char *xscom_dir = "/proc/device-tree";
| > +    const char *xscom_prefix = "xscom@";
| > +    GHashTable *chips_tbl, *modules_tbl;
| > +
| > +    dir = opendir(xscom_dir);
| > +    if (!dir) {
| > +        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
| > +        return -1;
| > +    }
| > +
| > +    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
| > +    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
| > +
| > +    rc = -1;
| > +    while (1) {
| > +        errno = 0;
| > +        ent = readdir(dir);
| > +        if (!ent) {
| > +            break;
| > +        }
| 
| Again glob(3) could simplify this.

Agree.

| 
| > +
| > +        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
| > +            continue;
| > +        }
| > +
| > +        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
| > +        if (hash_table_add_contents(chips_tbl, path)) {
| > +            goto cleanup;
| > +        }
| > +
| > +        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
| > +        if (hash_table_add_contents(modules_tbl, path)) {
| > +            goto cleanup;
| > +        }
| > +    }
| > +
| > +    if (errno) {
| > +        goto cleanup;
| > +    }
| > +
| > +    *num_modules = g_hash_table_size(modules_tbl);
| > +    *num_chips = g_hash_table_size(chips_tbl);
| > +    rc = 0;
| > +
| > +cleanup:
| > +    g_hash_table_remove_all(modules_tbl);
| > +    g_hash_table_destroy(modules_tbl);
| > +
| > +    g_hash_table_remove_all(chips_tbl);
| > +    g_hash_table_destroy(chips_tbl);
| > +
| > +    closedir(dir);
| > +
| > +    return rc;
| > +}
| > +
| > +int rtas_get_module_info(struct rtas_module_info *modinfo)
| > +{
| > +    int cores;
| > +    int chips;
| > +    int modules;
| > +
| > +    memset(modinfo, 0, sizeof(struct rtas_module_info));
| > +
| > +    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
| > +        return -1;
| > +    }
| > +
| > +    /*
| > +     * TODO: Hard code number of module_types for now till
| > +     *       we figure out how to determine it dynamically.
| > +     */
| > +    modinfo->module_types = 1;
| > +    modinfo->si[0].sockets = modules;
| > +    modinfo->si[0].chips = chips;
| > +    modinfo->si[0].cores_per_chip = cores / chips;
| > +
| > +    return 0;
| > +}
| > diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
| > new file mode 100644
| > index 0000000..356a2b5
| > --- /dev/null
| > +++ b/hw/ppc/spapr_rtas_modinfo.h
| > @@ -0,0 +1,12 @@
| > +
| > +struct rtas_socket_info {
| > +    unsigned short sockets;
| > +    unsigned short chips;
| > +    unsigned short cores_per_chip;
| > +};
| > +struct rtas_module_info {
| > +    unsigned short module_types;
| > +    struct rtas_socket_info si[1];
| 
| You probably want a "MAX_MODULE_TYPES" #define or similar instead of
| harcoding this to 1.

Agree.

| 
| > +};
| > +
| > +extern int rtas_get_module_info(struct rtas_module_info *topo);
| > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
| > index 5baa906..cfe7fa2 100644
| > --- a/include/hw/ppc/spapr.h
| > +++ b/include/hw/ppc/spapr.h
| > @@ -463,6 +463,7 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
| >  /* RTAS ibm,get-system-parameter token values */
| >  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
| >  #define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE        42
| > +#define RTAS_SYSPARM_PROCESSOR_MODULE_INFO       43
| >  #define RTAS_SYSPARM_UUID                        48
| >  
| >  /* RTAS indicator/sensor types
| 
| -- 
| David Gibson			| I'll have my music baroque, and my code
| david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
| 				| _way_ _around_!
| http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  3:57   ` Sukadev Bhattiprolu
@ 2015-11-10  4:25     ` Alexey Kardashevskiy
  2015-11-10  4:46       ` Sukadev Bhattiprolu
  0 siblings, 1 reply; 22+ messages in thread
From: Alexey Kardashevskiy @ 2015-11-10  4:25 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus, david

On 11/10/2015 02:57 PM, Sukadev Bhattiprolu wrote:
> Alexey Kardashevskiy [aik@ozlabs.ru] wrote:
> | On 11/05/2015 10:06 AM, Sukadev Bhattiprolu wrote:
> | >Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> | >call in qemu. This call returns the processor module (socket), chip and core
> | >information as specified in section 7.3.16.18 of PAPR v2.7.
> | >
> | >We walk the /proc/device-tree to determine the number of chips, cores and
> | >modules in the _host_ system and return this info to the guest application
> | >that makes the rtas_get_sysparm() call.
> | >
> | >We currently hard code the number of module_types to 1, but we should determine
> | >that dynamically somehow later.
> | >
> | >Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> |
> | "iy" is missing :)
>
> Argh. Sorry :-)
>
> Agree with your other comments. Couple of questions/comments below.
>
> |
> | checkpatch.pl does not warn on this but new lines start under
> | opening brace in the previous line.
> |
> | In terms on vim, it would be:
> | set expandtab
> | set tabstop=4
> | set shiftwidth=4
> | set cino=:0,(0
> |
> |
> |
> | >+            offset += 2;
> | >+            stw_be_phys(&address_space_memory, buffer+offset,
> | >+                                    modinfo.si[i].chips);
> | >+            offset += 2;
> | >+            stw_be_phys(&address_space_memory, buffer+offset,
> | >+                                    modinfo.si[i].cores_per_chip);
> | >+            offset += 2;
> | >+        }
> | >+        break;
> | >+    }
> | >+
> | >      case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: {
> | >          char *param_val = g_strdup_printf("MaxEntCap=%d,"
> | >                                            "DesMem=%llu,"
> | >diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c
> | >new file mode 100644
> | >index 0000000..068fc2c
> | >--- /dev/null
> | >+++ b/hw/ppc/spapr_rtas_modinfo.c
> | >@@ -0,0 +1,230 @@
> | >+/*
> | >+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
> | >+ *
> | >+ * Hypercall based emulated RTAS
> |
> |
> | This is a description of hw/ppc/spapr_rtas.c, not of the new file.
>
> Agree.
>
> |
> | Not sure the new code deserves a separate file, I'd either:
> | - add it into hw/ppc/spapr_rtas.c OR
> | - move rtas_ibm_get_system_parameter() +
> | rtas_ibm_set_system_parameter() to a separate file (let's call it
> | hw/ppc/spapr_rtas_syspar.c) and add this new parameter there as
> | there will be new parameters in the future anyway.
> |
> | But I'll leave to the maintainer (David, hi :) ).
>
> Will discuss this on the other thread from David (maybe move to
> target-ppc/kvm.c?)


Sounds right.


>
> |
> |
> <snip>
>
> | >+
> | >+static int file_read_buf(char *file_name, char *buf, int len)
> | >+{
> | >+    int rc;
> | >+    FILE *fp;
> | >+
> | >+    fp = fopen(file_name, "r");
> | >+    if (!fp) {
> | >+        error_report("%s: Error opening %s\n", __func__, file_name);
> |
> |
> | error_report() does not need "\n", remote it here and below.
> |
> | Another thing - you passed __func__ here but you did not in other
> | places, please make this consistent and pass __func__ everywhere OR
> | make error messages more descriptive, the latter seems to be
> | preferable way in QEMU, either way this should be easily grep'able,
> | for example - "Unable to open %s, error %d" is too generic.
>
> will change to "Error opening device tree file %s" and drop the
> __func__.
>
> |
> |
> | >+        return -1;
> | >+    }
> | >+
> | >+    rc = fread(buf, 1, len, fp);
> | >+    fclose(fp);
> | >+
> | >+    if (rc != len) {
> | >+        return -1;
> | >+    }
> | >+
> | >+    return 0;
> | >+}
> | >+
> | >+/*
> | >+ * Read an identifier from the file @path and add the identifier
> | >+ * to the hash table @gt unless its already in the table.
> | >+ */
> | >+static int hash_table_add_contents(GHashTable *gt, char *path)
> |
> | Move this helper before count_modules_chips(). I thought for a sec
> | that count_cores() uses it too :)
>
> Ok.
>
> |
> |
> | >+{
> | >+    int idx;
> | >+    char buf[16];
> | >+    int *key;
> | >+
> | >+    if (file_read_buf(path, buf, sizeof(int))) {
> | >+        return -1;
> | >+    }
> | >+
> | >+    idx = ldl_be_p(buf);
> | >+
> | >+    if (g_hash_table_contains(gt, &idx)) {
> | >+        return 0;
> | >+    }
> | >+
> | >+    key = g_malloc(sizeof(int));
> |
> |
> | I kind of hoped that g_direct_hash() (and probably GINT_TO_POINTER()
> | but I do not see kvm-all.c using it) will let you avoid
> | g_malloc()'ing the keys.
>
> I think g_direct_hash() and GINT_TO_POINTER will work. Will try them
> out.
>
> |
> |
> |
> | >+
> | >+    *key = idx;
> | >+    if (!g_hash_table_insert(gt, key, NULL)) {
> | >+        error_report("Unable to insert key %d into chips hash table\n", idx);
> | >+        g_free(key);
> | >+        return -1;
> | >+    }
> | >+
> | >+    return 0;
> | >+}
> | >+
> | >+/*
> | >+ * Each core in the system is represented by a directory with the
> | >+ * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
> | >+ * Process that directory and count the number of cores in the system.
> | >+ */
> | >+static int count_cores(int *num_cores)
> | >+{
> | >+    int rc;
> | >+    DIR *dir;
> | >+    struct dirent *ent;
> | >+    const char *cpus_dir = "/proc/device-tree/cpus";
> | >+    const char *ppc_prefix = "PowerPC,POWER";
> | >+
> | >+    dir = opendir(cpus_dir);
> | >+    if (!dir) {
> | >+        error_report("Unable to open %s, error %d\n", cpus_dir, errno);
> | >+        return -1;
> | >+    }
> | >+
> | >+    *num_cores = 0;
> | >+    while (1) {
> | >+        errno = 0;
> | >+        ent = readdir(dir);
> | >+        if (!ent) {
> |
> | rc = -errno; ....
>
> Yes. Thanks,
> |
> |
> | >+            break;
> | >+        }
> | >+
> | >+        if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) {
> | >+            (*num_cores)++;
> | >+        }
> | >+    }
> | >+
> | >+    rc = 0;
> | >+    if (errno) {
> | >+        rc = -1;
> | >+    }
> |
> |
> | ... and remove these 4 lines above?
>
> Ok.
>
> |
> |
> |
> | >+
> | >+    closedir(dir);
> | >+    return rc;
> | >+}
> | >+
> | >+/*
> | >+ * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> | >+ * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> | >+ * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> | >+ *
> | >+ * A module can contain more than one chip and a chip can contain more
> | >+ * than one core. So there are likely to be duplicates in the module
> | >+ * and chip identifiers (i.e more than one xscom directory can contain
> | >+ * the same module/chip id).
> | >+ *
> | >+ * Search the xscom directories and count the number of _UNIQUE_ module
> | >+ * and chip identifiers in the system.
> | >+ */
> | >+static int count_modules_chips(int *num_modules, int *num_chips)
> | >+{
> | >+    int rc;
> |
> | int rc = -1;
> |
> |
> | >+    DIR *dir;
> | >+    struct dirent *ent;
> | >+    char path[PATH_MAX];
> | >+    const char *xscom_dir = "/proc/device-tree";
> | >+    const char *xscom_prefix = "xscom@";
> | >+    GHashTable *chips_tbl, *modules_tbl;
> | >+
> | >+    dir = opendir(xscom_dir);
> | >+    if (!dir) {
> | >+        error_report("Unable to open %s, error %d\n", xscom_dir, errno);
> | >+        return -1;
> | >+    }
> | >+
> | >+    modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> | >+    chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL);
> | >+
> | >+    rc = -1;
> |
> |
> | Remove this.
> |
> | >+    while (1) {
> | >+        errno = 0;
> | >+        ent = readdir(dir);
> | >+        if (!ent) {
> |
> | Add this:
> | rc = -errno;
> | goto cleanup;
> |
> | >+            break;
> | >+        }
> | >+
> | >+        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
> | >+            continue;
> | >+        }
> | >+
> | >+        sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name);
> | >+        if (hash_table_add_contents(chips_tbl, path)) {
> | >+            goto cleanup;
> | >+        }
> | >+
> | >+        sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name);
> | >+        if (hash_table_add_contents(modules_tbl, path)) {
> | >+            goto cleanup;
> | >+        }
> | >+    }
> | >+
> | >+    if (errno) {
> | >+        goto cleanup;
> | >+    }
> |
> | Do not need these 3 lines.
> |
> |
> | >+
> | >+    *num_modules = g_hash_table_size(modules_tbl);
> | >+    *num_chips = g_hash_table_size(chips_tbl);
> | >+    rc = 0;
> |
> |
> | Remove this.
> |
> | count_modules_chips() and count_cores() do pretty similar things, it
> | would improve the code if error handling was done similar ways.
>
> Agree.
>
> |
> |
> | >+
> | >+cleanup:
> | >+    g_hash_table_remove_all(modules_tbl);
> | >+    g_hash_table_destroy(modules_tbl);
> | >+
> | >+    g_hash_table_remove_all(chips_tbl);
> | >+    g_hash_table_destroy(chips_tbl);
> | >+
> | >+    closedir(dir);
> | >+
> | >+    return rc;
> | >+}
> | >+
> | >+int rtas_get_module_info(struct rtas_module_info *modinfo)
> | >+{
> | >+    int cores;
> | >+    int chips;
> | >+    int modules;
> |
> | Nit - could be one line.
> |
> |
> | >+
> | >+    memset(modinfo, 0, sizeof(struct rtas_module_info));
> | >+
> | >+    if (count_modules_chips(&modules, &chips) || count_cores(&cores)) {
> | >+        return -1;
> | >+    }
> | >+
> | >+    /*
> | >+     * TODO: Hard code number of module_types for now till
> | >+     *       we figure out how to determine it dynamically.
> | >+     */
> | >+    modinfo->module_types = 1;
> | >+    modinfo->si[0].sockets = modules;
> |
> |
> | A "module" here is what modinfo describes so I'd expect @modules to
> | be "1" in the existing code and count_modules_chips() to be named
> | count_sockets_chips() and return @sockets, not @modules.
> |
> | When exactly does a socket become a module? The SPAPR spec uses "sockets" here.
>
> I am trying to get the terminology too :-) Is socket a slot where a
> module is attached?

Sorry, no idea.


>
> I will change the variable name 'modules' to 'sockets'.
> |
> |
> | >+    modinfo->si[0].chips = chips;
> | >+    modinfo->si[0].cores_per_chip = cores / chips;
> |
> |
> | What if no "ibm,chip-id" was found and chips == 0?
>
> If we fail to readdir(xscom) or fail to read the 'ibm,chip-id',
> we return an error which we check above.


You assume that if there is /proc/device-tree, then there is always 
"xscom@" but this might not be always the case, like PR KVM on embedded PPC64.



>
> |
> |
> | >+
> | >+    return 0;
> | >+}
> | >diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
> | >new file mode 100644
> | >index 0000000..356a2b5
> | >--- /dev/null
> | >+++ b/hw/ppc/spapr_rtas_modinfo.h
> |
> |
> | This file is missing a license and
> | #ifndef SPAPR_RTAS_MODINFO_H
> | #define SPAPR_RTAS_MODINFO_H
> | #endif
> |
> | But I'd rather put the content to include/hw/ppc/spapr.h and avoid
> | creating new files.
>
> Ok.
>
> |
> |
> | >@@ -0,0 +1,12 @@
> | >+
> | >+struct rtas_socket_info {
> | >+    unsigned short sockets;
> | >+    unsigned short chips;
> | >+    unsigned short cores_per_chip;
> | >+};
> | >+struct rtas_module_info {
> | >+    unsigned short module_types;
> | >+    struct rtas_socket_info si[1];
> | >+};
> |
> |
> | Will the number of rtas_socket_info be always a constant or (sure,
> | in the future) it may take different values?
>
> TBH, I am not sure. One thing though, array size of rtas_socket_info
> depends on number of module _types_. Like Nishanth Aravamudan mentioned
> offline, we are not sure if both a DCM and SCM types can be on a system
> at once (or even if they can be added dynamically).

What are DCM and SCM? What type do we have in Tuleta?


> Another thing I have on my list to check is how this works with
> hotplug CPU (in the host). If the device-tree is properly updated
> then these calls will return the updated info? The values in the
> array will change but the number of entries (types) in the array is
> still 1.
>
> |
> | Normally when you invent API like rtas_get_module_info(), you tell
> | the helper how much memory is allocated in the rtas_module_info
> | struct (in our case it is ARRAY_SIZE(rtas_module_info.si) which is
> | fine) and then the helper signals somehow how many module types it
> | has found (which is missing here).
>
> Can we assume that the number of types is static for now i.e both
> caller and callee know the size (although I should fix the computation
> of size in spapr_rtas.c and use MAX_MODULE_TYPES that David pointed out).


I do not know what these modules can be so I cannot advise on this, sorry. 
I honestly do not see much point in implementing this system parameter - 
what will the guest do with this information?

You could get rid of rtas_module_info for now and have a helper for 
rtas_socket_info only - rtas_get_xxx_module_info() (where xxx is DCM or SCM 
or whatever it is now and what we support). Then it would be up to 
rtas_ibm_get_system_parameter() to decide how many modules types we want to 
expose to the guest (now - one) and when we get a new module type, we will 
either extend rtas_get_xxx_module_info() or implement another helper.


-- 
Alexey

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  4:25     ` Alexey Kardashevskiy
@ 2015-11-10  4:46       ` Sukadev Bhattiprolu
  2015-11-10  6:58         ` Alexey Kardashevskiy
  0 siblings, 1 reply; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-10  4:46 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus, david

Alexey Kardashevskiy [aik@ozlabs.ru] wrote:
<snip>

| >| When exactly does a socket become a module? The SPAPR spec uses "sockets" here.
| >
| >I am trying to get the terminology too :-) Is socket a slot where a
| >module is attached?
| 
| Sorry, no idea.

Ok.

| 
| 
| >
| >I will change the variable name 'modules' to 'sockets'.
| >|
| >|
| >| >+    modinfo->si[0].chips = chips;
| >| >+    modinfo->si[0].cores_per_chip = cores / chips;
| >|
| >|
| >| What if no "ibm,chip-id" was found and chips == 0?
| >
| >If we fail to readdir(xscom) or fail to read the 'ibm,chip-id',
| >we return an error which we check above.
| 
| 
| You assume that if there is /proc/device-tree, then there is always
| "xscom@" but this might not be always the case, like PR KVM on
| embedded PPC64.

For ibm,chip-id, we do try to read the file (and eliminate duplicates
chip ids) If we can't read the file (hash_table_add_contents()) we
return an error, but will check again.

| 
| 
| 
| >
| >|
| >|
| >| >+
| >| >+    return 0;
| >| >+}
| >| >diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h
| >| >new file mode 100644
| >| >index 0000000..356a2b5
| >| >--- /dev/null
| >| >+++ b/hw/ppc/spapr_rtas_modinfo.h
| >|
| >|
| >| This file is missing a license and
| >| #ifndef SPAPR_RTAS_MODINFO_H
| >| #define SPAPR_RTAS_MODINFO_H
| >| #endif
| >|
| >| But I'd rather put the content to include/hw/ppc/spapr.h and avoid
| >| creating new files.
| >
| >Ok.
| >
| >|
| >|
| >| >@@ -0,0 +1,12 @@
| >| >+
| >| >+struct rtas_socket_info {
| >| >+    unsigned short sockets;
| >| >+    unsigned short chips;
| >| >+    unsigned short cores_per_chip;
| >| >+};
| >| >+struct rtas_module_info {
| >| >+    unsigned short module_types;
| >| >+    struct rtas_socket_info si[1];
| >| >+};
| >|
| >|
| >| Will the number of rtas_socket_info be always a constant or (sure,
| >| in the future) it may take different values?
| >
| >TBH, I am not sure. One thing though, array size of rtas_socket_info
| >depends on number of module _types_. Like Nishanth Aravamudan mentioned
| >offline, we are not sure if both a DCM and SCM types can be on a system
| >at once (or even if they can be added dynamically).
| 
| What are DCM and SCM? What type do we have in Tuleta?

Dual Chip Module and Single Chip Module. Maybe its standard configuration
but on our Tuleta, we have a DCM (2 chips per module): 

	$ lsprop /proc/device-tree/xscom*/ibm,hw-module-id
	/proc/device-tree/xscom@3fc0000000000/ibm,hw-module-id
			 00000000
	/proc/device-tree/xscom@3fc0800000000/ibm,hw-module-id
			 00000000
	/proc/device-tree/xscom@3fc8000000000/ibm,hw-module-id
			 00000001
	/proc/device-tree/xscom@3fc8800000000/ibm,hw-module-id
			 00000001

Each xscom represents a chip. Two chips have module id 0, two have
module id 1.

| 
| 
| >Another thing I have on my list to check is how this works with
| >hotplug CPU (in the host). If the device-tree is properly updated
| >then these calls will return the updated info? The values in the
| >array will change but the number of entries (types) in the array is
| >still 1.
| >
| >|
| >| Normally when you invent API like rtas_get_module_info(), you tell
| >| the helper how much memory is allocated in the rtas_module_info
| >| struct (in our case it is ARRAY_SIZE(rtas_module_info.si) which is
| >| fine) and then the helper signals somehow how many module types it
| >| has found (which is missing here).
| >
| >Can we assume that the number of types is static for now i.e both
| >caller and callee know the size (although I should fix the computation
| >of size in spapr_rtas.c and use MAX_MODULE_TYPES that David pointed out).
| 
| 
| I do not know what these modules can be so I cannot advise on this,
| sorry. I honestly do not see much point in implementing this system
| parameter - what will the guest do with this information?

AFAICT, this system parameter is needed for distro licensing on powerKVM :-)

| 
| You could get rid of rtas_module_info for now and have a helper for
| rtas_socket_info only - rtas_get_xxx_module_info() (where xxx is DCM
| or SCM or whatever it is now and what we support). Then it would be
| up to rtas_ibm_get_system_parameter() to decide how many modules
| types we want to expose to the guest (now - one) and when we get a
| new module type, we will either extend rtas_get_xxx_module_info() or
| implement another helper.
| 
| 
| -- 
| Alexey

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  4:46       ` Sukadev Bhattiprolu
@ 2015-11-10  6:58         ` Alexey Kardashevskiy
  2015-11-10 18:27           ` Sukadev Bhattiprolu
  0 siblings, 1 reply; 22+ messages in thread
From: Alexey Kardashevskiy @ 2015-11-10  6:58 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus, david

On 11/10/2015 03:46 PM, Sukadev Bhattiprolu wrote:
> Alexey Kardashevskiy [aik@ozlabs.ru] wrote:
> <snip>
>
> | >| When exactly does a socket become a module? The SPAPR spec uses "sockets" here.
> | >
> | >I am trying to get the terminology too :-) Is socket a slot where a
> | >module is attached?
> |
> | Sorry, no idea.
>
> Ok.
>
> |
> |
> | >
> | >I will change the variable name 'modules' to 'sockets'.
> | >|
> | >|
> | >| >+    modinfo->si[0].chips = chips;
> | >| >+    modinfo->si[0].cores_per_chip = cores / chips;
> | >|
> | >|
> | >| What if no "ibm,chip-id" was found and chips == 0?
> | >
> | >If we fail to readdir(xscom) or fail to read the 'ibm,chip-id',
> | >we return an error which we check above.
> |
> |
> | You assume that if there is /proc/device-tree, then there is always
> | "xscom@" but this might not be always the case, like PR KVM on
> | embedded PPC64.
>
> For ibm,chip-id, we do try to read the file (and eliminate duplicates


No, you do not try reading "ibm,chip-id" if there is no "xscom@" under 
/proc/device-tree, there is "continue":

+        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
+            continue;
+        }



> chip ids) If we can't read the file (hash_table_add_contents()) we
> return an error, but will check again.



-- 
Alexey

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  4:22   ` Sukadev Bhattiprolu
@ 2015-11-10  9:53     ` Thomas Huth
  2015-11-13 20:29       ` Sukadev Bhattiprolu
  2015-11-11  0:17     ` David Gibson
  1 sibling, 1 reply; 22+ messages in thread
From: Thomas Huth @ 2015-11-10  9:53 UTC (permalink / raw)
  To: Sukadev Bhattiprolu, David Gibson
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus

On 10/11/15 05:22, Sukadev Bhattiprolu wrote:
[...]
> | > +static int file_read_buf(char *file_name, char *buf, int len)
> | > +{
> | > +    int rc;
> | > +    FILE *fp;
> | > +
> | > +    fp = fopen(file_name, "r");
> | > +    if (!fp) {
> | > +        error_report("%s: Error opening %s\n", __func__, file_name);
> | > +        return -1;
> | > +    }
> | > +
> | > +    rc = fread(buf, 1, len, fp);
> | > +    fclose(fp);
> | > +
> | > +    if (rc != len) {
> | > +        return -1;
> | > +    }
> | > +
> | > +    return 0;
> | > +}

Could you maybe use g_file_get_contents() instead?

> | > +/*
> | > + * Each core in the system is represented by a directory with the
> | > + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
> | > + * Process that directory and count the number of cores in the system.
> | 
> | True on IBM POWER systems, but not necessarily everywhere - e.g. PR
> | KVM on an embedded PowerPC host.
> 
> What is PR KVM?

On PPC, there are multiple kinds of KVM kernel modules, e.g. KVM-HV and
KVM-PR (and further implementations for embedded PPCs, too). KVM-HV is
using the hypervisor hardware feature of the current POWER7 and POWER8
chips, while KVM-PR is using the PRoblem state to emulate a virtual
machine. KVM-PR thus also works on older PPC hardware.
So there are multiple PPC environments where QEMU can run on, and you
must not assume that you always have nodes like "PowerPC,POWER" in the
device tree.
(BTW, you can also build kernels without the /proc/device-tree file
system as far as I know ... so you never should fully rely on that
without a fallback strategy)

 Thomas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  6:58         ` Alexey Kardashevskiy
@ 2015-11-10 18:27           ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-10 18:27 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: stewart, benh, nacc, agraf, qemu-devel, qemu-ppc, paulus, david

Alexey Kardashevskiy [aik@ozlabs.ru] wrote:
| 
| 
| No, you do not try reading "ibm,chip-id" if there is no "xscom@"
| under /proc/device-tree, there is "continue":
| 
| +        if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) {
| +            continue;
| +        }
| 

Ah, yes, Thanks,

Sukadev.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  4:22   ` Sukadev Bhattiprolu
  2015-11-10  9:53     ` Thomas Huth
@ 2015-11-11  0:17     ` David Gibson
  2015-11-11  0:56       ` Nishanth Aravamudan
  2015-11-13 20:21       ` Sukadev Bhattiprolu
  1 sibling, 2 replies; 22+ messages in thread
From: David Gibson @ 2015-11-11  0:17 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus

[-- Attachment #1: Type: text/plain, Size: 4914 bytes --]

On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> David Gibson [david@gibson.dropbear.id.au] wrote:
> | On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
> | > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> | > call in qemu. This call returns the processor module (socket), chip and core
> | > information as specified in section 7.3.16.18 of PAPR v2.7.
> | 
> | PAPR v2.7 isn't available publically.  For upstream patches, please
> | reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).
> 
> Ok.
> 
> | 
> | > We walk the /proc/device-tree to determine the number of chips, cores and
> | > modules in the _host_ system and return this info to the guest application
> | > that makes the rtas_get_sysparm() call.
> | > 
> | > We currently hard code the number of module_types to 1, but we should determine
> | > that dynamically somehow later.
> | > 
> | > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> | > 
> | > Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> | 
> | This isn't ready to go yet - you need to put some more consideration
> | into the uncommon cases: PR KVM, TCG and non-Power hosts.
> 
> Ok. Is there a we can make this code applicable only a Powerpc host?
> (would moving this code to target-ppc/kvm.c do that?)

Yes, moving it to target-ppc/kvm.c would mostly do that.  You'd need
some logic to make sure it fails gracefully in other cases, of course.

[snip]
> | >      switch (parameter) {
> | > +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> | > +        int i;
> | > +        int offset = 0;
> | > +        int size;
> | > +        struct rtas_module_info modinfo;
> | > +
> | > +        if (rtas_get_module_info(&modinfo)) {
> | > +            break;
> | > +        }
> | 
> | So, you handle the variable size of this structure before sending it
> | to the guest, but you don't handle it in allocation of the structure
> | right here.  You'll get away with that because for now you only ever
> | have one entry in the sockets array, but it's a bit icky.
> 
> Can we assume that the size is static for now...
> | 
> | > +
> | > +        size = sizeof(modinfo);
> | > +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
> | 
> | More seriously, this calculation will break horribly if you change the
> | size of the array in struct rtas_module_info.
> 
> and just set 'size' to sizeof(modinfo)?.

For purposes of allocation you could just use a fixed size.  But the
guest might get confused by additional data beyond the declared size,
so you do need to get the value correct that you send back to the guest.

[snip]
> | > +/*
> | > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> | > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> | > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> | > + *
> | > + * A module can contain more than one chip and a chip can contain more
> | > + * than one core. So there are likely to be duplicates in the module
> | > + * and chip identifiers (i.e more than one xscom directory can contain
> | > + * the same module/chip id).
> | > + *
> | > + * Search the xscom directories and count the number of _UNIQUE_ module
> | > + * and chip identifiers in the system.
> | 
> | There's no direct way to go from a core
> | (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
> | module?
> 
> Yes, it would logical to find the chip and module from the core :-)
> 
> While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), 
> the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the
> 'ibm,hw-module-id' will be added in the future?

Hm, I see.  Is there any device node that represents the "chip"?

> I am using the xscom node to be consistent in counting chips and modules.

The trouble with xscom is that it's extremely specific to the way the
current IBM servers present things.  It won't work on other types of
host machine (which could happen with PR KVM), and could even break if
IBM changes the way it organizes the SCOMs in a future machine.

Working from the nodes in /cpus still has some dependencies on IBM
specific properties, but it's at least partially based on OF
standards.

There's also another possible approach here, though I don't know if it
will work.  Instead of looking directly in the device tree, try to get
the information from lscpu, or libosinfo.  That would at least give
you some hope of providing meaningful information on other host types.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-11  0:17     ` David Gibson
@ 2015-11-11  0:56       ` Nishanth Aravamudan
  2015-11-11  1:41         ` David Gibson
  2015-11-13 20:21       ` Sukadev Bhattiprolu
  1 sibling, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2015-11-11  0:56 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > David Gibson [david@gibson.dropbear.id.au] wrote:
> > | On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
> > | > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> > | > call in qemu. This call returns the processor module (socket), chip and core
> > | > information as specified in section 7.3.16.18 of PAPR v2.7.
> > | 
> > | PAPR v2.7 isn't available publically.  For upstream patches, please
> > | reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).
> > 
> > Ok.
> > 
> > | 
> > | > We walk the /proc/device-tree to determine the number of chips, cores and
> > | > modules in the _host_ system and return this info to the guest application
> > | > that makes the rtas_get_sysparm() call.
> > | > 
> > | > We currently hard code the number of module_types to 1, but we should determine
> > | > that dynamically somehow later.
> > | > 
> > | > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> > | > 
> > | > Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> > | 
> > | This isn't ready to go yet - you need to put some more consideration
> > | into the uncommon cases: PR KVM, TCG and non-Power hosts.
> > 
> > Ok. Is there a we can make this code applicable only a Powerpc host?
> > (would moving this code to target-ppc/kvm.c do that?)
> 
> Yes, moving it to target-ppc/kvm.c would mostly do that.  You'd need
> some logic to make sure it fails gracefully in other cases, of course.
> 
> [snip]
> > | >      switch (parameter) {
> > | > +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> > | > +        int i;
> > | > +        int offset = 0;
> > | > +        int size;
> > | > +        struct rtas_module_info modinfo;
> > | > +
> > | > +        if (rtas_get_module_info(&modinfo)) {
> > | > +            break;
> > | > +        }
> > | 
> > | So, you handle the variable size of this structure before sending it
> > | to the guest, but you don't handle it in allocation of the structure
> > | right here.  You'll get away with that because for now you only ever
> > | have one entry in the sockets array, but it's a bit icky.
> > 
> > Can we assume that the size is static for now...
> > | 
> > | > +
> > | > +        size = sizeof(modinfo);
> > | > +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
> > | 
> > | More seriously, this calculation will break horribly if you change the
> > | size of the array in struct rtas_module_info.
> > 
> > and just set 'size' to sizeof(modinfo)?.
> 
> For purposes of allocation you could just use a fixed size.  But the
> guest might get confused by additional data beyond the declared size,
> so you do need to get the value correct that you send back to the guest.
> 
> [snip]
> > | > +/*
> > | > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> > | > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> > | > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> > | > + *
> > | > + * A module can contain more than one chip and a chip can contain more
> > | > + * than one core. So there are likely to be duplicates in the module
> > | > + * and chip identifiers (i.e more than one xscom directory can contain
> > | > + * the same module/chip id).
> > | > + *
> > | > + * Search the xscom directories and count the number of _UNIQUE_ module
> > | > + * and chip identifiers in the system.
> > | 
> > | There's no direct way to go from a core
> > | (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
> > | module?
> > 
> > Yes, it would logical to find the chip and module from the core :-)
> > 
> > While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), 
> > the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the
> > 'ibm,hw-module-id' will be added in the future?
> 
> Hm, I see.  Is there any device node that represents the "chip"?
> 
> > I am using the xscom node to be consistent in counting chips and modules.
> 
> The trouble with xscom is that it's extremely specific to the way the
> current IBM servers present things.  It won't work on other types of
> host machine (which could happen with PR KVM), and could even break if
> IBM changes the way it organizes the SCOMs in a future machine.
> 
> Working from the nodes in /cpus still has some dependencies on IBM
> specific properties, but it's at least partially based on OF
> standards.
> 
> There's also another possible approach here, though I don't know if it
> will work.  Instead of looking directly in the device tree, try to get
> the information from lscpu, or libosinfo.  That would at least give
> you some hope of providing meaningful information on other host types.

Heh, the issue that is underlying all of this, is that `lscpu` itself is
quite wrong.

On PAPR-compliant hypervisors (well, PowerVM, at least), the only
supported means of determining the underlying hardware CPU information
(which is what licensing models want in the end), is to use this RTAS
call in an LPAR. `lscpu` is explicitly incorrect in these environments
(it's values are "derived" from sysfs and some are adjusted to ensure
the division of values works out).

So, we are trying to at least resolve what PowerKVM guest can see by
supporting this RTAS call there. We should report *something* to the
guest, if possible, and we can adjust what is reported to the guests as
we go, from the host perspective.

I haven't followed along too closely in this thread, but woudl it be
reasonable to only report this RTAS call as being supported under KVM?
How are other RTAS calls dealt with for PR and non-IBM models currently?

-Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-11  0:56       ` Nishanth Aravamudan
@ 2015-11-11  1:41         ` David Gibson
  2015-11-11 22:10           ` Nishanth Aravamudan
  0 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2015-11-11  1:41 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

[-- Attachment #1: Type: text/plain, Size: 6876 bytes --]

On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > > David Gibson [david@gibson.dropbear.id.au] wrote:
> > > | On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
> > > | > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
> > > | > call in qemu. This call returns the processor module (socket), chip and core
> > > | > information as specified in section 7.3.16.18 of PAPR v2.7.
> > > | 
> > > | PAPR v2.7 isn't available publically.  For upstream patches, please
> > > | reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).
> > > 
> > > Ok.
> > > 
> > > | 
> > > | > We walk the /proc/device-tree to determine the number of chips, cores and
> > > | > modules in the _host_ system and return this info to the guest application
> > > | > that makes the rtas_get_sysparm() call.
> > > | > 
> > > | > We currently hard code the number of module_types to 1, but we should determine
> > > | > that dynamically somehow later.
> > > | > 
> > > | > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
> > > | > 
> > > | > Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> > > | 
> > > | This isn't ready to go yet - you need to put some more consideration
> > > | into the uncommon cases: PR KVM, TCG and non-Power hosts.
> > > 
> > > Ok. Is there a we can make this code applicable only a Powerpc host?
> > > (would moving this code to target-ppc/kvm.c do that?)
> > 
> > Yes, moving it to target-ppc/kvm.c would mostly do that.  You'd need
> > some logic to make sure it fails gracefully in other cases, of course.
> > 
> > [snip]
> > > | >      switch (parameter) {
> > > | > +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
> > > | > +        int i;
> > > | > +        int offset = 0;
> > > | > +        int size;
> > > | > +        struct rtas_module_info modinfo;
> > > | > +
> > > | > +        if (rtas_get_module_info(&modinfo)) {
> > > | > +            break;
> > > | > +        }
> > > | 
> > > | So, you handle the variable size of this structure before sending it
> > > | to the guest, but you don't handle it in allocation of the structure
> > > | right here.  You'll get away with that because for now you only ever
> > > | have one entry in the sockets array, but it's a bit icky.
> > > 
> > > Can we assume that the size is static for now...
> > > | 
> > > | > +
> > > | > +        size = sizeof(modinfo);
> > > | > +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
> > > | 
> > > | More seriously, this calculation will break horribly if you change the
> > > | size of the array in struct rtas_module_info.
> > > 
> > > and just set 'size' to sizeof(modinfo)?.
> > 
> > For purposes of allocation you could just use a fixed size.  But the
> > guest might get confused by additional data beyond the declared size,
> > so you do need to get the value correct that you send back to the guest.
> > 
> > [snip]
> > > | > +/*
> > > | > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
> > > | > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
> > > | > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
> > > | > + *
> > > | > + * A module can contain more than one chip and a chip can contain more
> > > | > + * than one core. So there are likely to be duplicates in the module
> > > | > + * and chip identifiers (i.e more than one xscom directory can contain
> > > | > + * the same module/chip id).
> > > | > + *
> > > | > + * Search the xscom directories and count the number of _UNIQUE_ module
> > > | > + * and chip identifiers in the system.
> > > | 
> > > | There's no direct way to go from a core
> > > | (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
> > > | module?
> > > 
> > > Yes, it would logical to find the chip and module from the core :-)
> > > 
> > > While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), 
> > > the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the
> > > 'ibm,hw-module-id' will be added in the future?
> > 
> > Hm, I see.  Is there any device node that represents the "chip"?
> > 
> > > I am using the xscom node to be consistent in counting chips and modules.
> > 
> > The trouble with xscom is that it's extremely specific to the way the
> > current IBM servers present things.  It won't work on other types of
> > host machine (which could happen with PR KVM), and could even break if
> > IBM changes the way it organizes the SCOMs in a future machine.
> > 
> > Working from the nodes in /cpus still has some dependencies on IBM
> > specific properties, but it's at least partially based on OF
> > standards.
> > 
> > There's also another possible approach here, though I don't know if it
> > will work.  Instead of looking directly in the device tree, try to get
> > the information from lscpu, or libosinfo.  That would at least give
> > you some hope of providing meaningful information on other host types.
> 
> Heh, the issue that is underlying all of this, is that `lscpu` itself is
> quite wrong.
> 
> On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> supported means of determining the underlying hardware CPU information
> (which is what licensing models want in the end), is to use this RTAS
> call in an LPAR. `lscpu` is explicitly incorrect in these environments
> (it's values are "derived" from sysfs and some are adjusted to ensure
> the division of values works out).

So.. I'm not sure if you're just saying that lscpu is wrong because it
gives the guest information, or because of other problems.

What I was suggesting is implementing the RTAS call so that it
effectively lets the guest get lscpu information from the host.

> So, we are trying to at least resolve what PowerKVM guest can see by
> supporting this RTAS call there. We should report *something* to the
> guest, if possible, and we can adjust what is reported to the guests as
> we go, from the host perspective.
> 
> I haven't followed along too closely in this thread, but woudl it be
> reasonable to only report this RTAS call as being supported under
> KVM?

Possibly, yes.

> How are other RTAS calls dealt with for PR and non-IBM models
> currently?

Most of them still make sense in PR or TCG.  A few do look in the host
device tree, in which case they're likely to fail on non-KVM.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-11  1:41         ` David Gibson
@ 2015-11-11 22:10           ` Nishanth Aravamudan
  2015-11-12  4:47             ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2015-11-11 22:10 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:

<snip>

> > > The trouble with xscom is that it's extremely specific to the way the
> > > current IBM servers present things.  It won't work on other types of
> > > host machine (which could happen with PR KVM), and could even break if
> > > IBM changes the way it organizes the SCOMs in a future machine.
> > > 
> > > Working from the nodes in /cpus still has some dependencies on IBM
> > > specific properties, but it's at least partially based on OF
> > > standards.
> > > 
> > > There's also another possible approach here, though I don't know if it
> > > will work.  Instead of looking directly in the device tree, try to get
> > > the information from lscpu, or libosinfo.  That would at least give
> > > you some hope of providing meaningful information on other host types.
> > 
> > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > quite wrong.
> > 
> > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > supported means of determining the underlying hardware CPU information
> > (which is what licensing models want in the end), is to use this RTAS
> > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > (it's values are "derived" from sysfs and some are adjusted to ensure
> > the division of values works out).
> 
> So.. I'm not sure if you're just saying that lscpu is wrong because it
> gives the guest information, or because of other problems.

`lscpu`'s man-page specifically says that on virtualized platforms, the
output may be inaccurate. And, in fact, on Power, in a KVM guest (and
in a LPAR), `lscpu` is outputting the guest CPU information, which is
completely fake. This is true on x86 KVM guests too, afaict.

*If* we have a valid RTAS implementation on PowerKVM (or under qemu
generally), I think we can modify `lscpu` to do the right thing in at
least those two environments.

> What I was suggesting is implementing the RTAS call so that it
> effectively lets the guest get lscpu information from the host.

A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
is also wrong :)

> > So, we are trying to at least resolve what PowerKVM guest can see by
> > supporting this RTAS call there. We should report *something* to the
> > guest, if possible, and we can adjust what is reported to the guests as
> > we go, from the host perspective.
> > 
> > I haven't followed along too closely in this thread, but woudl it be
> > reasonable to only report this RTAS call as being supported under
> > KVM?
> 
> Possibly, yes.

At least, as a first step, I guess.

> > How are other RTAS calls dealt with for PR and non-IBM models
> > currently?
> 
> Most of them still make sense in PR or TCG.  A few do look in the host
> device tree, in which case they're likely to fail on non-KVM.

Got it, thanks.

So my investigation overall led me to this set of conclusions:

1) Under PowerVM, we do not use this RTAS call, which is the only (as
asserted by pHyp developers) valid way to get hardware information about
the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
information -- where cores are as defined by sharing of the L2-cache.

2) Under PowerKVM, we do not use this RTAS call, because it's not
supported, and just spit out whatever the qemu topology is (which has no
connection to the host (physical) CPU information).

 --> so if we implement the RTAS call of some sort under PowerKVM, then
we can update `lscpu` to use that RTAS call.

3) Under PowerNV, there is a dependency on the hack that is ibm,chip-id
from OPAL, which leads to twice as many sockets potentially being
reported. `lscpu` also uses the sysfs files directly, which may or may
not be the physical topology (I'm still tracking all of this down). 

*Also* `lscpu` has no knowledge of offline/online CPUs, so as you
online/offline CPUs, the output of `lscpu` starts to change.

I think what we eventually want to do is add some fields to `lscpu` to
indicate the "physical" data vs. the "virtual" data.

-Nish

-Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-11 22:10           ` Nishanth Aravamudan
@ 2015-11-12  4:47             ` David Gibson
  2015-11-12 16:46               ` Nishanth Aravamudan
  0 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2015-11-12  4:47 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

[-- Attachment #1: Type: text/plain, Size: 5209 bytes --]

On Wed, Nov 11, 2015 at 02:10:48PM -0800, Nishanth Aravamudan wrote:
> On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> > On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> 
> <snip>
> 
> > > > The trouble with xscom is that it's extremely specific to the way the
> > > > current IBM servers present things.  It won't work on other types of
> > > > host machine (which could happen with PR KVM), and could even break if
> > > > IBM changes the way it organizes the SCOMs in a future machine.
> > > > 
> > > > Working from the nodes in /cpus still has some dependencies on IBM
> > > > specific properties, but it's at least partially based on OF
> > > > standards.
> > > > 
> > > > There's also another possible approach here, though I don't know if it
> > > > will work.  Instead of looking directly in the device tree, try to get
> > > > the information from lscpu, or libosinfo.  That would at least give
> > > > you some hope of providing meaningful information on other host types.
> > > 
> > > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > > quite wrong.
> > > 
> > > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > > supported means of determining the underlying hardware CPU information
> > > (which is what licensing models want in the end), is to use this RTAS
> > > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > > (it's values are "derived" from sysfs and some are adjusted to ensure
> > > the division of values works out).
> > 
> > So.. I'm not sure if you're just saying that lscpu is wrong because it
> > gives the guest information, or because of other problems.
> 
> `lscpu`'s man-page specifically says that on virtualized platforms, the
> output may be inaccurate. And, in fact, on Power, in a KVM guest (and
> in a LPAR), `lscpu` is outputting the guest CPU information, which is
> completely fake. This is true on x86 KVM guests too, afaict.

Um.. yes, I was assuming lscpu reporting information about virtual
cpus and sockets was intended and correct behaviour.

> *If* we have a valid RTAS implementation on PowerKVM (or under qemu
> generally), I think we can modify `lscpu` to do the right thing in at
> least those two environments.
> 
> > What I was suggesting is implementing the RTAS call so that it
> > effectively lets the guest get lscpu information from the host.
> 
> A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
> is also wrong :)

Ok.. why is it wrong in PowerNV?  This sounds like something you'd
want to fix anyway.

> > > So, we are trying to at least resolve what PowerKVM guest can see by
> > > supporting this RTAS call there. We should report *something* to the
> > > guest, if possible, and we can adjust what is reported to the guests as
> > > we go, from the host perspective.
> > > 
> > > I haven't followed along too closely in this thread, but woudl it be
> > > reasonable to only report this RTAS call as being supported under
> > > KVM?
> > 
> > Possibly, yes.
> 
> At least, as a first step, I guess.
> 
> > > How are other RTAS calls dealt with for PR and non-IBM models
> > > currently?
> > 
> > Most of them still make sense in PR or TCG.  A few do look in the host
> > device tree, in which case they're likely to fail on non-KVM.
> 
> Got it, thanks.
> 
> So my investigation overall led me to this set of conclusions:
> 
> 1) Under PowerVM, we do not use this RTAS call, which is the only (as
> asserted by pHyp developers) valid way to get hardware information about
> the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
> information -- where cores are as defined by sharing of the L2-cache.
> 
> 2) Under PowerKVM, we do not use this RTAS call, because it's not
> supported, and just spit out whatever the qemu topology is (which has no
> connection to the host (physical) CPU information).

Right.. so does that mean nothing is using this call yet?

>  --> so if we implement the RTAS call of some sort under PowerKVM, then
> we can update `lscpu` to use that RTAS call.

Yeah, I'm not convinced that's correct.  Shouldn't lscpu return the
virtual cpu information, at least by default.

> 3) Under PowerNV, there is a dependency on the hack that is ibm,chip-id
> from OPAL, which leads to twice as many sockets potentially being
> reported. `lscpu` also uses the sysfs files directly, which may or may
> not be the physical topology (I'm still tracking all of this down). 
> 
> *Also* `lscpu` has no knowledge of offline/online CPUs, so as you
> online/offline CPUs, the output of `lscpu` starts to change.

Ah, true.

> I think what we eventually want to do is add some fields to `lscpu` to
> indicate the "physical" data vs. the "virtual" data.

Ok.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-12  4:47             ` David Gibson
@ 2015-11-12 16:46               ` Nishanth Aravamudan
  2015-12-01  3:41                 ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2015-11-12 16:46 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

On 12.11.2015 [15:47:15 +1100], David Gibson wrote:
> On Wed, Nov 11, 2015 at 02:10:48PM -0800, Nishanth Aravamudan wrote:
> > On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> > > On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > > > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > 
> > <snip>
> > 
> > > > > The trouble with xscom is that it's extremely specific to the way the
> > > > > current IBM servers present things.  It won't work on other types of
> > > > > host machine (which could happen with PR KVM), and could even break if
> > > > > IBM changes the way it organizes the SCOMs in a future machine.
> > > > > 
> > > > > Working from the nodes in /cpus still has some dependencies on IBM
> > > > > specific properties, but it's at least partially based on OF
> > > > > standards.
> > > > > 
> > > > > There's also another possible approach here, though I don't know if it
> > > > > will work.  Instead of looking directly in the device tree, try to get
> > > > > the information from lscpu, or libosinfo.  That would at least give
> > > > > you some hope of providing meaningful information on other host types.
> > > > 
> > > > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > > > quite wrong.
> > > > 
> > > > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > > > supported means of determining the underlying hardware CPU information
> > > > (which is what licensing models want in the end), is to use this RTAS
> > > > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > > > (it's values are "derived" from sysfs and some are adjusted to ensure
> > > > the division of values works out).
> > > 
> > > So.. I'm not sure if you're just saying that lscpu is wrong because it
> > > gives the guest information, or because of other problems.
> > 
> > `lscpu`'s man-page specifically says that on virtualized platforms, the
> > output may be inaccurate. And, in fact, on Power, in a KVM guest (and
> > in a LPAR), `lscpu` is outputting the guest CPU information, which is
> > completely fake. This is true on x86 KVM guests too, afaict.
> 
> Um.. yes, I was assuming lscpu reporting information about virtual
> cpus and sockets was intended and correct behaviour.

"lscpu - display information about the CPU architecture"

but at the same time "lscpu   gathers   CPU   architecture   information
from   sysfs   and /proc/cpuinfo" which is explicitly logical (or
virtual).

but at the same time "There is also information about the CPU caches and
cache sharing, family, model, bogoMIPS, byte order, and stepping." which
seems rather physical to me.

So perhaps, as I kind of stumbled upon myself in my last reply, we
should explicitly indicate the physical vs. virtual information.

I will raise this with the lscpu maintainer.

> > *If* we have a valid RTAS implementation on PowerKVM (or under qemu
> > generally), I think we can modify `lscpu` to do the right thing in at
> > least those two environments.
> > 
> > > What I was suggesting is implementing the RTAS call so that it
> > > effectively lets the guest get lscpu information from the host.
> > 
> > A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
> > is also wrong :)
> 
> Ok.. why is it wrong in PowerNV?  This sounds like something you'd
> want to fix anyway.

Yes, I never said we wouldn't? It's wrong on PowerNV because chips are
being counted as sockets, i.e. a 2 DCM system is being counted as a 4
socket system, rather than a 2 socket system.

> > > > So, we are trying to at least resolve what PowerKVM guest can see by
> > > > supporting this RTAS call there. We should report *something* to the
> > > > guest, if possible, and we can adjust what is reported to the guests as
> > > > we go, from the host perspective.
> > > > 
> > > > I haven't followed along too closely in this thread, but woudl it be
> > > > reasonable to only report this RTAS call as being supported under
> > > > KVM?
> > > 
> > > Possibly, yes.
> > 
> > At least, as a first step, I guess.
> > 
> > > > How are other RTAS calls dealt with for PR and non-IBM models
> > > > currently?
> > > 
> > > Most of them still make sense in PR or TCG.  A few do look in the host
> > > device tree, in which case they're likely to fail on non-KVM.
> > 
> > Got it, thanks.
> > 
> > So my investigation overall led me to this set of conclusions:
> > 
> > 1) Under PowerVM, we do not use this RTAS call, which is the only (as
> > asserted by pHyp developers) valid way to get hardware information about
> > the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
> > information -- where cores are as defined by sharing of the L2-cache.
> > 
> > 2) Under PowerKVM, we do not use this RTAS call, because it's not
> > supported, and just spit out whatever the qemu topology is (which has no
> > connection to the host (physical) CPU information).
> 
> Right.. so does that mean nothing is using this call yet?

Correct.

> >  --> so if we implement the RTAS call of some sort under PowerKVM, then
> > we can update `lscpu` to use that RTAS call.
> 
> Yeah, I'm not convinced that's correct.  Shouldn't lscpu return the
> virtual cpu information, at least by default.

I think it should return both. *cough* this is a request from your
employer, actually *cough* :) For billing purposes, physical topology is
apparently relevant, not virtual (which makes sense, I can make a KVM
guest with 100 sockets, but I definitely shouldn't be billed for 100
sockets worth of RH seats, if the physical system only has 2 sockets).

> > 3) Under PowerNV, there is a dependency on the hack that is ibm,chip-id
> > from OPAL, which leads to twice as many sockets potentially being
> > reported. `lscpu` also uses the sysfs files directly, which may or may
> > not be the physical topology (I'm still tracking all of this down). 
> > 
> > *Also* `lscpu` has no knowledge of offline/online CPUs, so as you
> > online/offline CPUs, the output of `lscpu` starts to change.
> 
> Ah, true.

Yeah, I'm still trying tofigure out the nuances of this out.

-Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-11  0:17     ` David Gibson
  2015-11-11  0:56       ` Nishanth Aravamudan
@ 2015-11-13 20:21       ` Sukadev Bhattiprolu
  1 sibling, 0 replies; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-13 20:21 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus

David Gibson [david@gibson.dropbear.id.au] wrote:
| On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
| > David Gibson [david@gibson.dropbear.id.au] wrote:
| > | On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote:
| > | > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm()
| > | > call in qemu. This call returns the processor module (socket), chip and core
| > | > information as specified in section 7.3.16.18 of PAPR v2.7.
| > | 
| > | PAPR v2.7 isn't available publically.  For upstream patches, please
| > | reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).
| > 
| > Ok.
| > 
| > | 
| > | > We walk the /proc/device-tree to determine the number of chips, cores and
| > | > modules in the _host_ system and return this info to the guest application
| > | > that makes the rtas_get_sysparm() call.
| > | > 
| > | > We currently hard code the number of module_types to 1, but we should determine
| > | > that dynamically somehow later.
| > | > 
| > | > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk.
| > | > 
| > | > Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
| > | 
| > | This isn't ready to go yet - you need to put some more consideration
| > | into the uncommon cases: PR KVM, TCG and non-Power hosts.
| > 
| > Ok. Is there a we can make this code applicable only a Powerpc host?
| > (would moving this code to target-ppc/kvm.c do that?)
| 
| Yes, moving it to target-ppc/kvm.c would mostly do that.  You'd need
| some logic to make sure it fails gracefully in other cases, of course.

Ok. I have added a stub under #ifndef KVM in target-ppc/kvm_ppc.h.

| 
| [snip]
| > | >      switch (parameter) {
| > | > +    case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: {
| > | > +        int i;
| > | > +        int offset = 0;
| > | > +        int size;
| > | > +        struct rtas_module_info modinfo;
| > | > +
| > | > +        if (rtas_get_module_info(&modinfo)) {
| > | > +            break;
| > | > +        }
| > | 
| > | So, you handle the variable size of this structure before sending it
| > | to the guest, but you don't handle it in allocation of the structure
| > | right here.  You'll get away with that because for now you only ever
| > | have one entry in the sockets array, but it's a bit icky.
| > 
| > Can we assume that the size is static for now...
| > | 
| > | > +
| > | > +        size = sizeof(modinfo);
| > | > +        size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info);
| > | 
| > | More seriously, this calculation will break horribly if you change the
| > | size of the array in struct rtas_module_info.
| > 
| > and just set 'size' to sizeof(modinfo)?.
| 
| For purposes of allocation you could just use a fixed size.  But the
| guest might get confused by additional data beyond the declared size,
| so you do need to get the value correct that you send back to the guest.

If we use a fixed size and memset the buffer to 0 and then set the
values would that be sufficient?

| 
| [snip]
| > | > +/*
| > | > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id'
| > | > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each
| > | > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory.
| > | > + *
| > | > + * A module can contain more than one chip and a chip can contain more
| > | > + * than one core. So there are likely to be duplicates in the module
| > | > + * and chip identifiers (i.e more than one xscom directory can contain
| > | > + * the same module/chip id).
| > | > + *
| > | > + * Search the xscom directories and count the number of _UNIQUE_ module
| > | > + * and chip identifiers in the system.
| > | 
| > | There's no direct way to go from a core
| > | (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or
| > | module?
| > 
| > Yes, it would logical to find the chip and module from the core :-)
| > 
| > While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), 
| > the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the
| > 'ibm,hw-module-id' will be added in the future?
| 
| Hm, I see.  Is there any device node that represents the "chip"?

AFAICT, in the current skiboot code, xscom represents a chip.

| 
| > I am using the xscom node to be consistent in counting chips and modules.
| 
| The trouble with xscom is that it's extremely specific to the way the
| current IBM servers present things.  It won't work on other types of
| host machine (which could happen with PR KVM), and could even break if
| IBM changes the way it organizes the SCOMs in a future machine.

Not sure yet how to future proof this at this point. If the SCOMs change
or are removed, we will end up returning zeroes (and break the licensing
again)

| 
| Working from the nodes in /cpus still has some dependencies on IBM
| specific properties, but it's at least partially based on OF
| standards.

yes, I will check with skiboot folks to see if they can add the
hw-module-id for each core. 

Sukadev

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-10  9:53     ` Thomas Huth
@ 2015-11-13 20:29       ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 22+ messages in thread
From: Sukadev Bhattiprolu @ 2015-11-13 20:29 UTC (permalink / raw)
  To: Thomas Huth
  Cc: stewart, benh, aik, nacc, agraf, qemu-devel, qemu-ppc, paulus,
	David Gibson

Thomas Huth [thuth@redhat.com] wrote:
| On 10/11/15 05:22, Sukadev Bhattiprolu wrote:
| [...]
| > | > +static int file_read_buf(char *file_name, char *buf, int len)
| > | > +{
| > | > +    int rc;
| > | > +    FILE *fp;
| > | > +
| > | > +    fp = fopen(file_name, "r");
| > | > +    if (!fp) {
| > | > +        error_report("%s: Error opening %s\n", __func__, file_name);
| > | > +        return -1;
| > | > +    }
| > | > +
| > | > +    rc = fread(buf, 1, len, fp);
| > | > +    fclose(fp);
| > | > +
| > | > +    if (rc != len) {
| > | > +        return -1;
| > | > +    }
| > | > +
| > | > +    return 0;
| > | > +}
| 
| Could you maybe use g_file_get_contents() instead?

I guess I could, but since we are moving the new code to target-ppc/kvm.c,
I found some existing code in kvmppc_read_int_cpu_dt() that I could
reuse. Will post the patch later today, appreciate if you could double
check.

| 
| > | > +/*
| > | > + * Each core in the system is represented by a directory with the
| > | > + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/.
| > | > + * Process that directory and count the number of cores in the system.
| > | 
| > | True on IBM POWER systems, but not necessarily everywhere - e.g. PR
| > | KVM on an embedded PowerPC host.
| > 
| > What is PR KVM?
| 
| On PPC, there are multiple kinds of KVM kernel modules, e.g. KVM-HV and
| KVM-PR (and further implementations for embedded PPCs, too). KVM-HV is
| using the hypervisor hardware feature of the current POWER7 and POWER8
| chips, while KVM-PR is using the PRoblem state to emulate a virtual
| machine. KVM-PR thus also works on older PPC hardware.

Ok. Thanks for the detailed info.

| So there are multiple PPC environments where QEMU can run on, and you
| must not assume that you always have nodes like "PowerPC,POWER" in the
| device tree.
| (BTW, you can also build kernels without the /proc/device-tree file
| system as far as I know ... so you never should fully rely on that
| without a fallback strategy)

Yes, I am now adding a stub for the #ifndef KVM in target-pcc/kvm_ppc.h.
I am checking for zeros in chips, modules and cores but the new RTAS
parameter will likely return zeroes on other targets for now - until we
can figure out how PR KVM or the embedded PPCs can determine this RTAS
info.

|  Thomas
| 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-11-12 16:46               ` Nishanth Aravamudan
@ 2015-12-01  3:41                 ` David Gibson
  2015-12-05  1:04                   ` Nishanth Aravamudan
  0 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2015-12-01  3:41 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

[-- Attachment #1: Type: text/plain, Size: 7906 bytes --]

On Thu, Nov 12, 2015 at 08:46:27AM -0800, Nishanth Aravamudan wrote:
> On 12.11.2015 [15:47:15 +1100], David Gibson wrote:
> > On Wed, Nov 11, 2015 at 02:10:48PM -0800, Nishanth Aravamudan wrote:
> > > On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> > > > On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > > > > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > > > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > > 
> > > <snip>
> > > 
> > > > > > The trouble with xscom is that it's extremely specific to the way the
> > > > > > current IBM servers present things.  It won't work on other types of
> > > > > > host machine (which could happen with PR KVM), and could even break if
> > > > > > IBM changes the way it organizes the SCOMs in a future machine.
> > > > > > 
> > > > > > Working from the nodes in /cpus still has some dependencies on IBM
> > > > > > specific properties, but it's at least partially based on OF
> > > > > > standards.
> > > > > > 
> > > > > > There's also another possible approach here, though I don't know if it
> > > > > > will work.  Instead of looking directly in the device tree, try to get
> > > > > > the information from lscpu, or libosinfo.  That would at least give
> > > > > > you some hope of providing meaningful information on other host types.
> > > > > 
> > > > > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > > > > quite wrong.
> > > > > 
> > > > > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > > > > supported means of determining the underlying hardware CPU information
> > > > > (which is what licensing models want in the end), is to use this RTAS
> > > > > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > > > > (it's values are "derived" from sysfs and some are adjusted to ensure
> > > > > the division of values works out).
> > > > 
> > > > So.. I'm not sure if you're just saying that lscpu is wrong because it
> > > > gives the guest information, or because of other problems.
> > > 
> > > `lscpu`'s man-page specifically says that on virtualized platforms, the
> > > output may be inaccurate. And, in fact, on Power, in a KVM guest (and
> > > in a LPAR), `lscpu` is outputting the guest CPU information, which is
> > > completely fake. This is true on x86 KVM guests too, afaict.
> > 
> > Um.. yes, I was assuming lscpu reporting information about virtual
> > cpus and sockets was intended and correct behaviour.
> 
> "lscpu - display information about the CPU architecture"

Right, without qualification I'd take that as virtual architecture.

> but at the same time "lscpu   gathers   CPU   architecture   information
> from   sysfs   and /proc/cpuinfo" which is explicitly logical (or
> virtual).
> 
> but at the same time "There is also information about the CPU caches and
> cache sharing, family, model, bogoMIPS, byte order, and stepping." which
> seems rather physical to me.

bogomips and byte order are absolutely properties of a virtual cpu.
As are family and model, really, since they're generally at least
partially visible to a guest, and there may be some capacity for
faking them (x86 is more flexible in this regard than Power).
Stepping might be depending on exactly what level the system is
virtualized at (it's not for the case of PAPR).

Cache info is probably purely physical but amongst everything else
that's a property of the virtual cpu, I don't think that's an argument
that lscpu should return host cpu information in general.


> So perhaps, as I kind of stumbled upon myself in my last reply, we
> should explicitly indicate the physical vs. virtual information.
> 
> I will raise this with the lscpu maintainer.
> 
> > > *If* we have a valid RTAS implementation on PowerKVM (or under qemu
> > > generally), I think we can modify `lscpu` to do the right thing in at
> > > least those two environments.
> > > 
> > > > What I was suggesting is implementing the RTAS call so that it
> > > > effectively lets the guest get lscpu information from the host.
> > > 
> > > A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
> > > is also wrong :)
> > 
> > Ok.. why is it wrong in PowerNV?  This sounds like something you'd
> > want to fix anyway.
> 
> Yes, I never said we wouldn't? It's wrong on PowerNV because chips are
> being counted as sockets, i.e. a 2 DCM system is being counted as a 4
> socket system, rather than a 2 socket system.

Well, sure, but the fact that the tool for the job has a bug doesn't
seem like a great reason to re-implement that tool directly in qemu.

I don't see any chicken and egg problem here the *powernv* lscpu has
no dependency on the cpu hypercall information.

> > > > > So, we are trying to at least resolve what PowerKVM guest can see by
> > > > > supporting this RTAS call there. We should report *something* to the
> > > > > guest, if possible, and we can adjust what is reported to the guests as
> > > > > we go, from the host perspective.
> > > > > 
> > > > > I haven't followed along too closely in this thread, but woudl it be
> > > > > reasonable to only report this RTAS call as being supported under
> > > > > KVM?
> > > > 
> > > > Possibly, yes.
> > > 
> > > At least, as a first step, I guess.
> > > 
> > > > > How are other RTAS calls dealt with for PR and non-IBM models
> > > > > currently?
> > > > 
> > > > Most of them still make sense in PR or TCG.  A few do look in the host
> > > > device tree, in which case they're likely to fail on non-KVM.
> > > 
> > > Got it, thanks.
> > > 
> > > So my investigation overall led me to this set of conclusions:
> > > 
> > > 1) Under PowerVM, we do not use this RTAS call, which is the only (as
> > > asserted by pHyp developers) valid way to get hardware information about
> > > the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
> > > information -- where cores are as defined by sharing of the L2-cache.
> > > 
> > > 2) Under PowerKVM, we do not use this RTAS call, because it's not
> > > supported, and just spit out whatever the qemu topology is (which has no
> > > connection to the host (physical) CPU information).
> > 
> > Right.. so does that mean nothing is using this call yet?
> 
> Correct.
> 
> > >  --> so if we implement the RTAS call of some sort under PowerKVM, then
> > > we can update `lscpu` to use that RTAS call.
> > 
> > Yeah, I'm not convinced that's correct.  Shouldn't lscpu return the
> > virtual cpu information, at least by default.
> 
> I think it should return both. *cough* this is a request from your
> employer, actually *cough* :) For billing purposes, physical topology is
> apparently relevant, not virtual (which makes sense, I can make a KVM
> guest with 100 sockets, but I definitely shouldn't be billed for 100
> sockets worth of RH seats, if the physical system only has 2 sockets).

Well, ok.  Do you have any contact information so I can find out
internally what it is they actually need?

> > > 3) Under PowerNV, there is a dependency on the hack that is ibm,chip-id
> > > from OPAL, which leads to twice as many sockets potentially being
> > > reported. `lscpu` also uses the sysfs files directly, which may or may
> > > not be the physical topology (I'm still tracking all of this down). 
> > > 
> > > *Also* `lscpu` has no knowledge of offline/online CPUs, so as you
> > > online/offline CPUs, the output of `lscpu` starts to change.
> > 
> > Ah, true.
> 
> Yeah, I'm still trying tofigure out the nuances of this out.
> 
> -Nish
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-12-01  3:41                 ` David Gibson
@ 2015-12-05  1:04                   ` Nishanth Aravamudan
  2015-12-10  3:55                     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Nishanth Aravamudan @ 2015-12-05  1:04 UTC (permalink / raw)
  To: David Gibson
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

On 01.12.2015 [14:41:25 +1100], David Gibson wrote:
> On Thu, Nov 12, 2015 at 08:46:27AM -0800, Nishanth Aravamudan wrote:
> > On 12.11.2015 [15:47:15 +1100], David Gibson wrote:
> > > On Wed, Nov 11, 2015 at 02:10:48PM -0800, Nishanth Aravamudan wrote:
> > > > On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> > > > > On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > > > > > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > > > > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > > > 
> > > > <snip>
> > > > 
> > > > > > > The trouble with xscom is that it's extremely specific to the way the
> > > > > > > current IBM servers present things.  It won't work on other types of
> > > > > > > host machine (which could happen with PR KVM), and could even break if
> > > > > > > IBM changes the way it organizes the SCOMs in a future machine.
> > > > > > > 
> > > > > > > Working from the nodes in /cpus still has some dependencies on IBM
> > > > > > > specific properties, but it's at least partially based on OF
> > > > > > > standards.
> > > > > > > 
> > > > > > > There's also another possible approach here, though I don't know if it
> > > > > > > will work.  Instead of looking directly in the device tree, try to get
> > > > > > > the information from lscpu, or libosinfo.  That would at least give
> > > > > > > you some hope of providing meaningful information on other host types.
> > > > > > 
> > > > > > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > > > > > quite wrong.
> > > > > > 
> > > > > > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > > > > > supported means of determining the underlying hardware CPU information
> > > > > > (which is what licensing models want in the end), is to use this RTAS
> > > > > > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > > > > > (it's values are "derived" from sysfs and some are adjusted to ensure
> > > > > > the division of values works out).
> > > > > 
> > > > > So.. I'm not sure if you're just saying that lscpu is wrong because it
> > > > > gives the guest information, or because of other problems.
> > > > 
> > > > `lscpu`'s man-page specifically says that on virtualized platforms, the
> > > > output may be inaccurate. And, in fact, on Power, in a KVM guest (and
> > > > in a LPAR), `lscpu` is outputting the guest CPU information, which is
> > > > completely fake. This is true on x86 KVM guests too, afaict.
> > > 
> > > Um.. yes, I was assuming lscpu reporting information about virtual
> > > cpus and sockets was intended and correct behaviour.
> > 
> > "lscpu - display information about the CPU architecture"
> 
> Right, without qualification I'd take that as virtual architecture.

Ok, I did not. I suppose the manpage could be reviewed and/or updated.

Regardless, Suka has gotten a patch merged which at least for power, can
display the physical information via `lscpu`, when the RTAS call is
available.

> > but at the same time "lscpu   gathers   CPU   architecture   information
> > from   sysfs   and /proc/cpuinfo" which is explicitly logical (or
> > virtual).
> > 
> > but at the same time "There is also information about the CPU caches and
> > cache sharing, family, model, bogoMIPS, byte order, and stepping." which
> > seems rather physical to me.
> 
> bogomips and byte order are absolutely properties of a virtual cpu.

I think the distinction to be made is they are *also* properties of a
virtual CPU. That is, they are properties of both,and where they vary,
it might be relevant to know both cases.

> As are family and model, really, since they're generally at least
> partially visible to a guest, and there may be some capacity for
> faking them (x86 is more flexible in this regard than Power).
> Stepping might be depending on exactly what level the system is
> virtualized at (it's not for the case of PAPR).
> 
> Cache info is probably purely physical but amongst everything else
> that's a property of the virtual cpu, I don't think that's an argument
> that lscpu should return host cpu information in general.
> 
> 
> > So perhaps, as I kind of stumbled upon myself in my last reply, we
> > should explicitly indicate the physical vs. virtual information.
> > 
> > I will raise this with the lscpu maintainer.
> > 
> > > > *If* we have a valid RTAS implementation on PowerKVM (or under qemu
> > > > generally), I think we can modify `lscpu` to do the right thing in at
> > > > least those two environments.
> > > > 
> > > > > What I was suggesting is implementing the RTAS call so that it
> > > > > effectively lets the guest get lscpu information from the host.
> > > > 
> > > > A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
> > > > is also wrong :)
> > > 
> > > Ok.. why is it wrong in PowerNV?  This sounds like something you'd
> > > want to fix anyway.
> > 
> > Yes, I never said we wouldn't? It's wrong on PowerNV because chips are
> > being counted as sockets, i.e. a 2 DCM system is being counted as a 4
> > socket system, rather than a 2 socket system.
> 
> Well, sure, but the fact that the tool for the job has a bug doesn't
> seem like a great reason to re-implement that tool directly in qemu.
> 
> I don't see any chicken and egg problem here the *powernv* lscpu has
> no dependency on the cpu hypercall information.

Maybe I'm mistaken, but I think we're simply talking at odds.

There are three cases to consider, which I feel like I've stated before:

1) PowerVM provides this information (# of physical socketc, etc) via an
RTAS call.

2) PowerKVM does not currently provide it at all.

3) PowerNV provides it via `lscpu` and outputs incorrect information.

Suka is fixing 2) by adding the RTAS call to qemu. What the RTAS call
uses to populate the data seems to be the root of the question. I guess
you are suggesting use `lscpu` output from the PowerNV environment, even
if it's wrong, and once it's right, it's all done for PowerKVM.

Suka is fixing 3) as well, via patches to lscpu, but as you've pointed
out separately even in the qemu-side, it's not quite clear how to
proceed in the best generic way.

> > > > > > So, we are trying to at least resolve what PowerKVM guest can see by
> > > > > > supporting this RTAS call there. We should report *something* to the
> > > > > > guest, if possible, and we can adjust what is reported to the guests as
> > > > > > we go, from the host perspective.
> > > > > > 
> > > > > > I haven't followed along too closely in this thread, but woudl it be
> > > > > > reasonable to only report this RTAS call as being supported under
> > > > > > KVM?
> > > > > 
> > > > > Possibly, yes.
> > > > 
> > > > At least, as a first step, I guess.
> > > > 
> > > > > > How are other RTAS calls dealt with for PR and non-IBM models
> > > > > > currently?
> > > > > 
> > > > > Most of them still make sense in PR or TCG.  A few do look in the host
> > > > > device tree, in which case they're likely to fail on non-KVM.
> > > > 
> > > > Got it, thanks.
> > > > 
> > > > So my investigation overall led me to this set of conclusions:
> > > > 
> > > > 1) Under PowerVM, we do not use this RTAS call, which is the only (as
> > > > asserted by pHyp developers) valid way to get hardware information about
> > > > the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
> > > > information -- where cores are as defined by sharing of the L2-cache.
> > > > 
> > > > 2) Under PowerKVM, we do not use this RTAS call, because it's not
> > > > supported, and just spit out whatever the qemu topology is (which has no
> > > > connection to the host (physical) CPU information).
> > > 
> > > Right.. so does that mean nothing is using this call yet?
> > 
> > Correct.
> > 
> > > >  --> so if we implement the RTAS call of some sort under PowerKVM, then
> > > > we can update `lscpu` to use that RTAS call.
> > > 
> > > Yeah, I'm not convinced that's correct.  Shouldn't lscpu return the
> > > virtual cpu information, at least by default.
> > 
> > I think it should return both. *cough* this is a request from your
> > employer, actually *cough* :) For billing purposes, physical topology is
> > apparently relevant, not virtual (which makes sense, I can make a KVM
> > guest with 100 sockets, but I definitely shouldn't be billed for 100
> > sockets worth of RH seats, if the physical system only has 2 sockets).
> 
> Well, ok.  Do you have any contact information so I can find out
> internally what it is they actually need?

Yes, will send off-list.

-Nish

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO)
  2015-12-05  1:04                   ` Nishanth Aravamudan
@ 2015-12-10  3:55                     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2015-12-10  3:55 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: stewart, benh, aik, agraf, qemu-devel, qemu-ppc, paulus,
	Sukadev Bhattiprolu

[-- Attachment #1: Type: text/plain, Size: 9898 bytes --]

On Fri, Dec 04, 2015 at 05:04:16PM -0800, Nishanth Aravamudan wrote:
> On 01.12.2015 [14:41:25 +1100], David Gibson wrote:
> > On Thu, Nov 12, 2015 at 08:46:27AM -0800, Nishanth Aravamudan wrote:
> > > On 12.11.2015 [15:47:15 +1100], David Gibson wrote:
> > > > On Wed, Nov 11, 2015 at 02:10:48PM -0800, Nishanth Aravamudan wrote:
> > > > > On 11.11.2015 [12:41:26 +1100], David Gibson wrote:
> > > > > > On Tue, Nov 10, 2015 at 04:56:38PM -0800, Nishanth Aravamudan wrote:
> > > > > > > On 11.11.2015 [11:17:58 +1100], David Gibson wrote:
> > > > > > > > On Mon, Nov 09, 2015 at 08:22:32PM -0800, Sukadev Bhattiprolu wrote:
> > > > > 
> > > > > <snip>
> > > > > 
> > > > > > > > The trouble with xscom is that it's extremely specific to the way the
> > > > > > > > current IBM servers present things.  It won't work on other types of
> > > > > > > > host machine (which could happen with PR KVM), and could even break if
> > > > > > > > IBM changes the way it organizes the SCOMs in a future machine.
> > > > > > > > 
> > > > > > > > Working from the nodes in /cpus still has some dependencies on IBM
> > > > > > > > specific properties, but it's at least partially based on OF
> > > > > > > > standards.
> > > > > > > > 
> > > > > > > > There's also another possible approach here, though I don't know if it
> > > > > > > > will work.  Instead of looking directly in the device tree, try to get
> > > > > > > > the information from lscpu, or libosinfo.  That would at least give
> > > > > > > > you some hope of providing meaningful information on other host types.
> > > > > > > 
> > > > > > > Heh, the issue that is underlying all of this, is that `lscpu` itself is
> > > > > > > quite wrong.
> > > > > > > 
> > > > > > > On PAPR-compliant hypervisors (well, PowerVM, at least), the only
> > > > > > > supported means of determining the underlying hardware CPU information
> > > > > > > (which is what licensing models want in the end), is to use this RTAS
> > > > > > > call in an LPAR. `lscpu` is explicitly incorrect in these environments
> > > > > > > (it's values are "derived" from sysfs and some are adjusted to ensure
> > > > > > > the division of values works out).
> > > > > > 
> > > > > > So.. I'm not sure if you're just saying that lscpu is wrong because it
> > > > > > gives the guest information, or because of other problems.
> > > > > 
> > > > > `lscpu`'s man-page specifically says that on virtualized platforms, the
> > > > > output may be inaccurate. And, in fact, on Power, in a KVM guest (and
> > > > > in a LPAR), `lscpu` is outputting the guest CPU information, which is
> > > > > completely fake. This is true on x86 KVM guests too, afaict.
> > > > 
> > > > Um.. yes, I was assuming lscpu reporting information about virtual
> > > > cpus and sockets was intended and correct behaviour.
> > > 
> > > "lscpu - display information about the CPU architecture"
> > 
> > Right, without qualification I'd take that as virtual architecture.
> 
> Ok, I did not. I suppose the manpage could be reviewed and/or updated.
> 
> Regardless, Suka has gotten a patch merged which at least for power, can
> display the physical information via `lscpu`, when the RTAS call is
> available.

Ok.

> > > but at the same time "lscpu   gathers   CPU   architecture   information
> > > from   sysfs   and /proc/cpuinfo" which is explicitly logical (or
> > > virtual).
> > > 
> > > but at the same time "There is also information about the CPU caches and
> > > cache sharing, family, model, bogoMIPS, byte order, and stepping." which
> > > seems rather physical to me.
> > 
> > bogomips and byte order are absolutely properties of a virtual cpu.
> 
> I think the distinction to be made is they are *also* properties of a
> virtual CPU. That is, they are properties of both,and where they vary,
> it might be relevant to know both cases.

No, they're really not.

Byte order isn't even really a property of the cpu at all, but of a
particularly piece of software running on it.  Bogomips is an internal
parameter of the Linux kernel which depends (somewhat) on the CPU.  To
the guest kernel, the host bogomips is absolutely irrelevant.

> > As are family and model, really, since they're generally at least
> > partially visible to a guest, and there may be some capacity for
> > faking them (x86 is more flexible in this regard than Power).
> > Stepping might be depending on exactly what level the system is
> > virtualized at (it's not for the case of PAPR).
> > 
> > Cache info is probably purely physical but amongst everything else
> > that's a property of the virtual cpu, I don't think that's an argument
> > that lscpu should return host cpu information in general.
> > 
> > 
> > > So perhaps, as I kind of stumbled upon myself in my last reply, we
> > > should explicitly indicate the physical vs. virtual information.
> > > 
> > > I will raise this with the lscpu maintainer.
> > > 
> > > > > *If* we have a valid RTAS implementation on PowerKVM (or under qemu
> > > > > generally), I think we can modify `lscpu` to do the right thing in at
> > > > > least those two environments.
> > > > > 
> > > > > > What I was suggesting is implementing the RTAS call so that it
> > > > > > effectively lets the guest get lscpu information from the host.
> > > > > 
> > > > > A bit of a chicken & egg problem, I'd say. The `lscpu` output in PowerNV
> > > > > is also wrong :)
> > > > 
> > > > Ok.. why is it wrong in PowerNV?  This sounds like something you'd
> > > > want to fix anyway.
> > > 
> > > Yes, I never said we wouldn't? It's wrong on PowerNV because chips are
> > > being counted as sockets, i.e. a 2 DCM system is being counted as a 4
> > > socket system, rather than a 2 socket system.
> > 
> > Well, sure, but the fact that the tool for the job has a bug doesn't
> > seem like a great reason to re-implement that tool directly in qemu.
> > 
> > I don't see any chicken and egg problem here the *powernv* lscpu has
> > no dependency on the cpu hypercall information.
> 
> Maybe I'm mistaken, but I think we're simply talking at odds.
> 
> There are three cases to consider, which I feel like I've stated before:
> 
> 1) PowerVM provides this information (# of physical socketc, etc) via an
> RTAS call.
> 
> 2) PowerKVM does not currently provide it at all.
> 
> 3) PowerNV provides it via `lscpu` and outputs incorrect information.
> 
> Suka is fixing 2) by adding the RTAS call to qemu. What the RTAS call
> uses to populate the data seems to be the root of the question. I guess
> you are suggesting use `lscpu` output from the PowerNV environment, even
> if it's wrong, and once it's right, it's all done for PowerKVM.

More or less, yes.  It just seems more sensible to fix a mostly-right
source of the information in the host, rather than to write a
completely new implementation to gather the information.

> Suka is fixing 3) as well, via patches to lscpu, but as you've pointed
> out separately even in the qemu-side, it's not quite clear how to
> proceed in the best generic way.
> 
> > > > > > > So, we are trying to at least resolve what PowerKVM guest can see by
> > > > > > > supporting this RTAS call there. We should report *something* to the
> > > > > > > guest, if possible, and we can adjust what is reported to the guests as
> > > > > > > we go, from the host perspective.
> > > > > > > 
> > > > > > > I haven't followed along too closely in this thread, but woudl it be
> > > > > > > reasonable to only report this RTAS call as being supported under
> > > > > > > KVM?
> > > > > > 
> > > > > > Possibly, yes.
> > > > > 
> > > > > At least, as a first step, I guess.
> > > > > 
> > > > > > > How are other RTAS calls dealt with for PR and non-IBM models
> > > > > > > currently?
> > > > > > 
> > > > > > Most of them still make sense in PR or TCG.  A few do look in the host
> > > > > > device tree, in which case they're likely to fail on non-KVM.
> > > > > 
> > > > > Got it, thanks.
> > > > > 
> > > > > So my investigation overall led me to this set of conclusions:
> > > > > 
> > > > > 1) Under PowerVM, we do not use this RTAS call, which is the only (as
> > > > > asserted by pHyp developers) valid way to get hardware information about
> > > > > the machine. Therefore, the PowerVM `lscpu` output is the "virtual" CPU
> > > > > information -- where cores are as defined by sharing of the L2-cache.
> > > > > 
> > > > > 2) Under PowerKVM, we do not use this RTAS call, because it's not
> > > > > supported, and just spit out whatever the qemu topology is (which has no
> > > > > connection to the host (physical) CPU information).
> > > > 
> > > > Right.. so does that mean nothing is using this call yet?
> > > 
> > > Correct.
> > > 
> > > > >  --> so if we implement the RTAS call of some sort under PowerKVM, then
> > > > > we can update `lscpu` to use that RTAS call.
> > > > 
> > > > Yeah, I'm not convinced that's correct.  Shouldn't lscpu return the
> > > > virtual cpu information, at least by default.
> > > 
> > > I think it should return both. *cough* this is a request from your
> > > employer, actually *cough* :) For billing purposes, physical topology is
> > > apparently relevant, not virtual (which makes sense, I can make a KVM
> > > guest with 100 sockets, but I definitely shouldn't be billed for 100
> > > sockets worth of RH seats, if the physical system only has 2 sockets).
> > 
> > Well, ok.  Do you have any contact information so I can find out
> > internally what it is they actually need?
> 
> Yes, will send off-list.
> 
> -Nish
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2015-12-10  3:55 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-04 23:06 [Qemu-devel] [PATCH v2 1/1] target-ppc: Implement rtas_get_sysparm(PROCESSOR_MODULE_INFO) Sukadev Bhattiprolu
2015-11-09  1:57 ` Alexey Kardashevskiy
2015-11-09  5:01   ` David Gibson
2015-11-10  3:57   ` Sukadev Bhattiprolu
2015-11-10  4:25     ` Alexey Kardashevskiy
2015-11-10  4:46       ` Sukadev Bhattiprolu
2015-11-10  6:58         ` Alexey Kardashevskiy
2015-11-10 18:27           ` Sukadev Bhattiprolu
2015-11-09  4:58 ` David Gibson
2015-11-10  4:22   ` Sukadev Bhattiprolu
2015-11-10  9:53     ` Thomas Huth
2015-11-13 20:29       ` Sukadev Bhattiprolu
2015-11-11  0:17     ` David Gibson
2015-11-11  0:56       ` Nishanth Aravamudan
2015-11-11  1:41         ` David Gibson
2015-11-11 22:10           ` Nishanth Aravamudan
2015-11-12  4:47             ` David Gibson
2015-11-12 16:46               ` Nishanth Aravamudan
2015-12-01  3:41                 ` David Gibson
2015-12-05  1:04                   ` Nishanth Aravamudan
2015-12-10  3:55                     ` David Gibson
2015-11-13 20:21       ` Sukadev Bhattiprolu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.