All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 1/4] 9p: mitigates most QID path collisions
@ 2019-05-03 16:21   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck @ 2019-05-03 16:21 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This first patch here is an updated version of Antonios Motakis'
original 4-patch set, merged to one patch:

https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02283.html

* Updated to latest git master, specifically to new qht interface.

* Merged the original 4 patches to this single patch.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/9p-marshal.h |   4 +-
 hw/9pfs/9p.c       | 200 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h       |  21 ++++++
 3 files changed, 204 insertions(+), 21 deletions(-)

diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
index c8823d878f..d1ad3645c4 100644
--- a/fsdev/9p-marshal.h
+++ b/fsdev/9p-marshal.h
@@ -10,8 +10,8 @@ typedef struct V9fsString
 typedef struct V9fsQID
 {
     int8_t type;
-    int32_t version;
-    int64_t path;
+    uint32_t version;
+    uint64_t path;
 } V9fsQID;
 
 typedef struct V9fsStat
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 55821343e5..b9bbdcbaee 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
+#include "qemu/xxhash.h"
 
 int open_fd_hw;
 int total_open_fd;
@@ -571,14 +572,135 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
-/* This is the algorithm from ufs in spfs */
-static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
+
+/* creative abuse of qemu_xxhash7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
 {
-    size_t size;
+    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static uint32_t qpf_hash(QpfEntry e)
+{
+    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
+}
+
+static bool qpp_cmp_func(const void *obj, const void *userp)
+{
+    const QppEntry *e1 = obj, *e2 = userp;
+    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
+}
+
+static bool qpf_cmp_func(const void *obj, const void *userp)
+{
+    const QpfEntry *e1 = obj, *e2 = userp;
+    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
+}
+
+static void qp_table_remove(void *p, uint32_t h, void *up)
+{
+    g_free(p);
+}
+
+static void qp_table_destroy(struct qht *ht)
+{
+    qht_iter(ht, qp_table_remove, NULL);
+    qht_destroy(ht);
+}
+
+static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
+                            uint64_t *path)
+{
+    QpfEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino = stbuf->st_ino
+    }, *val;
+    uint32_t hash = qpf_hash(lookup);
+
+    /* most users won't need the fullmap, so init the table lazily */
+    if (!pdu->s->qpf_table.map) {
+        qht_init(&pdu->s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
+    }
+
+    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_fullpath_next == 0) {
+            /* no more files can be mapped :'( */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        if (!val) {
+            return -ENOMEM;
+        }
+        *val = lookup;
+
+        /* new unique inode and device combo */
+        val->path = pdu->s->qp_fullpath_next++;
+        pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
+    }
+
+    *path = val->path;
+    return 0;
+}
+
+/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+ * to a unique QID path (64 bits). To avoid having to map and keep track
+ * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
+ * the device id to the 16 highest bits of the QID path. The 48 lowest bits
+ * of the QID path equal to the lowest bits of the inode number.
+ *
+ * This takes advantage of the fact that inode number are usually not
+ * random but allocated sequentially, so we have fewer items to keep
+ * track of.
+ */
+static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+                                uint64_t *path)
+{
+    QppEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+    }, *val;
+    uint32_t hash = qpp_hash(lookup);
+
+    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_prefix_next == 0) {
+            /* we ran out of prefixes */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        if (!val) {
+            return -ENOMEM;
+        }
+        *val = lookup;
+
+        /* new unique inode prefix and device combo */
+        val->qp_prefix = pdu->s->qp_prefix_next++;
+        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+    }
+
+    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+    return 0;
+}
+
+static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
+{
+    int err;
+
+    /* map inode+device to qid path (fast path) */
+    err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+    if (err == -ENFILE) {
+        /* fast path didn't work, fal back to full map */
+        err = qid_path_fullmap(pdu, stbuf, &qidp->path);
+    }
+    if (err) {
+        return err;
+    }
 
-    memset(&qidp->path, 0, sizeof(qidp->path));
-    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
-    memcpy(&qidp->path, &stbuf->st_ino, size);
     qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
     qidp->type = 0;
     if (S_ISDIR(stbuf->st_mode)) {
@@ -587,6 +709,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
     if (S_ISLNK(stbuf->st_mode)) {
         qidp->type |= P9_QID_TYPE_SYMLINK;
     }
+
+    return 0;
 }
 
 static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
@@ -599,7 +723,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
     if (err < 0) {
         return err;
     }
-    stat_to_qid(&stbuf, qidp);
+    err = stat_to_qid(pdu, &stbuf, qidp);
+    if (err < 0) {
+        return err;
+    }
     return 0;
 }
 
@@ -830,7 +957,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 
     memset(v9stat, 0, sizeof(*v9stat));
 
-    stat_to_qid(stbuf, &v9stat->qid);
+    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
+    if (err < 0) {
+        return err;
+    }
     v9stat->mode = stat_to_v9mode(stbuf);
     v9stat->atime = stbuf->st_atime;
     v9stat->mtime = stbuf->st_mtime;
@@ -891,7 +1021,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
 
 
-static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
+static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
                                 V9fsStatDotl *v9lstat)
 {
     memset(v9lstat, 0, sizeof(*v9lstat));
@@ -913,7 +1043,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
     /* Currently we only support BASIC fields in stat */
     v9lstat->st_result_mask = P9_STATS_BASIC;
 
-    stat_to_qid(stbuf, &v9lstat->qid);
+    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
 }
 
 static void print_sg(struct iovec *sg, int cnt)
@@ -1115,7 +1245,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     uint64_t request_mask;
     V9fsStatDotl v9stat_dotl;
     V9fsPDU *pdu = opaque;
-    V9fsState *s = pdu->s;
 
     retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
     if (retval < 0) {
@@ -1136,7 +1265,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     if (retval < 0) {
         goto out;
     }
-    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
+    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
+    if (retval < 0) {
+        goto out;
+    }
 
     /*  fill st_gen if requested and supported by underlying fs */
     if (request_mask & P9_STATS_GEN) {
@@ -1381,7 +1513,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
             if (err < 0) {
                 goto out;
             }
-            stat_to_qid(&stbuf, &qid);
+            err = stat_to_qid(pdu, &stbuf, &qid);
+            if (err < 0) {
+                goto out;
+            }
             v9fs_path_copy(&dpath, &path);
         }
         memcpy(&qids[name_idx], &qid, sizeof(qid));
@@ -1483,7 +1618,10 @@ static void coroutine_fn v9fs_open(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     if (S_ISDIR(stbuf.st_mode)) {
         err = v9fs_co_opendir(pdu, fidp);
         if (err < 0) {
@@ -1593,7 +1731,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
         fidp->flags |= FID_NON_RECLAIMABLE;
     }
     iounit =  get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2327,7 +2468,10 @@ static void coroutine_fn v9fs_create(void *opaque)
         }
     }
     iounit = get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2384,7 +2528,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err =  pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3064,7 +3211,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3222,7 +3372,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3633,6 +3786,11 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
         goto out;
     }
 
+    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
+    qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+    s->qp_fullpath_next = 1;
+
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
 
@@ -3646,6 +3804,8 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+        qp_table_destroy(&s->qpp_table);
+        qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
     }
     return rc;
@@ -3658,6 +3818,8 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+    qp_table_destroy(&s->qpp_table);
+    qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
 }
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 8883761b2c..44112ea97f 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -8,6 +8,7 @@
 #include "fsdev/9p-iov-marshal.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
+#include "qemu/qht.h"
 
 enum {
     P9_TLERROR = 6,
@@ -235,6 +236,22 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
+#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+
+/* QID path prefix entry, see stat_to_qid */
+typedef struct {
+    dev_t dev;
+    uint16_t ino_prefix;
+    uint16_t qp_prefix;
+} QppEntry;
+
+/* QID path full entry, as above */
+typedef struct {
+    dev_t dev;
+    ino_t ino;
+    uint64_t path;
+} QpfEntry;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -256,6 +273,10 @@ struct V9fsState
     Error *migration_blocker;
     V9fsConf fsconf;
     V9fsQID root_qid;
+    struct qht qpp_table;
+    struct qht qpf_table;
+    uint16_t qp_prefix_next;
+    uint64_t qp_fullpath_next;
 };
 
 /* 9p2000.L open flags */
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 1/4] 9p: mitigates most QID path collisions
@ 2019-05-03 16:21   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-05-03 16:21 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This first patch here is an updated version of Antonios Motakis'
original 4-patch set, merged to one patch:

https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02283.html

* Updated to latest git master, specifically to new qht interface.

* Merged the original 4 patches to this single patch.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/9p-marshal.h |   4 +-
 hw/9pfs/9p.c       | 200 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h       |  21 ++++++
 3 files changed, 204 insertions(+), 21 deletions(-)

diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
index c8823d878f..d1ad3645c4 100644
--- a/fsdev/9p-marshal.h
+++ b/fsdev/9p-marshal.h
@@ -10,8 +10,8 @@ typedef struct V9fsString
 typedef struct V9fsQID
 {
     int8_t type;
-    int32_t version;
-    int64_t path;
+    uint32_t version;
+    uint64_t path;
 } V9fsQID;
 
 typedef struct V9fsStat
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 55821343e5..b9bbdcbaee 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
+#include "qemu/xxhash.h"
 
 int open_fd_hw;
 int total_open_fd;
@@ -571,14 +572,135 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
-/* This is the algorithm from ufs in spfs */
-static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
+
+/* creative abuse of qemu_xxhash7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
 {
-    size_t size;
+    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static uint32_t qpf_hash(QpfEntry e)
+{
+    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
+}
+
+static bool qpp_cmp_func(const void *obj, const void *userp)
+{
+    const QppEntry *e1 = obj, *e2 = userp;
+    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
+}
+
+static bool qpf_cmp_func(const void *obj, const void *userp)
+{
+    const QpfEntry *e1 = obj, *e2 = userp;
+    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
+}
+
+static void qp_table_remove(void *p, uint32_t h, void *up)
+{
+    g_free(p);
+}
+
+static void qp_table_destroy(struct qht *ht)
+{
+    qht_iter(ht, qp_table_remove, NULL);
+    qht_destroy(ht);
+}
+
+static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
+                            uint64_t *path)
+{
+    QpfEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino = stbuf->st_ino
+    }, *val;
+    uint32_t hash = qpf_hash(lookup);
+
+    /* most users won't need the fullmap, so init the table lazily */
+    if (!pdu->s->qpf_table.map) {
+        qht_init(&pdu->s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
+    }
+
+    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_fullpath_next == 0) {
+            /* no more files can be mapped :'( */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        if (!val) {
+            return -ENOMEM;
+        }
+        *val = lookup;
+
+        /* new unique inode and device combo */
+        val->path = pdu->s->qp_fullpath_next++;
+        pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
+    }
+
+    *path = val->path;
+    return 0;
+}
+
+/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+ * to a unique QID path (64 bits). To avoid having to map and keep track
+ * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
+ * the device id to the 16 highest bits of the QID path. The 48 lowest bits
+ * of the QID path equal to the lowest bits of the inode number.
+ *
+ * This takes advantage of the fact that inode number are usually not
+ * random but allocated sequentially, so we have fewer items to keep
+ * track of.
+ */
+static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+                                uint64_t *path)
+{
+    QppEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+    }, *val;
+    uint32_t hash = qpp_hash(lookup);
+
+    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_prefix_next == 0) {
+            /* we ran out of prefixes */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        if (!val) {
+            return -ENOMEM;
+        }
+        *val = lookup;
+
+        /* new unique inode prefix and device combo */
+        val->qp_prefix = pdu->s->qp_prefix_next++;
+        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+    }
+
+    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+    return 0;
+}
+
+static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
+{
+    int err;
+
+    /* map inode+device to qid path (fast path) */
+    err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+    if (err == -ENFILE) {
+        /* fast path didn't work, fal back to full map */
+        err = qid_path_fullmap(pdu, stbuf, &qidp->path);
+    }
+    if (err) {
+        return err;
+    }
 
-    memset(&qidp->path, 0, sizeof(qidp->path));
-    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
-    memcpy(&qidp->path, &stbuf->st_ino, size);
     qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
     qidp->type = 0;
     if (S_ISDIR(stbuf->st_mode)) {
@@ -587,6 +709,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
     if (S_ISLNK(stbuf->st_mode)) {
         qidp->type |= P9_QID_TYPE_SYMLINK;
     }
+
+    return 0;
 }
 
 static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
@@ -599,7 +723,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
     if (err < 0) {
         return err;
     }
-    stat_to_qid(&stbuf, qidp);
+    err = stat_to_qid(pdu, &stbuf, qidp);
+    if (err < 0) {
+        return err;
+    }
     return 0;
 }
 
@@ -830,7 +957,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 
     memset(v9stat, 0, sizeof(*v9stat));
 
-    stat_to_qid(stbuf, &v9stat->qid);
+    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
+    if (err < 0) {
+        return err;
+    }
     v9stat->mode = stat_to_v9mode(stbuf);
     v9stat->atime = stbuf->st_atime;
     v9stat->mtime = stbuf->st_mtime;
@@ -891,7 +1021,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
 
 
-static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
+static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
                                 V9fsStatDotl *v9lstat)
 {
     memset(v9lstat, 0, sizeof(*v9lstat));
@@ -913,7 +1043,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
     /* Currently we only support BASIC fields in stat */
     v9lstat->st_result_mask = P9_STATS_BASIC;
 
-    stat_to_qid(stbuf, &v9lstat->qid);
+    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
 }
 
 static void print_sg(struct iovec *sg, int cnt)
@@ -1115,7 +1245,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     uint64_t request_mask;
     V9fsStatDotl v9stat_dotl;
     V9fsPDU *pdu = opaque;
-    V9fsState *s = pdu->s;
 
     retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
     if (retval < 0) {
@@ -1136,7 +1265,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     if (retval < 0) {
         goto out;
     }
-    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
+    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
+    if (retval < 0) {
+        goto out;
+    }
 
     /*  fill st_gen if requested and supported by underlying fs */
     if (request_mask & P9_STATS_GEN) {
@@ -1381,7 +1513,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
             if (err < 0) {
                 goto out;
             }
-            stat_to_qid(&stbuf, &qid);
+            err = stat_to_qid(pdu, &stbuf, &qid);
+            if (err < 0) {
+                goto out;
+            }
             v9fs_path_copy(&dpath, &path);
         }
         memcpy(&qids[name_idx], &qid, sizeof(qid));
@@ -1483,7 +1618,10 @@ static void coroutine_fn v9fs_open(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     if (S_ISDIR(stbuf.st_mode)) {
         err = v9fs_co_opendir(pdu, fidp);
         if (err < 0) {
@@ -1593,7 +1731,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
         fidp->flags |= FID_NON_RECLAIMABLE;
     }
     iounit =  get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2327,7 +2468,10 @@ static void coroutine_fn v9fs_create(void *opaque)
         }
     }
     iounit = get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2384,7 +2528,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err =  pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3064,7 +3211,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3222,7 +3372,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3633,6 +3786,11 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
         goto out;
     }
 
+    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
+    qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+    s->qp_fullpath_next = 1;
+
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
 
@@ -3646,6 +3804,8 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+        qp_table_destroy(&s->qpp_table);
+        qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
     }
     return rc;
@@ -3658,6 +3818,8 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+    qp_table_destroy(&s->qpp_table);
+    qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
 }
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 8883761b2c..44112ea97f 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -8,6 +8,7 @@
 #include "fsdev/9p-iov-marshal.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
+#include "qemu/qht.h"
 
 enum {
     P9_TLERROR = 6,
@@ -235,6 +236,22 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
+#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+
+/* QID path prefix entry, see stat_to_qid */
+typedef struct {
+    dev_t dev;
+    uint16_t ino_prefix;
+    uint16_t qp_prefix;
+} QppEntry;
+
+/* QID path full entry, as above */
+typedef struct {
+    dev_t dev;
+    ino_t ino;
+    uint64_t path;
+} QpfEntry;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -256,6 +273,10 @@ struct V9fsState
     Error *migration_blocker;
     V9fsConf fsconf;
     V9fsQID root_qid;
+    struct qht qpp_table;
+    struct qht qpf_table;
+    uint16_t qp_prefix_next;
+    uint64_t qp_fullpath_next;
 };
 
 /* 9p2000.L open flags */
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 2/4] 9P: trivial cleanup of QID path collision mitigation
@ 2019-05-03 16:22   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck @ 2019-05-03 16:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

Addresses trivial changes regarding the previous patch as requested
on the mailing list a while ago.

* Removed unneccessary parantheses:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02661.html

* Removed unneccessary g_malloc() result checks:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02814.html

* Unsigned type changes:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02581.html

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/9p-marshal.h   |  2 +-
 hw/9pfs/9p.c         | 16 +++++-----------
 hw/9pfs/trace-events | 14 +++++++-------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
index d1ad3645c4..8f3babb60a 100644
--- a/fsdev/9p-marshal.h
+++ b/fsdev/9p-marshal.h
@@ -9,7 +9,7 @@ typedef struct V9fsString
 
 typedef struct V9fsQID
 {
-    int8_t type;
+    uint8_t type;
     uint32_t version;
     uint64_t path;
 } V9fsQID;
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index b9bbdcbaee..2b893e25a1 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -587,13 +587,13 @@ static uint32_t qpf_hash(QpfEntry e)
 static bool qpp_cmp_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
-    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
+    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
 }
 
 static bool qpf_cmp_func(const void *obj, const void *userp)
 {
     const QpfEntry *e1 = obj, *e2 = userp;
-    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
+    return e1->dev == e2->dev && e1->ino == e2->ino;
 }
 
 static void qp_table_remove(void *p, uint32_t h, void *up)
@@ -630,9 +630,6 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         }
 
         val = g_malloc0(sizeof(QppEntry));
-        if (!val) {
-            return -ENOMEM;
-        }
         *val = lookup;
 
         /* new unique inode and device combo */
@@ -673,9 +670,6 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
         }
 
         val = g_malloc0(sizeof(QppEntry));
-        if (!val) {
-            return -ENOMEM;
-        }
         *val = lookup;
 
         /* new unique inode prefix and device combo */
@@ -870,9 +864,9 @@ static int donttouch_stat(V9fsStat *stat)
 {
     if (stat->type == -1 &&
         stat->dev == -1 &&
-        stat->qid.type == -1 &&
-        stat->qid.version == -1 &&
-        stat->qid.path == -1 &&
+        stat->qid.type == 0xff &&
+        stat->qid.version == (uint32_t) -1 &&
+        stat->qid.path == (uint64_t) -1 &&
         stat->mode == -1 &&
         stat->atime == -1 &&
         stat->mtime == -1 &&
diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
index c0a0a4ab5d..6964756922 100644
--- a/hw/9pfs/trace-events
+++ b/hw/9pfs/trace-events
@@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
 v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
-v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
+v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
 v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
 v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
@@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
 v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
 v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
 v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
-v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
-v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
 v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
@@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
 v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
 v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
 v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
-v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid,  char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
-v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
 v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
 v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
 v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
-v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
 v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
 v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
-v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
+v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
 v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
 v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
 v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 2/4] 9P: trivial cleanup of QID path collision mitigation
@ 2019-05-03 16:22   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-05-03 16:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

Addresses trivial changes regarding the previous patch as requested
on the mailing list a while ago.

* Removed unneccessary parantheses:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02661.html

* Removed unneccessary g_malloc() result checks:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02814.html

* Unsigned type changes:
  https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02581.html

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/9p-marshal.h   |  2 +-
 hw/9pfs/9p.c         | 16 +++++-----------
 hw/9pfs/trace-events | 14 +++++++-------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
index d1ad3645c4..8f3babb60a 100644
--- a/fsdev/9p-marshal.h
+++ b/fsdev/9p-marshal.h
@@ -9,7 +9,7 @@ typedef struct V9fsString
 
 typedef struct V9fsQID
 {
-    int8_t type;
+    uint8_t type;
     uint32_t version;
     uint64_t path;
 } V9fsQID;
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index b9bbdcbaee..2b893e25a1 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -587,13 +587,13 @@ static uint32_t qpf_hash(QpfEntry e)
 static bool qpp_cmp_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
-    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
+    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
 }
 
 static bool qpf_cmp_func(const void *obj, const void *userp)
 {
     const QpfEntry *e1 = obj, *e2 = userp;
-    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
+    return e1->dev == e2->dev && e1->ino == e2->ino;
 }
 
 static void qp_table_remove(void *p, uint32_t h, void *up)
@@ -630,9 +630,6 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         }
 
         val = g_malloc0(sizeof(QppEntry));
-        if (!val) {
-            return -ENOMEM;
-        }
         *val = lookup;
 
         /* new unique inode and device combo */
@@ -673,9 +670,6 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
         }
 
         val = g_malloc0(sizeof(QppEntry));
-        if (!val) {
-            return -ENOMEM;
-        }
         *val = lookup;
 
         /* new unique inode prefix and device combo */
@@ -870,9 +864,9 @@ static int donttouch_stat(V9fsStat *stat)
 {
     if (stat->type == -1 &&
         stat->dev == -1 &&
-        stat->qid.type == -1 &&
-        stat->qid.version == -1 &&
-        stat->qid.path == -1 &&
+        stat->qid.type == 0xff &&
+        stat->qid.version == (uint32_t) -1 &&
+        stat->qid.path == (uint64_t) -1 &&
         stat->mode == -1 &&
         stat->atime == -1 &&
         stat->mtime == -1 &&
diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
index c0a0a4ab5d..6964756922 100644
--- a/hw/9pfs/trace-events
+++ b/hw/9pfs/trace-events
@@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
 v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
-v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
+v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
 v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
 v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
@@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
 v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
 v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
 v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
-v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
-v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
 v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
@@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
 v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
 v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
 v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
-v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid,  char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
-v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
 v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
 v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
 v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
-v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
 v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
 v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
-v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
+v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
 v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
 v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
 v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 3/4] 9p: persistency of QID path beyond reboots / suspensions
@ 2019-05-03 16:23   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck @ 2019-05-03 16:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This patch aims to keep QID path identical beyond the scope of reboots and
guest suspensions. With the 1st patch alone the QID path of the same files
might change after reboots / suspensions, since 9p would restart with
empty qpp_table and the resulting QID path depends on the precise sequence
of files being accessed on guest.

The first patch should already avoid the vast majority of potential QID
path collisions. However especially network services running on guest
would still be prone to QID path issues when just using the 1st patch.
For instance Samba is exporting file IDs to clients in the network and
SMB cliens in the network will use those IDs to access and request
changes on the file server. If guest is now interrupted in between, like
it commonly happens on maintenance, e.g. software updates on host, then
SMB clients in the network will continue working with old file IDs, which
in turn leads to data corruption and data loss on the file server.
Furthermore on SMB client side I also encountered severe misbehaviours in
this case, for instance Macs accessing the file server would either
start to hang or die with a kernel panic within seconds, since the smbx
implementation on macOS heavily relies on file IDs being unique (within
the context of a connection that is).

So this patch here mitigates the remaining problem described above by
storing the qpp_table persistently as extended attribute(s) on the
exported root of the file system and automatically tries to restore the
qpp_table i.e. after reboots / resumptions.

This patch is aimed at real world scenarios, in which qpp_table will only
ever get few dozens of entries (and none ever in qpf_table). So it is e.g.
intentionally limited to only store qpp_table, not qpf_table; and so far
I have not made optimizations, since in practice the qpf_table is really
just tiny.

Since there is currently no callback in qemu yet that would reliably be
called on guest shutdowns, the table is stored on every new insertion for
now.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/9pfs/9p.h |  33 +++++++
 2 files changed, 343 insertions(+), 5 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 2b893e25a1..29c6dfc68a 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -26,6 +26,19 @@
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
 #include "qemu/xxhash.h"
+#include "qemu/crc32c.h"
+#if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
+# include <linux/limits.h> /* for XATTR_SIZE_MAX */
+#endif
+
+/*
+ * How many bytes may we store to fs per extended attribute value?
+ */
+#ifdef XATTR_SIZE_MAX
+# define ATTR_MAX_SIZE XATTR_SIZE_MAX /* Linux only: 64kB limit in kernel */
+#else
+# define ATTR_MAX_SIZE 65536 /* Most systems allow a bit more, so we take this as basis.  */
+#endif
 
 int open_fd_hw;
 int total_open_fd;
@@ -642,6 +655,285 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
     return 0;
 }
 
+static inline bool is_ro_export(FsContext *ctx)
+{
+    return ctx->export_flags & V9FS_RDONLY;
+}
+
+/*
+ * Once qpp_table size exceeds this value, we no longer save
+ * the table persistently. See comment in v9fs_store_qpp_table()
+ */
+#define QPP_TABLE_PERSISTENCY_LIMIT 32768
+
+/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
+static void remove_qidp_xattr(FsContext *ctx)
+{
+    V9fsString name;
+    int i;
+
+    /* just for a paranoid endless recursion sanity check */
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
+
+    v9fs_string_init(&name);
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        if (lremovexattr(ctx->fs_root, name.data) < 0)
+            break;
+    }
+    v9fs_string_free(&name);
+}
+
+/* Used to convert qpp hash table into continuous stream. */
+static void qpp_table_serialize(void *p, uint32_t h, void *up)
+{
+    const QppEntry *entry = (const QppEntry*) p;
+    QppSerialize *ser = (QppSerialize*) up;
+
+    if (ser->error)
+        return;
+
+    /* safety first */
+    if (entry->qp_prefix - 1 >= ser->count) {
+        ser->error = -1;
+        return;
+    }
+
+    ser->elements[entry->qp_prefix - 1] = (QppEntryS) {
+        .dev = entry->dev,
+        .ino_prefix = entry->ino_prefix
+    };
+    ser->done++;
+}
+
+/*
+ * Tries to store the current qpp_table as extended attribute(s) on the
+ * exported file system root with the goal to preserve identical qids
+ * beyond the scope of reboots.
+ */
+static void v9fs_store_qpp_table(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    V9fsString name;
+    int i, res;
+    size_t size;
+    QppSrlzStream* stream;
+    QppSerialize ser;
+
+    if (is_ro_export(ctx))
+        return;
+
+    /*
+     * Whenever we exceeded some certain (arbitrary) high qpp_table size we
+     * delete the stored table from the file system to get rid of old device
+     * ids / inodes that might no longer exist with the goal to potentially
+     * yield in a smaller table size after next reboot.
+     */
+    if (!s->qp_prefix_next || s->qp_prefix_next >= QPP_TABLE_PERSISTENCY_LIMIT) {
+        if (s->qp_prefix_next == QPP_TABLE_PERSISTENCY_LIMIT) {
+            remove_qidp_xattr(ctx);
+        }
+        return;
+    }
+
+    /* Convert qpp hash table into continuous array. */
+    size = sizeof(QppSrlzHeader) +
+           ( (s->qp_prefix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
+    stream = g_malloc0(size);
+    ser = (QppSerialize) {
+        .elements = &stream->elements[0],
+        .count = s->qp_prefix_next - 1,
+        .done  = 0,
+        .error = 0,
+    };
+    qht_iter(&s->qpp_table, qpp_table_serialize, &ser);
+    if (ser.error || ser.done != ser.count)
+        goto out;
+
+    /* initialize header and calculate CRC32 checksum */
+    stream->header = (QppSrlzHeader) {
+        .version = 1,
+        .reserved = 0,
+        .crc32 = crc32c(
+            0xffffffff,
+            (const uint8_t*) &stream->elements[0],
+            (ser.count * sizeof(QppEntryS))
+        ),
+    };
+
+    /*
+     * Actually just required if the qpp_table size decreased, or if the
+     * previous xattr size limit increased on OS (kernel/fs) level.
+     */
+    remove_qidp_xattr(ctx);
+
+    /*
+     * Subdivide (if required) the data stream into individual xattrs
+     * to cope with the system's max. supported xattr value size.
+     */
+    v9fs_string_init(&name);
+    for (i = 0; size > (i * ATTR_MAX_SIZE); ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        res = lsetxattr(
+            ctx->fs_root,
+            name.data,
+            ((const uint8_t*)stream) + i * ATTR_MAX_SIZE,
+            MIN(ATTR_MAX_SIZE, size - i * ATTR_MAX_SIZE),
+            0/*flags*/
+        );
+        if (res < 0) {
+            if (i > 0)
+                remove_qidp_xattr(ctx);
+            break;
+        }
+    }
+    v9fs_string_free(&name);
+out:
+    g_free(stream);
+}
+
+/* Frees the entire chain of passed nodes from memory. */
+static void destroy_xattr_nodes(XAttrNode **first)
+{
+    XAttrNode *prev;
+    if (!first)
+        return;
+    while (*first) {
+        if ((*first)->value)
+            g_free((*first)->value);
+        prev = *first;
+        *first = (*first)->next;
+        g_free(prev);
+    }
+}
+
+/*
+ * Loads all user.virtfs.system.qidp.* xattrs from exported fs root and
+ * returns a linked list with one node per xattr.
+ */
+static XAttrNode* v9fs_load_qidp_xattr_nodes(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    XAttrNode *first = NULL, *current = NULL;
+    V9fsString name;
+    ssize_t size;
+    int i;
+
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
+
+    v9fs_string_init(&name);
+
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        size = lgetxattr(ctx->fs_root, name.data, NULL, 0);
+        if (size <= 0)
+            break;
+        if (!first) {
+            first = current = g_malloc0(sizeof(XAttrNode));
+        } else {
+            current = current->next = g_malloc0(sizeof(XAttrNode));
+        }
+        current->value = g_malloc0(size);
+        current->length = lgetxattr(
+            ctx->fs_root, name.data, current->value, size
+        );
+        if (current->length <= 0) {
+            goto out_w_err;
+        }
+    }
+    goto out;
+
+out_w_err:
+    destroy_xattr_nodes(&first);
+out:
+    v9fs_string_free(&name);
+    return first;
+}
+
+/*
+ * Try to load previously stored qpp_table from file system. Calling this
+ * function assumes that qpp_table is yet empty.
+ *
+ * @see v9fs_store_qpp_table()
+ */
+static void v9fs_load_qpp_table(V9fsState *s)
+{
+    ssize_t size, count;
+    XAttrNode *current, *first;
+    QppSrlzStream* stream = NULL;
+    uint32_t crc32;
+    int i;
+    QppEntry *val;
+    uint32_t hash;
+
+    if (s->qp_prefix_next != 1)
+        return;
+
+    first = v9fs_load_qidp_xattr_nodes(s);
+    if (!first)
+        return;
+
+    /* convert nodes into continuous stream */
+    size = 0;
+    for (current = first; current; current = current->next) {
+        size += current->length;
+    }
+    if (size <= 0) {
+        goto out;
+    }
+    stream = g_malloc0(size);
+    size = 0;
+    for (current = first; current; current = current->next) {
+        memcpy(((uint8_t*)stream) + size, current->value, current->length);
+        size += current->length;
+    }
+
+    if (stream->header.version != 1) {
+        goto out;
+    }
+
+    count = (size - sizeof(QppSrlzHeader)) / sizeof(QppEntryS);
+    if (count <= 0) {
+        goto out;
+    }
+
+    /* verify CRC32 checksum of stream */
+    crc32 = crc32c(
+        0xffffffff,
+        (const uint8_t*) &stream->elements[0],
+        (count * sizeof(QppEntryS))
+    );
+    if (crc32 != stream->header.crc32) {
+        goto out;
+    }
+
+    /* fill qpp_table with the retrieved elements */
+    for (i = 0; i < count; ++i) {
+        val = g_malloc0(sizeof(QppEntry));
+        *val = (QppEntry) {
+            .dev = stream->elements[i].dev,
+            .ino_prefix = stream->elements[i].ino_prefix,
+        };
+        hash = qpp_hash(*val);
+        if (qht_lookup(&s->qpp_table, val, hash)) {
+            /* should never happen: duplicate entry detected */
+            g_free(val);
+            goto out;
+        }
+        val->qp_prefix = s->qp_prefix_next++;
+        qht_insert(&s->qpp_table, val, hash, NULL);
+    }
+
+out:
+    destroy_xattr_nodes(&first);
+    if (stream)
+        g_free(stream);
+}
+
 /* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
@@ -675,6 +967,14 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
         /* new unique inode prefix and device combo */
         val->qp_prefix = pdu->s->qp_prefix_next++;
         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+
+        /*
+         * Store qpp_table as extended attribute(s) to file system.
+         *
+         * TODO: This should better only be called from a guest shutdown and
+         * suspend handler.
+         */
+        v9fs_store_qpp_table(pdu->s);
     }
 
     *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
@@ -1064,11 +1364,6 @@ static void v9fs_fix_path(V9fsPath *dst, V9fsPath *src, int len)
     v9fs_path_free(&str);
 }
 
-static inline bool is_ro_export(FsContext *ctx)
-{
-    return ctx->export_flags & V9FS_RDONLY;
-}
-
 static void coroutine_fn v9fs_version(void *opaque)
 {
     ssize_t err;
@@ -3784,6 +4079,8 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
     qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
     s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
     s->qp_fullpath_next = 1;
+    /* try to load and restore previous qpp_table */
+    v9fs_load_qpp_table(s);
 
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
@@ -3807,6 +4104,14 @@ out:
 
 void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
 {
+    /*
+     * Store qpp_table as extended attribute(s) to file system.
+     *
+     * This was actually plan A, but unfortunately unserialize is not called
+     * reliably on guest shutdowns and suspensions.
+     */
+    v9fs_store_qpp_table(s);
+
     if (s->ops->cleanup) {
         s->ops->cleanup(&s->ctx);
     }
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 44112ea97f..54ce039969 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -245,6 +245,13 @@ typedef struct {
     uint16_t qp_prefix;
 } QppEntry;
 
+/* Small version of QppEntry for serialization as xattr. */
+struct QppEntryS {
+    dev_t dev;
+    uint16_t ino_prefix;
+} __attribute__((packed));
+typedef struct QppEntryS QppEntryS;
+
 /* QID path full entry, as above */
 typedef struct {
     dev_t dev;
@@ -252,6 +259,32 @@ typedef struct {
     uint64_t path;
 } QpfEntry;
 
+typedef struct {
+    QppEntryS *elements;
+    uint count; /* In: QppEntryS count in @a elements */
+    uint done; /* Out: how many QppEntryS did we actually fill in @a elements */
+    int error; /* Out: zero on success */
+} QppSerialize;
+
+struct QppSrlzHeader {
+    uint16_t version;
+    uint16_t reserved; /* might be used e.g. for flags in future */
+    uint32_t crc32;
+} __attribute__((packed));
+typedef struct QppSrlzHeader QppSrlzHeader;
+
+struct QppSrlzStream {
+    QppSrlzHeader header;
+    QppEntryS elements[0];
+} __attribute__((packed));
+typedef struct QppSrlzStream QppSrlzStream;
+
+typedef struct XAttrNode {
+    uint8_t* value;
+    ssize_t length;
+    struct XAttrNode* next;
+} XAttrNode;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 3/4] 9p: persistency of QID path beyond reboots / suspensions
@ 2019-05-03 16:23   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-05-03 16:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This patch aims to keep QID path identical beyond the scope of reboots and
guest suspensions. With the 1st patch alone the QID path of the same files
might change after reboots / suspensions, since 9p would restart with
empty qpp_table and the resulting QID path depends on the precise sequence
of files being accessed on guest.

The first patch should already avoid the vast majority of potential QID
path collisions. However especially network services running on guest
would still be prone to QID path issues when just using the 1st patch.
For instance Samba is exporting file IDs to clients in the network and
SMB cliens in the network will use those IDs to access and request
changes on the file server. If guest is now interrupted in between, like
it commonly happens on maintenance, e.g. software updates on host, then
SMB clients in the network will continue working with old file IDs, which
in turn leads to data corruption and data loss on the file server.
Furthermore on SMB client side I also encountered severe misbehaviours in
this case, for instance Macs accessing the file server would either
start to hang or die with a kernel panic within seconds, since the smbx
implementation on macOS heavily relies on file IDs being unique (within
the context of a connection that is).

So this patch here mitigates the remaining problem described above by
storing the qpp_table persistently as extended attribute(s) on the
exported root of the file system and automatically tries to restore the
qpp_table i.e. after reboots / resumptions.

This patch is aimed at real world scenarios, in which qpp_table will only
ever get few dozens of entries (and none ever in qpf_table). So it is e.g.
intentionally limited to only store qpp_table, not qpf_table; and so far
I have not made optimizations, since in practice the qpf_table is really
just tiny.

Since there is currently no callback in qemu yet that would reliably be
called on guest shutdowns, the table is stored on every new insertion for
now.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/9pfs/9p.h |  33 +++++++
 2 files changed, 343 insertions(+), 5 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 2b893e25a1..29c6dfc68a 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -26,6 +26,19 @@
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
 #include "qemu/xxhash.h"
+#include "qemu/crc32c.h"
+#if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
+# include <linux/limits.h> /* for XATTR_SIZE_MAX */
+#endif
+
+/*
+ * How many bytes may we store to fs per extended attribute value?
+ */
+#ifdef XATTR_SIZE_MAX
+# define ATTR_MAX_SIZE XATTR_SIZE_MAX /* Linux only: 64kB limit in kernel */
+#else
+# define ATTR_MAX_SIZE 65536 /* Most systems allow a bit more, so we take this as basis.  */
+#endif
 
 int open_fd_hw;
 int total_open_fd;
@@ -642,6 +655,285 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
     return 0;
 }
 
+static inline bool is_ro_export(FsContext *ctx)
+{
+    return ctx->export_flags & V9FS_RDONLY;
+}
+
+/*
+ * Once qpp_table size exceeds this value, we no longer save
+ * the table persistently. See comment in v9fs_store_qpp_table()
+ */
+#define QPP_TABLE_PERSISTENCY_LIMIT 32768
+
+/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
+static void remove_qidp_xattr(FsContext *ctx)
+{
+    V9fsString name;
+    int i;
+
+    /* just for a paranoid endless recursion sanity check */
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
+
+    v9fs_string_init(&name);
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        if (lremovexattr(ctx->fs_root, name.data) < 0)
+            break;
+    }
+    v9fs_string_free(&name);
+}
+
+/* Used to convert qpp hash table into continuous stream. */
+static void qpp_table_serialize(void *p, uint32_t h, void *up)
+{
+    const QppEntry *entry = (const QppEntry*) p;
+    QppSerialize *ser = (QppSerialize*) up;
+
+    if (ser->error)
+        return;
+
+    /* safety first */
+    if (entry->qp_prefix - 1 >= ser->count) {
+        ser->error = -1;
+        return;
+    }
+
+    ser->elements[entry->qp_prefix - 1] = (QppEntryS) {
+        .dev = entry->dev,
+        .ino_prefix = entry->ino_prefix
+    };
+    ser->done++;
+}
+
+/*
+ * Tries to store the current qpp_table as extended attribute(s) on the
+ * exported file system root with the goal to preserve identical qids
+ * beyond the scope of reboots.
+ */
+static void v9fs_store_qpp_table(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    V9fsString name;
+    int i, res;
+    size_t size;
+    QppSrlzStream* stream;
+    QppSerialize ser;
+
+    if (is_ro_export(ctx))
+        return;
+
+    /*
+     * Whenever we exceeded some certain (arbitrary) high qpp_table size we
+     * delete the stored table from the file system to get rid of old device
+     * ids / inodes that might no longer exist with the goal to potentially
+     * yield in a smaller table size after next reboot.
+     */
+    if (!s->qp_prefix_next || s->qp_prefix_next >= QPP_TABLE_PERSISTENCY_LIMIT) {
+        if (s->qp_prefix_next == QPP_TABLE_PERSISTENCY_LIMIT) {
+            remove_qidp_xattr(ctx);
+        }
+        return;
+    }
+
+    /* Convert qpp hash table into continuous array. */
+    size = sizeof(QppSrlzHeader) +
+           ( (s->qp_prefix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
+    stream = g_malloc0(size);
+    ser = (QppSerialize) {
+        .elements = &stream->elements[0],
+        .count = s->qp_prefix_next - 1,
+        .done  = 0,
+        .error = 0,
+    };
+    qht_iter(&s->qpp_table, qpp_table_serialize, &ser);
+    if (ser.error || ser.done != ser.count)
+        goto out;
+
+    /* initialize header and calculate CRC32 checksum */
+    stream->header = (QppSrlzHeader) {
+        .version = 1,
+        .reserved = 0,
+        .crc32 = crc32c(
+            0xffffffff,
+            (const uint8_t*) &stream->elements[0],
+            (ser.count * sizeof(QppEntryS))
+        ),
+    };
+
+    /*
+     * Actually just required if the qpp_table size decreased, or if the
+     * previous xattr size limit increased on OS (kernel/fs) level.
+     */
+    remove_qidp_xattr(ctx);
+
+    /*
+     * Subdivide (if required) the data stream into individual xattrs
+     * to cope with the system's max. supported xattr value size.
+     */
+    v9fs_string_init(&name);
+    for (i = 0; size > (i * ATTR_MAX_SIZE); ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        res = lsetxattr(
+            ctx->fs_root,
+            name.data,
+            ((const uint8_t*)stream) + i * ATTR_MAX_SIZE,
+            MIN(ATTR_MAX_SIZE, size - i * ATTR_MAX_SIZE),
+            0/*flags*/
+        );
+        if (res < 0) {
+            if (i > 0)
+                remove_qidp_xattr(ctx);
+            break;
+        }
+    }
+    v9fs_string_free(&name);
+out:
+    g_free(stream);
+}
+
+/* Frees the entire chain of passed nodes from memory. */
+static void destroy_xattr_nodes(XAttrNode **first)
+{
+    XAttrNode *prev;
+    if (!first)
+        return;
+    while (*first) {
+        if ((*first)->value)
+            g_free((*first)->value);
+        prev = *first;
+        *first = (*first)->next;
+        g_free(prev);
+    }
+}
+
+/*
+ * Loads all user.virtfs.system.qidp.* xattrs from exported fs root and
+ * returns a linked list with one node per xattr.
+ */
+static XAttrNode* v9fs_load_qidp_xattr_nodes(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    XAttrNode *first = NULL, *current = NULL;
+    V9fsString name;
+    ssize_t size;
+    int i;
+
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
+
+    v9fs_string_init(&name);
+
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        size = lgetxattr(ctx->fs_root, name.data, NULL, 0);
+        if (size <= 0)
+            break;
+        if (!first) {
+            first = current = g_malloc0(sizeof(XAttrNode));
+        } else {
+            current = current->next = g_malloc0(sizeof(XAttrNode));
+        }
+        current->value = g_malloc0(size);
+        current->length = lgetxattr(
+            ctx->fs_root, name.data, current->value, size
+        );
+        if (current->length <= 0) {
+            goto out_w_err;
+        }
+    }
+    goto out;
+
+out_w_err:
+    destroy_xattr_nodes(&first);
+out:
+    v9fs_string_free(&name);
+    return first;
+}
+
+/*
+ * Try to load previously stored qpp_table from file system. Calling this
+ * function assumes that qpp_table is yet empty.
+ *
+ * @see v9fs_store_qpp_table()
+ */
+static void v9fs_load_qpp_table(V9fsState *s)
+{
+    ssize_t size, count;
+    XAttrNode *current, *first;
+    QppSrlzStream* stream = NULL;
+    uint32_t crc32;
+    int i;
+    QppEntry *val;
+    uint32_t hash;
+
+    if (s->qp_prefix_next != 1)
+        return;
+
+    first = v9fs_load_qidp_xattr_nodes(s);
+    if (!first)
+        return;
+
+    /* convert nodes into continuous stream */
+    size = 0;
+    for (current = first; current; current = current->next) {
+        size += current->length;
+    }
+    if (size <= 0) {
+        goto out;
+    }
+    stream = g_malloc0(size);
+    size = 0;
+    for (current = first; current; current = current->next) {
+        memcpy(((uint8_t*)stream) + size, current->value, current->length);
+        size += current->length;
+    }
+
+    if (stream->header.version != 1) {
+        goto out;
+    }
+
+    count = (size - sizeof(QppSrlzHeader)) / sizeof(QppEntryS);
+    if (count <= 0) {
+        goto out;
+    }
+
+    /* verify CRC32 checksum of stream */
+    crc32 = crc32c(
+        0xffffffff,
+        (const uint8_t*) &stream->elements[0],
+        (count * sizeof(QppEntryS))
+    );
+    if (crc32 != stream->header.crc32) {
+        goto out;
+    }
+
+    /* fill qpp_table with the retrieved elements */
+    for (i = 0; i < count; ++i) {
+        val = g_malloc0(sizeof(QppEntry));
+        *val = (QppEntry) {
+            .dev = stream->elements[i].dev,
+            .ino_prefix = stream->elements[i].ino_prefix,
+        };
+        hash = qpp_hash(*val);
+        if (qht_lookup(&s->qpp_table, val, hash)) {
+            /* should never happen: duplicate entry detected */
+            g_free(val);
+            goto out;
+        }
+        val->qp_prefix = s->qp_prefix_next++;
+        qht_insert(&s->qpp_table, val, hash, NULL);
+    }
+
+out:
+    destroy_xattr_nodes(&first);
+    if (stream)
+        g_free(stream);
+}
+
 /* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
@@ -675,6 +967,14 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
         /* new unique inode prefix and device combo */
         val->qp_prefix = pdu->s->qp_prefix_next++;
         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+
+        /*
+         * Store qpp_table as extended attribute(s) to file system.
+         *
+         * TODO: This should better only be called from a guest shutdown and
+         * suspend handler.
+         */
+        v9fs_store_qpp_table(pdu->s);
     }
 
     *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
@@ -1064,11 +1364,6 @@ static void v9fs_fix_path(V9fsPath *dst, V9fsPath *src, int len)
     v9fs_path_free(&str);
 }
 
-static inline bool is_ro_export(FsContext *ctx)
-{
-    return ctx->export_flags & V9FS_RDONLY;
-}
-
 static void coroutine_fn v9fs_version(void *opaque)
 {
     ssize_t err;
@@ -3784,6 +4079,8 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
     qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
     s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
     s->qp_fullpath_next = 1;
+    /* try to load and restore previous qpp_table */
+    v9fs_load_qpp_table(s);
 
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
@@ -3807,6 +4104,14 @@ out:
 
 void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
 {
+    /*
+     * Store qpp_table as extended attribute(s) to file system.
+     *
+     * This was actually plan A, but unfortunately unserialize is not called
+     * reliably on guest shutdowns and suspensions.
+     */
+    v9fs_store_qpp_table(s);
+
     if (s->ops->cleanup) {
         s->ops->cleanup(&s->ctx);
     }
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 44112ea97f..54ce039969 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -245,6 +245,13 @@ typedef struct {
     uint16_t qp_prefix;
 } QppEntry;
 
+/* Small version of QppEntry for serialization as xattr. */
+struct QppEntryS {
+    dev_t dev;
+    uint16_t ino_prefix;
+} __attribute__((packed));
+typedef struct QppEntryS QppEntryS;
+
 /* QID path full entry, as above */
 typedef struct {
     dev_t dev;
@@ -252,6 +259,32 @@ typedef struct {
     uint64_t path;
 } QpfEntry;
 
+typedef struct {
+    QppEntryS *elements;
+    uint count; /* In: QppEntryS count in @a elements */
+    uint done; /* Out: how many QppEntryS did we actually fill in @a elements */
+    int error; /* Out: zero on success */
+} QppSerialize;
+
+struct QppSrlzHeader {
+    uint16_t version;
+    uint16_t reserved; /* might be used e.g. for flags in future */
+    uint32_t crc32;
+} __attribute__((packed));
+typedef struct QppSrlzHeader QppSrlzHeader;
+
+struct QppSrlzStream {
+    QppSrlzHeader header;
+    QppEntryS elements[0];
+} __attribute__((packed));
+typedef struct QppSrlzStream QppSrlzStream;
+
+typedef struct XAttrNode {
+    uint8_t* value;
+    ssize_t length;
+    struct XAttrNode* next;
+} XAttrNode;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 4/4] 9p: use variable length suffixes for inode mapping
@ 2019-05-03 16:24   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck @ 2019-05-03 16:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This patch introduces variable length suffixes for being used for inode mapping
instead of the fixed 16 bit size prefixes of patch 1. With this patch the inode
numbers on guest will typically be much smaller (e.g. around >2^1 .. >2^7
instead of >2^48 with patch 1).

I preserved both solutions in this patch though, so you can switch between
them by adjusting P9_VARI_LENGTH_INODE_SUFFIXES, which probably also makes
code comparison between the two more easy for now.

Motivation: with patch 1 the inode numbers were so high that some network
services had issues. Especially SMB seems to be a problem again due to varying
implementations regarding SMB file IDs. I don't bother you with the details.

Additionally this solution should be more efficient, since inode numbers in
practice can take almost their entire 64 bit range on guest as well. Which
might also be beneficial for nested virtualization.

The "Exponential Golomb" algorithm is used as basis for generating the
variable length suffixes. The algorithm has a paramter k which controls the
distribution of bits on increasing indeces (minimum bits at low index vs.
maximum bits at high index). With k=0 the generated suffixes look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [1] (1 bits)
2 [10] -> [010] (3 bits)
3 [11] -> [110] (3 bits)
4 [100] -> [00100] (5 bits)
5 [101] -> [10100] (5 bits)
6 [110] -> [01100] (5 bits)
7 [111] -> [11100] (5 bits)
8 [1000] -> [0001000] (7 bits)
9 [1001] -> [1001000] (7 bits)
10 [1010] -> [0101000] (7 bits)
11 [1011] -> [1101000] (7 bits)
12 [1100] -> [0011000] (7 bits)
...
65533 [1111111111111101] ->  [1011111111111111000000000000000] (31 bits)
65534 [1111111111111110] ->  [0111111111111111000000000000000] (31 bits)
65535 [1111111111111111] ->  [1111111111111111000000000000000] (31 bits)
Hence minBits=1 maxBits=31

And with k=5 they would look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [000001] (6 bits)
2 [10] -> [100001] (6 bits)
3 [11] -> [010001] (6 bits)
4 [100] -> [110001] (6 bits)
5 [101] -> [001001] (6 bits)
6 [110] -> [101001] (6 bits)
7 [111] -> [011001] (6 bits)
8 [1000] -> [111001] (6 bits)
9 [1001] -> [000101] (6 bits)
10 [1010] -> [100101] (6 bits)
11 [1011] -> [010101] (6 bits)
12 [1100] -> [110101] (6 bits)
...
65533 [1111111111111101] -> [0011100000000000100000000000] (28 bits)
65534 [1111111111111110] -> [1011100000000000100000000000] (28 bits)
65535 [1111111111111111] -> [0111100000000000100000000000] (28 bits)
Hence minBits=6 maxBits=28

If somebody wants to play around with the details of the suffix generation, let
me know and I will send you a small C source for you to play with.

Additionally this patch disables persistency of file IDs (that I introduced
with patch 3) at compile time by default for now. You can still enable it by
setting QPP_TABLE_PERSISTENCY_LIMIT to some reasonable high value (e.g. 127).
Reason: I am still unresolved what to do with this feature. The original
motivation was to avoid file ID collisions with network services if the machine
was interrupted for a short period. But what I did not consider was that
device IDs might change on host, so after loading the tables from fs the qp
tables would grow with irrelevant entries.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 744 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h | 121 +++++++++-
 2 files changed, 805 insertions(+), 60 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 29c6dfc68a..003901a1e0 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -30,6 +30,10 @@
 #if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
 # include <linux/limits.h> /* for XATTR_SIZE_MAX */
 #endif
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <math.h>
 
 /*
  * How many bytes may we store to fs per extended attribute value?
@@ -585,6 +589,123 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
+/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
+static inline uint8_t mirror8bit(uint8_t byte) {
+    return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
+
+/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
+static inline uint64_t mirror64bit(uint64_t value) {
+    return ((uint64_t)mirror8bit( value        & 0xff) << 56) |
+           ((uint64_t)mirror8bit((value >> 8)  & 0xff) << 48) |
+           ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
+           ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
+           ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
+           ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
+           ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8 ) |
+           ((uint64_t)mirror8bit((value >> 56) & 0xff)      ) ;
+}
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Parameter k for the Exponential Golomb algorihm to be used.
+ *
+ * The smaller this value, the smaller the minimum bit count for the Exp.
+ * Golomb generated affixes will be (at lowest index) however for the
+ * price of having higher maximum bit count of generated affixes (at highest
+ * index). Likewise increasing this parameter yields in smaller maximum bit
+ * count for the price of having higher minimum bit count.
+ */
+#define EXP_GOLOMB_K    0
+
+# if !EXP_GOLOMB_K
+
+/** @brief Exponential Golomb algorithm limited to the case k=0.
+ * 
+ * See expGolombEncode() below for details.
+ * 
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix expGolombEncodeK0(uint64_t n) {
+    const int bits = (int) log2(n) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = n,
+        .bits = bits + bits - 1
+    };
+}
+
+# else
+
+/** @brief Exponential Golomb algorithm for arbitrary k (including k=0).
+ *
+ * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
+ * with growing length and with the mathematical property of being
+ * "prefix-free". The latter means the generated prefixes can be prepended
+ * in front of arbitrary numbers and the resulting concatenated numbers are
+ * guaranteed to be always unique.
+ *
+ * This is a minor adjustment to the original Exp. Golomb algorithm in the
+ * sense that lowest allowed index (@param n) starts with 1, not with zero.
+ *
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ * @param k - parameter k of Exp. Golomb algorithm to be used
+ *            (see comment on EXP_GOLOMB_K macro for details about k)
+ */
+static VariLenAffix expGolombEncode(uint64_t n, int k) {
+    const uint64_t value = n + (1 << k) - 1;
+    const int bits = (int) log2(value) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = value,
+        .bits = bits + MAX((bits - 1 - k), 0)
+    };
+}
+
+# endif /* !EXP_GOLOMB_K */
+
+/** @brief Converts a suffix into a prefix, or a prefix into a suffix.
+ *
+ * What this function does is actually mirroring all bits of the affix value,
+ * with the purpose to preserve respectively the mathematical "prefix-free"
+ * or "suffix-free" property after the conversion.
+ *
+ * In other words: if a passed prefix is suitable to create unique numbers,
+ * then the returned suffix is suitable to create unique numbers as well
+ * (and vice versa).
+ */
+static VariLenAffix invertAffix(const VariLenAffix* affix) {
+    return (VariLenAffix) {
+        .type = (affix->type == AffixType_Suffix) ? AffixType_Prefix : AffixType_Suffix,
+        .value =  mirror64bit(affix->value) >> ((sizeof(affix->value) * 8) - affix->bits),
+        .bits = affix->bits
+    };
+}
+
+/** @brief Generates suffix numbers with "suffix-free" property.
+ *
+ * This is just a wrapper function on top of the Exp. Golomb algorithm.
+ *
+ * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
+ * this function converts the Exp. Golomb prefixes into appropriate suffixes
+ * which are still suitable for generating unique numbers.
+ *
+ * @param n - natural number (or index) of the suffix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix affixForIndex(uint64_t index) {
+    VariLenAffix prefix;
+    #if EXP_GOLOMB_K
+    prefix = expGolombEncode(index, EXP_GOLOMB_K);
+    #else
+    prefix = expGolombEncodeK0(index);
+    #endif
+    return invertAffix(&prefix); /* convert prefix to suffix */
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* creative abuse of qemu_xxhash7, which is based on xxhash */
 static uint32_t qpp_hash(QppEntry e)
@@ -597,6 +718,16 @@ static uint32_t qpf_hash(QpfEntry e)
     return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
 }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+static bool qpd_cmp_func(const void *obj, const void *userp)
+{
+    const QpdEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 static bool qpp_cmp_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
@@ -620,6 +751,72 @@ static void qp_table_destroy(struct qht *ht)
     qht_destroy(ht);
 }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+# if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+static int v9fs_store_qpd_table(V9fsState *s);
+# endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+/*
+ * Returns how many (high end) bits of inode numbers of the passed fs
+ * device shall be used (in combination with the device number) to
+ * generate hash values for qpp_table entries.
+ *
+ * This function is required if variable length suffixes are used for inode
+ * number mapping on guest level. Since a device may end up having multiple
+ * entries in qpp_table, each entry most probably with a different suffix
+ * length, we thus need this function in conjunction with qpd_table to
+ * "agree" about a fix amount of bits (per device) to be always used for
+ * generating hash values for the purpose of accessing qpp_table in order
+ * get consistent behaviour when accessing qpp_table.
+ */
+static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
+{
+    QpdEntry lookup = {
+        .dev = dev
+    }, *val;
+    uint32_t hash = dev;
+    VariLenAffix affix;
+
+    val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
+    if (!val) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = lookup;
+        affix = affixForIndex(pdu->s->qp_affix_next);
+        val->prefix_bits = affix.bits;
+        qht_insert(&pdu->s->qpd_table, val, hash, NULL);
+        pdu->s->qp_ndevices++;
+
+        /*
+         * Store qpd_table as extended attribute(s) to file system.
+         * 
+         * TODO: This should better only be called from a guest shutdown and
+         * suspend handler.
+         */
+        #if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+        v9fs_store_qpd_table(pdu->s);
+        #endif
+    }
+    return val->prefix_bits;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+/** @brief Slow / full mapping host inode nr -> guest inode nr.
+ *
+ * This function performs a slower and much more costly remapping of an
+ * original file inode number on host to an appropriate different inode
+ * number on guest. For every (dev, inode) combination on host a new
+ * sequential number is generated, cached and exposed as inode number on
+ * guest.
+ *
+ * This is just a "last resort" fallback solution if the much faster/cheaper
+ * qid_path_prefixmap() failed. In practice this slow / full mapping is not
+ * expected ever to be used at all though.
+ *
+ * @see qid_path_prefixmap() for details
+ *
+ */
 static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
                             uint64_t *path)
 {
@@ -628,11 +825,9 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         .ino = stbuf->st_ino
     }, *val;
     uint32_t hash = qpf_hash(lookup);
-
-    /* most users won't need the fullmap, so init the table lazily */
-    if (!pdu->s->qpf_table.map) {
-        qht_init(&pdu->s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
-    }
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    VariLenAffix affix;
+#endif
 
     val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
 
@@ -646,8 +841,16 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         *val = lookup;
 
         /* new unique inode and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        affix = affixForIndex(
+            1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
+        );
+        val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
+        pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
+#else
         val->path = pdu->s->qp_fullpath_next++;
         pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+#endif
         qht_insert(&pdu->s->qpf_table, val, hash, NULL);
     }
 
@@ -660,18 +863,17 @@ static inline bool is_ro_export(FsContext *ctx)
     return ctx->export_flags & V9FS_RDONLY;
 }
 
-/*
- * Once qpp_table size exceeds this value, we no longer save
- * the table persistently. See comment in v9fs_store_qpp_table()
- */
-#define QPP_TABLE_PERSISTENCY_LIMIT 32768
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
 
-/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
-static void remove_qidp_xattr(FsContext *ctx)
+/* Remove all xattrs from export root matching the supplied xattr name_pattern. */
+static void remove_seq_xattr(FsContext *ctx, const char* name_pattern)
 {
     V9fsString name;
     int i;
 
+    if (is_ro_export(ctx))
+        return;
+
     /* just for a paranoid endless recursion sanity check */
     const ssize_t max_size =
         sizeof(QppSrlzHeader) +
@@ -679,83 +881,254 @@ static void remove_qidp_xattr(FsContext *ctx)
 
     v9fs_string_init(&name);
     for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
-        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        v9fs_string_sprintf(&name, name_pattern, i);
         if (lremovexattr(ctx->fs_root, name.data) < 0)
             break;
     }
     v9fs_string_free(&name);
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Remove all user.virtfs.system.qidd.* xattrs from export root. */
+static void remove_qidd_xattr(FsContext *ctx)
+{
+    remove_seq_xattr(ctx, "user.virtfs.system.qidd.%d");
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
+static void remove_qidp_xattr(FsContext *ctx)
+{
+    remove_seq_xattr(ctx, "user.virtfs.system.qidp.%d");
+}
+
+static void remove_qp_tables_xattrs(FsContext *ctx) {
+    #if P9_VARI_LENGTH_INODE_SUFFIXES
+    remove_qidd_xattr(ctx);
+    #endif
+    remove_qidp_xattr(ctx);
+}
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Used to convert qpd hash table into continuous stream. */
+static void qpd_table_serialize(struct qht *ht, void *p, uint32_t h, void *up)
+{
+    const QpdEntry *entry = (const QpdEntry*) p;
+    QpdSerialize *ser = (QpdSerialize*) up;
+
+    /* NOTE: Due to the simple incremental index solution we use here, this
+     * means the entries might be serialized in the final data stream with
+     * different order than the devices were originally inserted into the
+     * qpd_table. However specifically for qpd_table this is irrelevant ATM.
+     */
+    const uint32_t index = ser->done;
+
+    if (ser->error)
+        return;
+
+    /* safety first */
+    if (index >= ser->count) {
+        ser->error = -1;
+        return;
+    }
+
+    ser->elements[index] = (QpdEntryS) {
+        .dev = entry->dev,
+        .prefix_bits = entry->prefix_bits
+    };
+    ser->done++;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /* Used to convert qpp hash table into continuous stream. */
 static void qpp_table_serialize(void *p, uint32_t h, void *up)
 {
     const QppEntry *entry = (const QppEntry*) p;
     QppSerialize *ser = (QppSerialize*) up;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    const uint32_t index = entry->qp_affix_index - 1;
+#else
+    const uint32_t index = entry->qp_affix - 1;
+#endif
 
     if (ser->error)
         return;
 
     /* safety first */
-    if (entry->qp_prefix - 1 >= ser->count) {
+    if (index >= ser->count) {
         ser->error = -1;
         return;
     }
 
-    ser->elements[entry->qp_prefix - 1] = (QppEntryS) {
+    ser->elements[index] = (QppEntryS) {
         .dev = entry->dev,
         .ino_prefix = entry->ino_prefix
     };
     ser->done++;
 }
 
+/** @brief Returns true if we should still save qpp_table (and qpd_table).
+ *
+ * Whenever we exceeded some certain (arbitrary) high qpp_table size we
+ * delete the stored table(s) from the file system to get rid of old device
+ * ids / inodes that might no longer exist with the goal to potentially
+ * yield in a smaller table size after next reboot and smaller inode
+ * numbers on guest.
+ */
+static bool shouldStoreQpTables(V9fsState *s) {
+    return s->qp_affix_next && s->qp_affix_next < QPP_TABLE_PERSISTENCY_LIMIT;
+}
+
+/* See shouldStoreQpTables() above */
+static bool shouldDeleteQpTables(V9fsState *s) {
+    return s->qp_affix_next == QPP_TABLE_PERSISTENCY_LIMIT;
+}
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Tries to store the current qpd_table as extended attribute(s) on the
+ * exported file system root with the goal to preserve identical qids
+ * beyond the scope of reboots / suspends.
+ *
+ * @returns 0 on success
+ */
+static int v9fs_store_qpd_table(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    V9fsString name;
+    int i, res, err = 0;
+    size_t size;
+    QpdSrlzStream* stream;
+    QpdSerialize ser;
+
+    if (is_ro_export(ctx))
+        return 0;
+
+    if (!shouldStoreQpTables(s)) {
+        if (shouldDeleteQpTables(s)) {
+            remove_qp_tables_xattrs(ctx);
+        }
+        return 0;
+    }
+
+    /* Convert qpd hash table into continuous array. */
+    size = sizeof(QppSrlzHeader) +
+           ( s->qp_ndevices /* qpd_table entry count */ * sizeof(QpdEntryS) );
+    stream = g_malloc0(size);
+    ser = (QpdSerialize) {
+        .elements = &stream->elements[0],
+        .count = s->qp_ndevices,
+        .done  = 0,
+        .error = 0,
+    };
+    qht_iter(&s->qpd_table, qpd_table_serialize, &ser);
+    if (ser.error || ser.done != ser.count) {
+        err = -1;
+        goto out;
+    }
+
+    /* initialize header and calculate CRC32 checksum */
+    stream->header = (QppSrlzHeader) {
+        .version = 1,
+        .options = 0, /* not used for storing the qpd_table yet */
+        .crc32 = crc32c(
+            0xffffffff,
+            (const uint8_t*) &stream->elements[0],
+            (ser.count * sizeof(QpdEntryS))
+        ),
+    };
+
+    /*
+     * Actually just required if the qpd_table size decreased, or if the
+     * previous xattr size limit increased on OS (kernel/fs) level.
+     */
+    remove_qidd_xattr(ctx);
+
+    /*
+     * Subdivide (if required) the data stream into individual xattrs
+     * to cope with the system's max. supported xattr value size.
+     */
+    v9fs_string_init(&name);
+    for (i = 0; size > (i * ATTR_MAX_SIZE); ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidd.%d", i);
+        res = lsetxattr(
+            ctx->fs_root,
+            name.data,
+            ((const uint8_t*)stream) + i * ATTR_MAX_SIZE,
+            MIN(ATTR_MAX_SIZE, size - i * ATTR_MAX_SIZE),
+            0/*flags*/
+        );
+        if (res < 0) {
+            if (i > 0)
+                remove_qidd_xattr(ctx);
+            err = -1;
+            break;
+        }
+    }
+    v9fs_string_free(&name);
+out:
+    g_free(stream);
+
+    return err;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Tries to store the current qpp_table as extended attribute(s) on the
  * exported file system root with the goal to preserve identical qids
- * beyond the scope of reboots.
+ * beyond the scope of reboots / suspends.
+ *
+ * @returns 0 on success
  */
-static void v9fs_store_qpp_table(V9fsState *s)
+static int v9fs_store_qpp_table(V9fsState *s)
 {
     FsContext *ctx = &s->ctx;
     V9fsString name;
-    int i, res;
+    int i, res, err = 0;
     size_t size;
     QppSrlzStream* stream;
     QppSerialize ser;
 
     if (is_ro_export(ctx))
-        return;
+        return 0;
 
-    /*
-     * Whenever we exceeded some certain (arbitrary) high qpp_table size we
-     * delete the stored table from the file system to get rid of old device
-     * ids / inodes that might no longer exist with the goal to potentially
-     * yield in a smaller table size after next reboot.
-     */
-    if (!s->qp_prefix_next || s->qp_prefix_next >= QPP_TABLE_PERSISTENCY_LIMIT) {
-        if (s->qp_prefix_next == QPP_TABLE_PERSISTENCY_LIMIT) {
-            remove_qidp_xattr(ctx);
+    if (!shouldStoreQpTables(s)) {
+        if (shouldDeleteQpTables(s)) {
+            remove_qp_tables_xattrs(ctx);
         }
-        return;
+        return 0;
     }
 
     /* Convert qpp hash table into continuous array. */
     size = sizeof(QppSrlzHeader) +
-           ( (s->qp_prefix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
+           ( (s->qp_affix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
     stream = g_malloc0(size);
     ser = (QppSerialize) {
         .elements = &stream->elements[0],
-        .count = s->qp_prefix_next - 1,
+        .count = s->qp_affix_next - 1,
         .done  = 0,
         .error = 0,
     };
     qht_iter(&s->qpp_table, qpp_table_serialize, &ser);
-    if (ser.error || ser.done != ser.count)
+    if (ser.error || ser.done != ser.count) {
+        err = -1;
         goto out;
+    }
 
     /* initialize header and calculate CRC32 checksum */
     stream->header = (QppSrlzHeader) {
         .version = 1,
-        .reserved = 0,
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        .options = EXP_GOLOMB_K + 1,
+#else
+        .options = 0,
+#endif
         .crc32 = crc32c(
             0xffffffff,
             (const uint8_t*) &stream->elements[0],
@@ -786,12 +1159,15 @@ static void v9fs_store_qpp_table(V9fsState *s)
         if (res < 0) {
             if (i > 0)
                 remove_qidp_xattr(ctx);
+            err = -1;
             break;
         }
     }
     v9fs_string_free(&name);
 out:
     g_free(stream);
+
+    return err;
 }
 
 /* Frees the entire chain of passed nodes from memory. */
@@ -809,6 +1185,55 @@ static void destroy_xattr_nodes(XAttrNode **first)
     }
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Loads all user.virtfs.system.qidd.* xattrs from exported fs root and
+ * returns a linked list with one node per xattr.
+ */
+static XAttrNode* v9fs_load_qidd_xattr_nodes(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    XAttrNode *first = NULL, *current = NULL;
+    V9fsString name;
+    ssize_t size;
+    int i;
+
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QpdEntryS);
+
+    v9fs_string_init(&name);
+
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidd.%d", i);
+        size = lgetxattr(ctx->fs_root, name.data, NULL, 0);
+        if (size <= 0)
+            break;
+        if (!first) {
+            first = current = g_malloc0(sizeof(XAttrNode));
+        } else {
+            current = current->next = g_malloc0(sizeof(XAttrNode));
+        }
+        current->value = g_malloc0(size);
+        current->length = lgetxattr(
+            ctx->fs_root, name.data, current->value, size
+        );
+        if (current->length <= 0) {
+            goto out_w_err;
+        }
+    }
+    goto out;
+
+out_w_err:
+    destroy_xattr_nodes(&first);
+out:
+    v9fs_string_free(&name);
+    return first;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Loads all user.virtfs.system.qidp.* xattrs from exported fs root and
  * returns a linked list with one node per xattr.
@@ -854,28 +1279,121 @@ out:
     return first;
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Try to load previously stored qpd_table from file system. Calling this
+ * function assumes that qpd_table is yet empty.
+ *
+ * @returns number of entries loaded on success or negative number on error
+ * @see v9fs_store_qpd_table()
+ */
+static int v9fs_load_qpd_table(V9fsState *s)
+{
+    ssize_t size, count = 0;
+    XAttrNode *current, *first;
+    QpdSrlzStream* stream = NULL;
+    uint32_t crc32;
+    int i, err = 0;
+    QpdEntry *val;
+    uint32_t hash;
+
+    if (s->qp_ndevices != 0)
+        return -1;
+
+    first = v9fs_load_qidd_xattr_nodes(s);
+    if (!first)
+        return -1;
+
+    /* convert nodes into continuous stream */
+    size = 0;
+    for (current = first; current; current = current->next) {
+        size += current->length;
+    }
+    if (size <= 0) {
+        err = -1;
+        goto out;
+    }
+    stream = g_malloc0(size);
+    size = 0;
+    for (current = first; current; current = current->next) {
+        memcpy(((uint8_t*)stream) + size, current->value, current->length);
+        size += current->length;
+    }
+
+    if (stream->header.version != 1) {
+        err = -1;
+        goto out;
+    }
+
+    count = (size - sizeof(QppSrlzHeader)) / sizeof(QpdEntryS);
+    if (count <= 0) {
+        err = -1;
+        goto out;
+    }
+
+    /* verify CRC32 checksum of stream */
+    crc32 = crc32c(
+        0xffffffff,
+        (const uint8_t*) &stream->elements[0],
+        (count * sizeof(QpdEntryS))
+    );
+    if (crc32 != stream->header.crc32) {
+        err = -1;
+        goto out;
+    }
+
+    /* fill qpd_table with the retrieved elements */
+    for (i = 0; i < count; ++i) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = (QpdEntry) {
+            .dev = stream->elements[i].dev,
+            .prefix_bits = stream->elements[i].prefix_bits,
+        };
+        hash = val->dev;
+        if (qht_lookup(&s->qpd_table, val, hash)) {
+            /* should never happen: duplicate entry detected */
+            g_free(val);
+            err = -1;
+            goto out;
+        }
+        qht_insert(&s->qpd_table, val, hash, NULL);
+        s->qp_ndevices++;
+    }
+
+out:
+    destroy_xattr_nodes(&first);
+    if (stream)
+        g_free(stream);
+
+    return err ? err : count;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Try to load previously stored qpp_table from file system. Calling this
  * function assumes that qpp_table is yet empty.
  *
+ * @returns number of entries loaded on success or negative number on error
  * @see v9fs_store_qpp_table()
  */
-static void v9fs_load_qpp_table(V9fsState *s)
+static int v9fs_load_qpp_table(V9fsState *s)
 {
-    ssize_t size, count;
+    ssize_t size, count = 0;
     XAttrNode *current, *first;
     QppSrlzStream* stream = NULL;
     uint32_t crc32;
-    int i;
+    int i, err = 0;
     QppEntry *val;
     uint32_t hash;
 
-    if (s->qp_prefix_next != 1)
-        return;
+    if (s->qp_affix_next != 1)
+        return -1;
 
     first = v9fs_load_qidp_xattr_nodes(s);
     if (!first)
-        return;
+        return -1;
 
     /* convert nodes into continuous stream */
     size = 0;
@@ -883,6 +1401,7 @@ static void v9fs_load_qpp_table(V9fsState *s)
         size += current->length;
     }
     if (size <= 0) {
+        err = -1;
         goto out;
     }
     stream = g_malloc0(size);
@@ -893,11 +1412,24 @@ static void v9fs_load_qpp_table(V9fsState *s)
     }
 
     if (stream->header.version != 1) {
+        err = -1;
+        goto out;
+    }
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    if ((stream->header.options - 1) != EXP_GOLOMB_K) {
+        err = -1;
         goto out;
     }
+#else
+    if (stream->header.options != 0) {
+        err = -1;
+        goto out;
+    }
+#endif
 
     count = (size - sizeof(QppSrlzHeader)) / sizeof(QppEntryS);
     if (count <= 0) {
+        err = -1;
         goto out;
     }
 
@@ -908,6 +1440,7 @@ static void v9fs_load_qpp_table(V9fsState *s)
         (count * sizeof(QppEntryS))
     );
     if (crc32 != stream->header.crc32) {
+        err = -1;
         goto out;
     }
 
@@ -922,9 +1455,15 @@ static void v9fs_load_qpp_table(V9fsState *s)
         if (qht_lookup(&s->qpp_table, val, hash)) {
             /* should never happen: duplicate entry detected */
             g_free(val);
+            err = -1;
             goto out;
         }
-        val->qp_prefix = s->qp_prefix_next++;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        val->qp_affix_index = s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+#else
+        val->qp_affix = s->qp_affix_next++;
+#endif
         qht_insert(&s->qpp_table, val, hash, NULL);
     }
 
@@ -932,40 +1471,109 @@ out:
     destroy_xattr_nodes(&first);
     if (stream)
         g_free(stream);
+
+    return err ? err : count;
+}
+
+/* Tries to load qpp_table (and qpd_table) from xattrs being saved on export root. */
+static void v9fs_load_qp_tables(V9fsState *s)
+{
+    int res;
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    res = v9fs_load_qpd_table(s);
+    if (res > 0) {
+        res = v9fs_load_qpp_table(s);
+    }
+#else
+    res = v9fs_load_qpp_table(s);
+#endif
+
+    /* if there was any error, remove the xattrs */
+    if (res < 0) {
+        remove_qp_tables_xattrs(&s->ctx);
+    }
 }
 
-/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+/* Tries to store qpp_table (and qpd_table) as xattrs on export root. */
+static void v9fs_store_qp_tables(V9fsState *s)
+{
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    int res;
+
+    res = v9fs_store_qpd_table(s);
+    if (res >= 0) {
+        res = v9fs_store_qpp_table(s);
+    }
+
+    /* if there was any error, remove the xattrs immediately again */
+    if (res < 0) {
+        remove_qp_tables_xattrs(&s->ctx);
+    }
+#else
+    v9fs_store_qpp_table(s);
+#endif
+}
+
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+/** @brief Quick mapping host inode nr -> guest inode nr.
+ *
+ * This function performs quick remapping of an original file inode number
+ * on host to an appropriate different inode number on guest. This remapping
+ * of inodes is required to avoid inode nr collisions on guest which would
+ * happen if the 9p export contains more than 1 exported file system (or
+ * more than 1 file system data set), because unlike on host level where the
+ * files would have different device nrs, all files exported by 9p would
+ * share the same device nr on guest (the device nr of the virtual 9p device
+ * that is).
+ *
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 0 :
+ * stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
  * the device id to the 16 highest bits of the QID path. The 48 lowest bits
  * of the QID path equal to the lowest bits of the inode number.
  *
- * This takes advantage of the fact that inode number are usually not
- * random but allocated sequentially, so we have fewer items to keep
- * track of.
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 1 :
+ * Instead of fixed size (16 bit) generated prefix, we use variable size
+ * suffixes instead. See comment on P9_VARI_LENGTH_INODE_SUFFIXES for
+ * details.
  */
 static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
                                 uint64_t *path)
 {
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
+#endif
     QppEntry lookup = {
         .dev = stbuf->st_dev,
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> (64-ino_hash_bits))
+#else
         .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+#endif
     }, *val;
     uint32_t hash = qpp_hash(lookup);
 
     val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
 
     if (!val) {
-        if (pdu->s->qp_prefix_next == 0) {
-            /* we ran out of prefixes */
+        if (pdu->s->qp_affix_next == 0) {
+            /* we ran out of affixes */
             return -ENFILE;
         }
 
         val = g_malloc0(sizeof(QppEntry));
         *val = lookup;
 
-        /* new unique inode prefix and device combo */
-        val->qp_prefix = pdu->s->qp_prefix_next++;
+        /* new unique inode affix and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        val->qp_affix_index = pdu->s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+#else
+        val->qp_affix = pdu->s->qp_affix_next++;
+#endif
         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
 
         /*
@@ -974,10 +1582,16 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
          * TODO: This should better only be called from a guest shutdown and
          * suspend handler.
          */
+        #if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
         v9fs_store_qpp_table(pdu->s);
+        #endif
     }
-
-    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    /* assuming generated affix to be suffix type, not prefix */
+    *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
+#else
+    *path = ((uint64_t)val->qp_affix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#endif
     return 0;
 }
 
@@ -988,7 +1602,7 @@ static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
     /* map inode+device to qid path (fast path) */
     err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
     if (err == -ENFILE) {
-        /* fast path didn't work, fal back to full map */
+        /* fast path didn't work, fall back to full map */
         err = qid_path_fullmap(pdu, stbuf, &qidp->path);
     }
     if (err) {
@@ -4075,12 +4689,22 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
         goto out;
     }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qht_init(&s->qpd_table, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+#endif
+    /* most users won't need the fullmap, so init the table lazily */
+    qht_init(&s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
     /* QID path hash table. 1 entry ought to be enough for anybody ;) */
     qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
-    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    s->qp_ndevices = 0;
+#endif
+    s->qp_affix_next = 1; /* reserve 0 to detect overflow */
     s->qp_fullpath_next = 1;
-    /* try to load and restore previous qpp_table */
-    v9fs_load_qpp_table(s);
+    /* try to load and restore previous qpd_table and qpp_table */
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+    v9fs_load_qp_tables(s);
+#endif
 
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
@@ -4095,6 +4719,9 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        qp_table_destroy(&s->qpd_table);
+#endif
         qp_table_destroy(&s->qpp_table);
         qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
@@ -4105,18 +4732,23 @@ out:
 void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
 {
     /*
-     * Store qpp_table as extended attribute(s) to file system.
+     * Store qpd_table and qpp_table as extended attribute(s) to file system.
      *
      * This was actually plan A, but unfortunately unserialize is not called
      * reliably on guest shutdowns and suspensions.
      */
-    v9fs_store_qpp_table(s);
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+    v9fs_store_qp_tables(s);
+#endif
 
     if (s->ops->cleanup) {
         s->ops->cleanup(&s->ctx);
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qp_table_destroy(&s->qpd_table);
+#endif
     qp_table_destroy(&s->qpp_table);
     qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 54ce039969..78c29fb916 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -236,15 +236,95 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
-#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+/*
+ * Defines how inode numbers from host shall be remapped on guest.
+ *
+ * When this compile time option is disabled then fixed length (16 bit)
+ * prefix values are used for all inode numbers on guest level. Accordingly
+ * guest's inode numbers will be quite large (>2^48).
+ *
+ * If this option is enabled then variable length suffixes will be used for
+ * guest's inode numbers instead which usually yields in much smaller inode
+ * numbers on guest level (typically around >2^1 .. >2^7).
+ */
+#define P9_VARI_LENGTH_INODE_SUFFIXES 1
+
+/*
+ * If this value is higher than 0 then file inode remapping is tried to be
+ * retained consistent beyond reboots/suspends.
+ *
+ * Once qpp_table size exceeds this value, we no longer save
+ * the table persistently. See comment in v9fs_store_qpp_table()
+ *
+ * By setting this to 0, persistency of file ids will (beyond the scope of
+ * reboots/suspends) will be disabled completely at compile time.
+ */
+#define QPP_TABLE_PERSISTENCY_LIMIT 0
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+typedef enum AffixType_t {
+    AffixType_Prefix,
+    AffixType_Suffix, /* A.k.a. postfix. */
+} AffixType_t;
+
+/** @brief Unique affix of variable length.
+ *
+ * An affix is (currently) either a suffix or a prefix, which is either
+ * going to be prepended (prefix) or appended (suffix) with some other
+ * number for the goal to generate unique numbers. Accordingly the
+ * suffixes (or prefixes) we generate @b must all have the mathematical
+ * property of being suffix-free (or prefix-free in case of prefixes)
+ * so that no matter what number we concatenate the affix with, that we
+ * always reliably get unique numbers as result after concatenation.
+ */
+typedef struct VariLenAffix {
+    AffixType_t type; /* Whether this affix is a suffix or a prefix. */
+    uint64_t value; /* Actual numerical value of this affix. */
+    int bits; /* Lenght of the affix, that is how many (of the lowest) bits of @c value must be used for appending/prepending this affix to its final resulting, unique number. */
+} VariLenAffix;
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+#if !P9_VARI_LENGTH_INODE_SUFFIXES
+# define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+#endif
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* See qid_inode_prefix_hash_bits(). */
+typedef struct {
+    dev_t dev; /* FS device on host. */
+    int prefix_bits; /* How many (high) bits of the original inode number shall be used for hashing. */
+} QpdEntry;
+
+# if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
+/* Small version of QpdEntry for serialization as xattr. */
+struct QpdEntryS {
+    dev_t dev;
+    uint8_t prefix_bits;
+}  __attribute__((packed));
+typedef struct QpdEntryS QpdEntryS;
+
+# endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* QID path prefix entry, see stat_to_qid */
 typedef struct {
     dev_t dev;
     uint16_t ino_prefix;
-    uint16_t qp_prefix;
+    #if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint32_t qp_affix_index;
+    VariLenAffix qp_affix;
+    #else
+    uint16_t qp_affix;
+    #endif
 } QppEntry;
 
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
 /* Small version of QppEntry for serialization as xattr. */
 struct QppEntryS {
     dev_t dev;
@@ -252,6 +332,8 @@ struct QppEntryS {
 } __attribute__((packed));
 typedef struct QppEntryS QppEntryS;
 
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
 /* QID path full entry, as above */
 typedef struct {
     dev_t dev;
@@ -259,6 +341,19 @@ typedef struct {
     uint64_t path;
 } QpfEntry;
 
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+typedef struct {
+    QpdEntryS *elements;
+    uint count; /* In: QpdEntryS count in @a elements */
+    uint done; /* Out: how many QpdEntryS did we actually fill in @a elements */
+    int error; /* Out: zero on success */
+} QpdSerialize;
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 typedef struct {
     QppEntryS *elements;
     uint count; /* In: QppEntryS count in @a elements */
@@ -268,7 +363,7 @@ typedef struct {
 
 struct QppSrlzHeader {
     uint16_t version;
-    uint16_t reserved; /* might be used e.g. for flags in future */
+    uint16_t options;
     uint32_t crc32;
 } __attribute__((packed));
 typedef struct QppSrlzHeader QppSrlzHeader;
@@ -279,12 +374,24 @@ struct QppSrlzStream {
 } __attribute__((packed));
 typedef struct QppSrlzStream QppSrlzStream;
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+struct QpdSrlzStream {
+    QppSrlzHeader header;
+    QpdEntryS elements[0];
+} __attribute__((packed));
+typedef struct QpdSrlzStream QpdSrlzStream;
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 typedef struct XAttrNode {
     uint8_t* value;
     ssize_t length;
     struct XAttrNode* next;
 } XAttrNode;
 
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -306,9 +413,15 @@ struct V9fsState
     Error *migration_blocker;
     V9fsConf fsconf;
     V9fsQID root_qid;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    struct qht qpd_table;
+#endif
     struct qht qpp_table;
     struct qht qpf_table;
-    uint16_t qp_prefix_next;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint64_t qp_ndevices; /* Amount of entries in qpd_table. */
+#endif
+    uint16_t qp_affix_next;
     uint64_t qp_fullpath_next;
 };
 
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH v2 4/4] 9p: use variable length suffixes for inode mapping
@ 2019-05-03 16:24   ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 11+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-05-03 16:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz, Antonios Motakis

This patch introduces variable length suffixes for being used for inode mapping
instead of the fixed 16 bit size prefixes of patch 1. With this patch the inode
numbers on guest will typically be much smaller (e.g. around >2^1 .. >2^7
instead of >2^48 with patch 1).

I preserved both solutions in this patch though, so you can switch between
them by adjusting P9_VARI_LENGTH_INODE_SUFFIXES, which probably also makes
code comparison between the two more easy for now.

Motivation: with patch 1 the inode numbers were so high that some network
services had issues. Especially SMB seems to be a problem again due to varying
implementations regarding SMB file IDs. I don't bother you with the details.

Additionally this solution should be more efficient, since inode numbers in
practice can take almost their entire 64 bit range on guest as well. Which
might also be beneficial for nested virtualization.

The "Exponential Golomb" algorithm is used as basis for generating the
variable length suffixes. The algorithm has a paramter k which controls the
distribution of bits on increasing indeces (minimum bits at low index vs.
maximum bits at high index). With k=0 the generated suffixes look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [1] (1 bits)
2 [10] -> [010] (3 bits)
3 [11] -> [110] (3 bits)
4 [100] -> [00100] (5 bits)
5 [101] -> [10100] (5 bits)
6 [110] -> [01100] (5 bits)
7 [111] -> [11100] (5 bits)
8 [1000] -> [0001000] (7 bits)
9 [1001] -> [1001000] (7 bits)
10 [1010] -> [0101000] (7 bits)
11 [1011] -> [1101000] (7 bits)
12 [1100] -> [0011000] (7 bits)
...
65533 [1111111111111101] ->  [1011111111111111000000000000000] (31 bits)
65534 [1111111111111110] ->  [0111111111111111000000000000000] (31 bits)
65535 [1111111111111111] ->  [1111111111111111000000000000000] (31 bits)
Hence minBits=1 maxBits=31

And with k=5 they would look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [000001] (6 bits)
2 [10] -> [100001] (6 bits)
3 [11] -> [010001] (6 bits)
4 [100] -> [110001] (6 bits)
5 [101] -> [001001] (6 bits)
6 [110] -> [101001] (6 bits)
7 [111] -> [011001] (6 bits)
8 [1000] -> [111001] (6 bits)
9 [1001] -> [000101] (6 bits)
10 [1010] -> [100101] (6 bits)
11 [1011] -> [010101] (6 bits)
12 [1100] -> [110101] (6 bits)
...
65533 [1111111111111101] -> [0011100000000000100000000000] (28 bits)
65534 [1111111111111110] -> [1011100000000000100000000000] (28 bits)
65535 [1111111111111111] -> [0111100000000000100000000000] (28 bits)
Hence minBits=6 maxBits=28

If somebody wants to play around with the details of the suffix generation, let
me know and I will send you a small C source for you to play with.

Additionally this patch disables persistency of file IDs (that I introduced
with patch 3) at compile time by default for now. You can still enable it by
setting QPP_TABLE_PERSISTENCY_LIMIT to some reasonable high value (e.g. 127).
Reason: I am still unresolved what to do with this feature. The original
motivation was to avoid file ID collisions with network services if the machine
was interrupted for a short period. But what I did not consider was that
device IDs might change on host, so after loading the tables from fs the qp
tables would grow with irrelevant entries.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 744 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h | 121 +++++++++-
 2 files changed, 805 insertions(+), 60 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 29c6dfc68a..003901a1e0 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -30,6 +30,10 @@
 #if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
 # include <linux/limits.h> /* for XATTR_SIZE_MAX */
 #endif
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <math.h>
 
 /*
  * How many bytes may we store to fs per extended attribute value?
@@ -585,6 +589,123 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
+/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
+static inline uint8_t mirror8bit(uint8_t byte) {
+    return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
+
+/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
+static inline uint64_t mirror64bit(uint64_t value) {
+    return ((uint64_t)mirror8bit( value        & 0xff) << 56) |
+           ((uint64_t)mirror8bit((value >> 8)  & 0xff) << 48) |
+           ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
+           ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
+           ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
+           ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
+           ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8 ) |
+           ((uint64_t)mirror8bit((value >> 56) & 0xff)      ) ;
+}
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Parameter k for the Exponential Golomb algorihm to be used.
+ *
+ * The smaller this value, the smaller the minimum bit count for the Exp.
+ * Golomb generated affixes will be (at lowest index) however for the
+ * price of having higher maximum bit count of generated affixes (at highest
+ * index). Likewise increasing this parameter yields in smaller maximum bit
+ * count for the price of having higher minimum bit count.
+ */
+#define EXP_GOLOMB_K    0
+
+# if !EXP_GOLOMB_K
+
+/** @brief Exponential Golomb algorithm limited to the case k=0.
+ * 
+ * See expGolombEncode() below for details.
+ * 
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix expGolombEncodeK0(uint64_t n) {
+    const int bits = (int) log2(n) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = n,
+        .bits = bits + bits - 1
+    };
+}
+
+# else
+
+/** @brief Exponential Golomb algorithm for arbitrary k (including k=0).
+ *
+ * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
+ * with growing length and with the mathematical property of being
+ * "prefix-free". The latter means the generated prefixes can be prepended
+ * in front of arbitrary numbers and the resulting concatenated numbers are
+ * guaranteed to be always unique.
+ *
+ * This is a minor adjustment to the original Exp. Golomb algorithm in the
+ * sense that lowest allowed index (@param n) starts with 1, not with zero.
+ *
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ * @param k - parameter k of Exp. Golomb algorithm to be used
+ *            (see comment on EXP_GOLOMB_K macro for details about k)
+ */
+static VariLenAffix expGolombEncode(uint64_t n, int k) {
+    const uint64_t value = n + (1 << k) - 1;
+    const int bits = (int) log2(value) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = value,
+        .bits = bits + MAX((bits - 1 - k), 0)
+    };
+}
+
+# endif /* !EXP_GOLOMB_K */
+
+/** @brief Converts a suffix into a prefix, or a prefix into a suffix.
+ *
+ * What this function does is actually mirroring all bits of the affix value,
+ * with the purpose to preserve respectively the mathematical "prefix-free"
+ * or "suffix-free" property after the conversion.
+ *
+ * In other words: if a passed prefix is suitable to create unique numbers,
+ * then the returned suffix is suitable to create unique numbers as well
+ * (and vice versa).
+ */
+static VariLenAffix invertAffix(const VariLenAffix* affix) {
+    return (VariLenAffix) {
+        .type = (affix->type == AffixType_Suffix) ? AffixType_Prefix : AffixType_Suffix,
+        .value =  mirror64bit(affix->value) >> ((sizeof(affix->value) * 8) - affix->bits),
+        .bits = affix->bits
+    };
+}
+
+/** @brief Generates suffix numbers with "suffix-free" property.
+ *
+ * This is just a wrapper function on top of the Exp. Golomb algorithm.
+ *
+ * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
+ * this function converts the Exp. Golomb prefixes into appropriate suffixes
+ * which are still suitable for generating unique numbers.
+ *
+ * @param n - natural number (or index) of the suffix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix affixForIndex(uint64_t index) {
+    VariLenAffix prefix;
+    #if EXP_GOLOMB_K
+    prefix = expGolombEncode(index, EXP_GOLOMB_K);
+    #else
+    prefix = expGolombEncodeK0(index);
+    #endif
+    return invertAffix(&prefix); /* convert prefix to suffix */
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* creative abuse of qemu_xxhash7, which is based on xxhash */
 static uint32_t qpp_hash(QppEntry e)
@@ -597,6 +718,16 @@ static uint32_t qpf_hash(QpfEntry e)
     return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
 }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+static bool qpd_cmp_func(const void *obj, const void *userp)
+{
+    const QpdEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 static bool qpp_cmp_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
@@ -620,6 +751,72 @@ static void qp_table_destroy(struct qht *ht)
     qht_destroy(ht);
 }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+# if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+static int v9fs_store_qpd_table(V9fsState *s);
+# endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+/*
+ * Returns how many (high end) bits of inode numbers of the passed fs
+ * device shall be used (in combination with the device number) to
+ * generate hash values for qpp_table entries.
+ *
+ * This function is required if variable length suffixes are used for inode
+ * number mapping on guest level. Since a device may end up having multiple
+ * entries in qpp_table, each entry most probably with a different suffix
+ * length, we thus need this function in conjunction with qpd_table to
+ * "agree" about a fix amount of bits (per device) to be always used for
+ * generating hash values for the purpose of accessing qpp_table in order
+ * get consistent behaviour when accessing qpp_table.
+ */
+static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
+{
+    QpdEntry lookup = {
+        .dev = dev
+    }, *val;
+    uint32_t hash = dev;
+    VariLenAffix affix;
+
+    val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
+    if (!val) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = lookup;
+        affix = affixForIndex(pdu->s->qp_affix_next);
+        val->prefix_bits = affix.bits;
+        qht_insert(&pdu->s->qpd_table, val, hash, NULL);
+        pdu->s->qp_ndevices++;
+
+        /*
+         * Store qpd_table as extended attribute(s) to file system.
+         * 
+         * TODO: This should better only be called from a guest shutdown and
+         * suspend handler.
+         */
+        #if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+        v9fs_store_qpd_table(pdu->s);
+        #endif
+    }
+    return val->prefix_bits;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+/** @brief Slow / full mapping host inode nr -> guest inode nr.
+ *
+ * This function performs a slower and much more costly remapping of an
+ * original file inode number on host to an appropriate different inode
+ * number on guest. For every (dev, inode) combination on host a new
+ * sequential number is generated, cached and exposed as inode number on
+ * guest.
+ *
+ * This is just a "last resort" fallback solution if the much faster/cheaper
+ * qid_path_prefixmap() failed. In practice this slow / full mapping is not
+ * expected ever to be used at all though.
+ *
+ * @see qid_path_prefixmap() for details
+ *
+ */
 static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
                             uint64_t *path)
 {
@@ -628,11 +825,9 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         .ino = stbuf->st_ino
     }, *val;
     uint32_t hash = qpf_hash(lookup);
-
-    /* most users won't need the fullmap, so init the table lazily */
-    if (!pdu->s->qpf_table.map) {
-        qht_init(&pdu->s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
-    }
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    VariLenAffix affix;
+#endif
 
     val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
 
@@ -646,8 +841,16 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         *val = lookup;
 
         /* new unique inode and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        affix = affixForIndex(
+            1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
+        );
+        val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
+        pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
+#else
         val->path = pdu->s->qp_fullpath_next++;
         pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+#endif
         qht_insert(&pdu->s->qpf_table, val, hash, NULL);
     }
 
@@ -660,18 +863,17 @@ static inline bool is_ro_export(FsContext *ctx)
     return ctx->export_flags & V9FS_RDONLY;
 }
 
-/*
- * Once qpp_table size exceeds this value, we no longer save
- * the table persistently. See comment in v9fs_store_qpp_table()
- */
-#define QPP_TABLE_PERSISTENCY_LIMIT 32768
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
 
-/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
-static void remove_qidp_xattr(FsContext *ctx)
+/* Remove all xattrs from export root matching the supplied xattr name_pattern. */
+static void remove_seq_xattr(FsContext *ctx, const char* name_pattern)
 {
     V9fsString name;
     int i;
 
+    if (is_ro_export(ctx))
+        return;
+
     /* just for a paranoid endless recursion sanity check */
     const ssize_t max_size =
         sizeof(QppSrlzHeader) +
@@ -679,83 +881,254 @@ static void remove_qidp_xattr(FsContext *ctx)
 
     v9fs_string_init(&name);
     for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
-        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
+        v9fs_string_sprintf(&name, name_pattern, i);
         if (lremovexattr(ctx->fs_root, name.data) < 0)
             break;
     }
     v9fs_string_free(&name);
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Remove all user.virtfs.system.qidd.* xattrs from export root. */
+static void remove_qidd_xattr(FsContext *ctx)
+{
+    remove_seq_xattr(ctx, "user.virtfs.system.qidd.%d");
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
+static void remove_qidp_xattr(FsContext *ctx)
+{
+    remove_seq_xattr(ctx, "user.virtfs.system.qidp.%d");
+}
+
+static void remove_qp_tables_xattrs(FsContext *ctx) {
+    #if P9_VARI_LENGTH_INODE_SUFFIXES
+    remove_qidd_xattr(ctx);
+    #endif
+    remove_qidp_xattr(ctx);
+}
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Used to convert qpd hash table into continuous stream. */
+static void qpd_table_serialize(struct qht *ht, void *p, uint32_t h, void *up)
+{
+    const QpdEntry *entry = (const QpdEntry*) p;
+    QpdSerialize *ser = (QpdSerialize*) up;
+
+    /* NOTE: Due to the simple incremental index solution we use here, this
+     * means the entries might be serialized in the final data stream with
+     * different order than the devices were originally inserted into the
+     * qpd_table. However specifically for qpd_table this is irrelevant ATM.
+     */
+    const uint32_t index = ser->done;
+
+    if (ser->error)
+        return;
+
+    /* safety first */
+    if (index >= ser->count) {
+        ser->error = -1;
+        return;
+    }
+
+    ser->elements[index] = (QpdEntryS) {
+        .dev = entry->dev,
+        .prefix_bits = entry->prefix_bits
+    };
+    ser->done++;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /* Used to convert qpp hash table into continuous stream. */
 static void qpp_table_serialize(void *p, uint32_t h, void *up)
 {
     const QppEntry *entry = (const QppEntry*) p;
     QppSerialize *ser = (QppSerialize*) up;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    const uint32_t index = entry->qp_affix_index - 1;
+#else
+    const uint32_t index = entry->qp_affix - 1;
+#endif
 
     if (ser->error)
         return;
 
     /* safety first */
-    if (entry->qp_prefix - 1 >= ser->count) {
+    if (index >= ser->count) {
         ser->error = -1;
         return;
     }
 
-    ser->elements[entry->qp_prefix - 1] = (QppEntryS) {
+    ser->elements[index] = (QppEntryS) {
         .dev = entry->dev,
         .ino_prefix = entry->ino_prefix
     };
     ser->done++;
 }
 
+/** @brief Returns true if we should still save qpp_table (and qpd_table).
+ *
+ * Whenever we exceeded some certain (arbitrary) high qpp_table size we
+ * delete the stored table(s) from the file system to get rid of old device
+ * ids / inodes that might no longer exist with the goal to potentially
+ * yield in a smaller table size after next reboot and smaller inode
+ * numbers on guest.
+ */
+static bool shouldStoreQpTables(V9fsState *s) {
+    return s->qp_affix_next && s->qp_affix_next < QPP_TABLE_PERSISTENCY_LIMIT;
+}
+
+/* See shouldStoreQpTables() above */
+static bool shouldDeleteQpTables(V9fsState *s) {
+    return s->qp_affix_next == QPP_TABLE_PERSISTENCY_LIMIT;
+}
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Tries to store the current qpd_table as extended attribute(s) on the
+ * exported file system root with the goal to preserve identical qids
+ * beyond the scope of reboots / suspends.
+ *
+ * @returns 0 on success
+ */
+static int v9fs_store_qpd_table(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    V9fsString name;
+    int i, res, err = 0;
+    size_t size;
+    QpdSrlzStream* stream;
+    QpdSerialize ser;
+
+    if (is_ro_export(ctx))
+        return 0;
+
+    if (!shouldStoreQpTables(s)) {
+        if (shouldDeleteQpTables(s)) {
+            remove_qp_tables_xattrs(ctx);
+        }
+        return 0;
+    }
+
+    /* Convert qpd hash table into continuous array. */
+    size = sizeof(QppSrlzHeader) +
+           ( s->qp_ndevices /* qpd_table entry count */ * sizeof(QpdEntryS) );
+    stream = g_malloc0(size);
+    ser = (QpdSerialize) {
+        .elements = &stream->elements[0],
+        .count = s->qp_ndevices,
+        .done  = 0,
+        .error = 0,
+    };
+    qht_iter(&s->qpd_table, qpd_table_serialize, &ser);
+    if (ser.error || ser.done != ser.count) {
+        err = -1;
+        goto out;
+    }
+
+    /* initialize header and calculate CRC32 checksum */
+    stream->header = (QppSrlzHeader) {
+        .version = 1,
+        .options = 0, /* not used for storing the qpd_table yet */
+        .crc32 = crc32c(
+            0xffffffff,
+            (const uint8_t*) &stream->elements[0],
+            (ser.count * sizeof(QpdEntryS))
+        ),
+    };
+
+    /*
+     * Actually just required if the qpd_table size decreased, or if the
+     * previous xattr size limit increased on OS (kernel/fs) level.
+     */
+    remove_qidd_xattr(ctx);
+
+    /*
+     * Subdivide (if required) the data stream into individual xattrs
+     * to cope with the system's max. supported xattr value size.
+     */
+    v9fs_string_init(&name);
+    for (i = 0; size > (i * ATTR_MAX_SIZE); ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidd.%d", i);
+        res = lsetxattr(
+            ctx->fs_root,
+            name.data,
+            ((const uint8_t*)stream) + i * ATTR_MAX_SIZE,
+            MIN(ATTR_MAX_SIZE, size - i * ATTR_MAX_SIZE),
+            0/*flags*/
+        );
+        if (res < 0) {
+            if (i > 0)
+                remove_qidd_xattr(ctx);
+            err = -1;
+            break;
+        }
+    }
+    v9fs_string_free(&name);
+out:
+    g_free(stream);
+
+    return err;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Tries to store the current qpp_table as extended attribute(s) on the
  * exported file system root with the goal to preserve identical qids
- * beyond the scope of reboots.
+ * beyond the scope of reboots / suspends.
+ *
+ * @returns 0 on success
  */
-static void v9fs_store_qpp_table(V9fsState *s)
+static int v9fs_store_qpp_table(V9fsState *s)
 {
     FsContext *ctx = &s->ctx;
     V9fsString name;
-    int i, res;
+    int i, res, err = 0;
     size_t size;
     QppSrlzStream* stream;
     QppSerialize ser;
 
     if (is_ro_export(ctx))
-        return;
+        return 0;
 
-    /*
-     * Whenever we exceeded some certain (arbitrary) high qpp_table size we
-     * delete the stored table from the file system to get rid of old device
-     * ids / inodes that might no longer exist with the goal to potentially
-     * yield in a smaller table size after next reboot.
-     */
-    if (!s->qp_prefix_next || s->qp_prefix_next >= QPP_TABLE_PERSISTENCY_LIMIT) {
-        if (s->qp_prefix_next == QPP_TABLE_PERSISTENCY_LIMIT) {
-            remove_qidp_xattr(ctx);
+    if (!shouldStoreQpTables(s)) {
+        if (shouldDeleteQpTables(s)) {
+            remove_qp_tables_xattrs(ctx);
         }
-        return;
+        return 0;
     }
 
     /* Convert qpp hash table into continuous array. */
     size = sizeof(QppSrlzHeader) +
-           ( (s->qp_prefix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
+           ( (s->qp_affix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
     stream = g_malloc0(size);
     ser = (QppSerialize) {
         .elements = &stream->elements[0],
-        .count = s->qp_prefix_next - 1,
+        .count = s->qp_affix_next - 1,
         .done  = 0,
         .error = 0,
     };
     qht_iter(&s->qpp_table, qpp_table_serialize, &ser);
-    if (ser.error || ser.done != ser.count)
+    if (ser.error || ser.done != ser.count) {
+        err = -1;
         goto out;
+    }
 
     /* initialize header and calculate CRC32 checksum */
     stream->header = (QppSrlzHeader) {
         .version = 1,
-        .reserved = 0,
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        .options = EXP_GOLOMB_K + 1,
+#else
+        .options = 0,
+#endif
         .crc32 = crc32c(
             0xffffffff,
             (const uint8_t*) &stream->elements[0],
@@ -786,12 +1159,15 @@ static void v9fs_store_qpp_table(V9fsState *s)
         if (res < 0) {
             if (i > 0)
                 remove_qidp_xattr(ctx);
+            err = -1;
             break;
         }
     }
     v9fs_string_free(&name);
 out:
     g_free(stream);
+
+    return err;
 }
 
 /* Frees the entire chain of passed nodes from memory. */
@@ -809,6 +1185,55 @@ static void destroy_xattr_nodes(XAttrNode **first)
     }
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Loads all user.virtfs.system.qidd.* xattrs from exported fs root and
+ * returns a linked list with one node per xattr.
+ */
+static XAttrNode* v9fs_load_qidd_xattr_nodes(V9fsState *s)
+{
+    FsContext *ctx = &s->ctx;
+    XAttrNode *first = NULL, *current = NULL;
+    V9fsString name;
+    ssize_t size;
+    int i;
+
+    const ssize_t max_size =
+        sizeof(QppSrlzHeader) +
+        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QpdEntryS);
+
+    v9fs_string_init(&name);
+
+    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
+        v9fs_string_sprintf(&name, "user.virtfs.system.qidd.%d", i);
+        size = lgetxattr(ctx->fs_root, name.data, NULL, 0);
+        if (size <= 0)
+            break;
+        if (!first) {
+            first = current = g_malloc0(sizeof(XAttrNode));
+        } else {
+            current = current->next = g_malloc0(sizeof(XAttrNode));
+        }
+        current->value = g_malloc0(size);
+        current->length = lgetxattr(
+            ctx->fs_root, name.data, current->value, size
+        );
+        if (current->length <= 0) {
+            goto out_w_err;
+        }
+    }
+    goto out;
+
+out_w_err:
+    destroy_xattr_nodes(&first);
+out:
+    v9fs_string_free(&name);
+    return first;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Loads all user.virtfs.system.qidp.* xattrs from exported fs root and
  * returns a linked list with one node per xattr.
@@ -854,28 +1279,121 @@ out:
     return first;
 }
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Try to load previously stored qpd_table from file system. Calling this
+ * function assumes that qpd_table is yet empty.
+ *
+ * @returns number of entries loaded on success or negative number on error
+ * @see v9fs_store_qpd_table()
+ */
+static int v9fs_load_qpd_table(V9fsState *s)
+{
+    ssize_t size, count = 0;
+    XAttrNode *current, *first;
+    QpdSrlzStream* stream = NULL;
+    uint32_t crc32;
+    int i, err = 0;
+    QpdEntry *val;
+    uint32_t hash;
+
+    if (s->qp_ndevices != 0)
+        return -1;
+
+    first = v9fs_load_qidd_xattr_nodes(s);
+    if (!first)
+        return -1;
+
+    /* convert nodes into continuous stream */
+    size = 0;
+    for (current = first; current; current = current->next) {
+        size += current->length;
+    }
+    if (size <= 0) {
+        err = -1;
+        goto out;
+    }
+    stream = g_malloc0(size);
+    size = 0;
+    for (current = first; current; current = current->next) {
+        memcpy(((uint8_t*)stream) + size, current->value, current->length);
+        size += current->length;
+    }
+
+    if (stream->header.version != 1) {
+        err = -1;
+        goto out;
+    }
+
+    count = (size - sizeof(QppSrlzHeader)) / sizeof(QpdEntryS);
+    if (count <= 0) {
+        err = -1;
+        goto out;
+    }
+
+    /* verify CRC32 checksum of stream */
+    crc32 = crc32c(
+        0xffffffff,
+        (const uint8_t*) &stream->elements[0],
+        (count * sizeof(QpdEntryS))
+    );
+    if (crc32 != stream->header.crc32) {
+        err = -1;
+        goto out;
+    }
+
+    /* fill qpd_table with the retrieved elements */
+    for (i = 0; i < count; ++i) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = (QpdEntry) {
+            .dev = stream->elements[i].dev,
+            .prefix_bits = stream->elements[i].prefix_bits,
+        };
+        hash = val->dev;
+        if (qht_lookup(&s->qpd_table, val, hash)) {
+            /* should never happen: duplicate entry detected */
+            g_free(val);
+            err = -1;
+            goto out;
+        }
+        qht_insert(&s->qpd_table, val, hash, NULL);
+        s->qp_ndevices++;
+    }
+
+out:
+    destroy_xattr_nodes(&first);
+    if (stream)
+        g_free(stream);
+
+    return err ? err : count;
+}
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 /*
  * Try to load previously stored qpp_table from file system. Calling this
  * function assumes that qpp_table is yet empty.
  *
+ * @returns number of entries loaded on success or negative number on error
  * @see v9fs_store_qpp_table()
  */
-static void v9fs_load_qpp_table(V9fsState *s)
+static int v9fs_load_qpp_table(V9fsState *s)
 {
-    ssize_t size, count;
+    ssize_t size, count = 0;
     XAttrNode *current, *first;
     QppSrlzStream* stream = NULL;
     uint32_t crc32;
-    int i;
+    int i, err = 0;
     QppEntry *val;
     uint32_t hash;
 
-    if (s->qp_prefix_next != 1)
-        return;
+    if (s->qp_affix_next != 1)
+        return -1;
 
     first = v9fs_load_qidp_xattr_nodes(s);
     if (!first)
-        return;
+        return -1;
 
     /* convert nodes into continuous stream */
     size = 0;
@@ -883,6 +1401,7 @@ static void v9fs_load_qpp_table(V9fsState *s)
         size += current->length;
     }
     if (size <= 0) {
+        err = -1;
         goto out;
     }
     stream = g_malloc0(size);
@@ -893,11 +1412,24 @@ static void v9fs_load_qpp_table(V9fsState *s)
     }
 
     if (stream->header.version != 1) {
+        err = -1;
+        goto out;
+    }
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    if ((stream->header.options - 1) != EXP_GOLOMB_K) {
+        err = -1;
         goto out;
     }
+#else
+    if (stream->header.options != 0) {
+        err = -1;
+        goto out;
+    }
+#endif
 
     count = (size - sizeof(QppSrlzHeader)) / sizeof(QppEntryS);
     if (count <= 0) {
+        err = -1;
         goto out;
     }
 
@@ -908,6 +1440,7 @@ static void v9fs_load_qpp_table(V9fsState *s)
         (count * sizeof(QppEntryS))
     );
     if (crc32 != stream->header.crc32) {
+        err = -1;
         goto out;
     }
 
@@ -922,9 +1455,15 @@ static void v9fs_load_qpp_table(V9fsState *s)
         if (qht_lookup(&s->qpp_table, val, hash)) {
             /* should never happen: duplicate entry detected */
             g_free(val);
+            err = -1;
             goto out;
         }
-        val->qp_prefix = s->qp_prefix_next++;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        val->qp_affix_index = s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+#else
+        val->qp_affix = s->qp_affix_next++;
+#endif
         qht_insert(&s->qpp_table, val, hash, NULL);
     }
 
@@ -932,40 +1471,109 @@ out:
     destroy_xattr_nodes(&first);
     if (stream)
         g_free(stream);
+
+    return err ? err : count;
+}
+
+/* Tries to load qpp_table (and qpd_table) from xattrs being saved on export root. */
+static void v9fs_load_qp_tables(V9fsState *s)
+{
+    int res;
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    res = v9fs_load_qpd_table(s);
+    if (res > 0) {
+        res = v9fs_load_qpp_table(s);
+    }
+#else
+    res = v9fs_load_qpp_table(s);
+#endif
+
+    /* if there was any error, remove the xattrs */
+    if (res < 0) {
+        remove_qp_tables_xattrs(&s->ctx);
+    }
 }
 
-/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+/* Tries to store qpp_table (and qpd_table) as xattrs on export root. */
+static void v9fs_store_qp_tables(V9fsState *s)
+{
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    int res;
+
+    res = v9fs_store_qpd_table(s);
+    if (res >= 0) {
+        res = v9fs_store_qpp_table(s);
+    }
+
+    /* if there was any error, remove the xattrs immediately again */
+    if (res < 0) {
+        remove_qp_tables_xattrs(&s->ctx);
+    }
+#else
+    v9fs_store_qpp_table(s);
+#endif
+}
+
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+/** @brief Quick mapping host inode nr -> guest inode nr.
+ *
+ * This function performs quick remapping of an original file inode number
+ * on host to an appropriate different inode number on guest. This remapping
+ * of inodes is required to avoid inode nr collisions on guest which would
+ * happen if the 9p export contains more than 1 exported file system (or
+ * more than 1 file system data set), because unlike on host level where the
+ * files would have different device nrs, all files exported by 9p would
+ * share the same device nr on guest (the device nr of the virtual 9p device
+ * that is).
+ *
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 0 :
+ * stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
  * the device id to the 16 highest bits of the QID path. The 48 lowest bits
  * of the QID path equal to the lowest bits of the inode number.
  *
- * This takes advantage of the fact that inode number are usually not
- * random but allocated sequentially, so we have fewer items to keep
- * track of.
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 1 :
+ * Instead of fixed size (16 bit) generated prefix, we use variable size
+ * suffixes instead. See comment on P9_VARI_LENGTH_INODE_SUFFIXES for
+ * details.
  */
 static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
                                 uint64_t *path)
 {
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
+#endif
     QppEntry lookup = {
         .dev = stbuf->st_dev,
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> (64-ino_hash_bits))
+#else
         .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+#endif
     }, *val;
     uint32_t hash = qpp_hash(lookup);
 
     val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
 
     if (!val) {
-        if (pdu->s->qp_prefix_next == 0) {
-            /* we ran out of prefixes */
+        if (pdu->s->qp_affix_next == 0) {
+            /* we ran out of affixes */
             return -ENFILE;
         }
 
         val = g_malloc0(sizeof(QppEntry));
         *val = lookup;
 
-        /* new unique inode prefix and device combo */
-        val->qp_prefix = pdu->s->qp_prefix_next++;
+        /* new unique inode affix and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        val->qp_affix_index = pdu->s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+#else
+        val->qp_affix = pdu->s->qp_affix_next++;
+#endif
         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
 
         /*
@@ -974,10 +1582,16 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
          * TODO: This should better only be called from a guest shutdown and
          * suspend handler.
          */
+        #if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
         v9fs_store_qpp_table(pdu->s);
+        #endif
     }
-
-    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    /* assuming generated affix to be suffix type, not prefix */
+    *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
+#else
+    *path = ((uint64_t)val->qp_affix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#endif
     return 0;
 }
 
@@ -988,7 +1602,7 @@ static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
     /* map inode+device to qid path (fast path) */
     err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
     if (err == -ENFILE) {
-        /* fast path didn't work, fal back to full map */
+        /* fast path didn't work, fall back to full map */
         err = qid_path_fullmap(pdu, stbuf, &qidp->path);
     }
     if (err) {
@@ -4075,12 +4689,22 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
         goto out;
     }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qht_init(&s->qpd_table, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+#endif
+    /* most users won't need the fullmap, so init the table lazily */
+    qht_init(&s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
     /* QID path hash table. 1 entry ought to be enough for anybody ;) */
     qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
-    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    s->qp_ndevices = 0;
+#endif
+    s->qp_affix_next = 1; /* reserve 0 to detect overflow */
     s->qp_fullpath_next = 1;
-    /* try to load and restore previous qpp_table */
-    v9fs_load_qpp_table(s);
+    /* try to load and restore previous qpd_table and qpp_table */
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+    v9fs_load_qp_tables(s);
+#endif
 
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
@@ -4095,6 +4719,9 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        qp_table_destroy(&s->qpd_table);
+#endif
         qp_table_destroy(&s->qpp_table);
         qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
@@ -4105,18 +4732,23 @@ out:
 void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
 {
     /*
-     * Store qpp_table as extended attribute(s) to file system.
+     * Store qpd_table and qpp_table as extended attribute(s) to file system.
      *
      * This was actually plan A, but unfortunately unserialize is not called
      * reliably on guest shutdowns and suspensions.
      */
-    v9fs_store_qpp_table(s);
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+    v9fs_store_qp_tables(s);
+#endif
 
     if (s->ops->cleanup) {
         s->ops->cleanup(&s->ctx);
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qp_table_destroy(&s->qpd_table);
+#endif
     qp_table_destroy(&s->qpp_table);
     qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 54ce039969..78c29fb916 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -236,15 +236,95 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
-#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+/*
+ * Defines how inode numbers from host shall be remapped on guest.
+ *
+ * When this compile time option is disabled then fixed length (16 bit)
+ * prefix values are used for all inode numbers on guest level. Accordingly
+ * guest's inode numbers will be quite large (>2^48).
+ *
+ * If this option is enabled then variable length suffixes will be used for
+ * guest's inode numbers instead which usually yields in much smaller inode
+ * numbers on guest level (typically around >2^1 .. >2^7).
+ */
+#define P9_VARI_LENGTH_INODE_SUFFIXES 1
+
+/*
+ * If this value is higher than 0 then file inode remapping is tried to be
+ * retained consistent beyond reboots/suspends.
+ *
+ * Once qpp_table size exceeds this value, we no longer save
+ * the table persistently. See comment in v9fs_store_qpp_table()
+ *
+ * By setting this to 0, persistency of file ids will (beyond the scope of
+ * reboots/suspends) will be disabled completely at compile time.
+ */
+#define QPP_TABLE_PERSISTENCY_LIMIT 0
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+typedef enum AffixType_t {
+    AffixType_Prefix,
+    AffixType_Suffix, /* A.k.a. postfix. */
+} AffixType_t;
+
+/** @brief Unique affix of variable length.
+ *
+ * An affix is (currently) either a suffix or a prefix, which is either
+ * going to be prepended (prefix) or appended (suffix) with some other
+ * number for the goal to generate unique numbers. Accordingly the
+ * suffixes (or prefixes) we generate @b must all have the mathematical
+ * property of being suffix-free (or prefix-free in case of prefixes)
+ * so that no matter what number we concatenate the affix with, that we
+ * always reliably get unique numbers as result after concatenation.
+ */
+typedef struct VariLenAffix {
+    AffixType_t type; /* Whether this affix is a suffix or a prefix. */
+    uint64_t value; /* Actual numerical value of this affix. */
+    int bits; /* Lenght of the affix, that is how many (of the lowest) bits of @c value must be used for appending/prepending this affix to its final resulting, unique number. */
+} VariLenAffix;
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+#if !P9_VARI_LENGTH_INODE_SUFFIXES
+# define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+#endif
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* See qid_inode_prefix_hash_bits(). */
+typedef struct {
+    dev_t dev; /* FS device on host. */
+    int prefix_bits; /* How many (high) bits of the original inode number shall be used for hashing. */
+} QpdEntry;
+
+# if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
+/* Small version of QpdEntry for serialization as xattr. */
+struct QpdEntryS {
+    dev_t dev;
+    uint8_t prefix_bits;
+}  __attribute__((packed));
+typedef struct QpdEntryS QpdEntryS;
+
+# endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* QID path prefix entry, see stat_to_qid */
 typedef struct {
     dev_t dev;
     uint16_t ino_prefix;
-    uint16_t qp_prefix;
+    #if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint32_t qp_affix_index;
+    VariLenAffix qp_affix;
+    #else
+    uint16_t qp_affix;
+    #endif
 } QppEntry;
 
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
 /* Small version of QppEntry for serialization as xattr. */
 struct QppEntryS {
     dev_t dev;
@@ -252,6 +332,8 @@ struct QppEntryS {
 } __attribute__((packed));
 typedef struct QppEntryS QppEntryS;
 
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
 /* QID path full entry, as above */
 typedef struct {
     dev_t dev;
@@ -259,6 +341,19 @@ typedef struct {
     uint64_t path;
 } QpfEntry;
 
+#if (QPP_TABLE_PERSISTENCY_LIMIT > 0)
+
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+typedef struct {
+    QpdEntryS *elements;
+    uint count; /* In: QpdEntryS count in @a elements */
+    uint done; /* Out: how many QpdEntryS did we actually fill in @a elements */
+    int error; /* Out: zero on success */
+} QpdSerialize;
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 typedef struct {
     QppEntryS *elements;
     uint count; /* In: QppEntryS count in @a elements */
@@ -268,7 +363,7 @@ typedef struct {
 
 struct QppSrlzHeader {
     uint16_t version;
-    uint16_t reserved; /* might be used e.g. for flags in future */
+    uint16_t options;
     uint32_t crc32;
 } __attribute__((packed));
 typedef struct QppSrlzHeader QppSrlzHeader;
@@ -279,12 +374,24 @@ struct QppSrlzStream {
 } __attribute__((packed));
 typedef struct QppSrlzStream QppSrlzStream;
 
+# if P9_VARI_LENGTH_INODE_SUFFIXES
+
+struct QpdSrlzStream {
+    QppSrlzHeader header;
+    QpdEntryS elements[0];
+} __attribute__((packed));
+typedef struct QpdSrlzStream QpdSrlzStream;
+
+# endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
 typedef struct XAttrNode {
     uint8_t* value;
     ssize_t length;
     struct XAttrNode* next;
 } XAttrNode;
 
+#endif /* (QPP_TABLE_PERSISTENCY_LIMIT > 0) */
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -306,9 +413,15 @@ struct V9fsState
     Error *migration_blocker;
     V9fsConf fsconf;
     V9fsQID root_qid;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    struct qht qpd_table;
+#endif
     struct qht qpp_table;
     struct qht qpf_table;
-    uint16_t qp_prefix_next;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint64_t qp_ndevices; /* Amount of entries in qpd_table. */
+#endif
+    uint16_t qp_affix_next;
     uint64_t qp_fullpath_next;
 };
 
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/4] 9p: mitigates most QID path collisions
  2019-05-03 16:21   ` Christian Schoenebeck via Qemu-devel
  (?)
@ 2019-05-07  9:29   ` Greg Kurz
  -1 siblings, 0 replies; 11+ messages in thread
From: Greg Kurz @ 2019-05-07  9:29 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel, Antonios Motakis

On Fri, 03 May 2019 18:21:36 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> This first patch here is an updated version of Antonios Motakis'
> original 4-patch set, merged to one patch:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02283.html
> 
> * Updated to latest git master, specifically to new qht interface.
> 
> * Merged the original 4 patches to this single patch.
> 

Not sure this helps review... Especially, we had asked for some changes
at the time and I don't see them in this unique patch...

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  fsdev/9p-marshal.h |   4 +-
>  hw/9pfs/9p.c       | 200 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>  hw/9pfs/9p.h       |  21 ++++++
>  3 files changed, 204 insertions(+), 21 deletions(-)
> 
> diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
> index c8823d878f..d1ad3645c4 100644
> --- a/fsdev/9p-marshal.h
> +++ b/fsdev/9p-marshal.h
> @@ -10,8 +10,8 @@ typedef struct V9fsString
>  typedef struct V9fsQID
>  {
>      int8_t type;

... changing type to uint8_t or...

> -    int32_t version;
> -    int64_t path;
> +    uint32_t version;
> +    uint64_t path;

... fixing the locations that still expect signed version and path...

>  } V9fsQID;
>  
>  typedef struct V9fsStat
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 55821343e5..b9bbdcbaee 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -25,6 +25,7 @@
>  #include "trace.h"
>  #include "migration/blocker.h"
>  #include "sysemu/qtest.h"
> +#include "qemu/xxhash.h"
>  
>  int open_fd_hw;
>  int total_open_fd;
> @@ -571,14 +572,135 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
>                                  P9_STAT_MODE_NAMED_PIPE |   \
>                                  P9_STAT_MODE_SOCKET)
>  
> -/* This is the algorithm from ufs in spfs */
> -static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
> +
> +/* creative abuse of qemu_xxhash7, which is based on xxhash */
> +static uint32_t qpp_hash(QppEntry e)
>  {
> -    size_t size;
> +    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
> +}
> +
> +static uint32_t qpf_hash(QpfEntry e)
> +{
> +    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
> +}
> +
> +static bool qpp_cmp_func(const void *obj, const void *userp)
> +{
> +    const QppEntry *e1 = obj, *e2 = userp;
> +    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
> +}
> +
> +static bool qpf_cmp_func(const void *obj, const void *userp)
> +{
> +    const QpfEntry *e1 = obj, *e2 = userp;
> +    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
> +}
> +
> +static void qp_table_remove(void *p, uint32_t h, void *up)
> +{
> +    g_free(p);
> +}
> +
> +static void qp_table_destroy(struct qht *ht)
> +{
> +    qht_iter(ht, qp_table_remove, NULL);
> +    qht_destroy(ht);
> +}
> +
> +static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
> +                            uint64_t *path)
> +{
> +    QpfEntry lookup = {
> +        .dev = stbuf->st_dev,
> +        .ino = stbuf->st_ino
> +    }, *val;
> +    uint32_t hash = qpf_hash(lookup);
> +
> +    /* most users won't need the fullmap, so init the table lazily */
> +    if (!pdu->s->qpf_table.map) {
> +        qht_init(&pdu->s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
> +    }
> +
> +    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
> +
> +    if (!val) {
> +        if (pdu->s->qp_fullpath_next == 0) {
> +            /* no more files can be mapped :'( */
> +            return -ENFILE;
> +        }
> +
> +        val = g_malloc0(sizeof(QppEntry));
> +        if (!val) {
> +            return -ENOMEM;
> +        }

... and this useless error path since g_malloc0() always returns a
non-null values, otherwise aborts.

https://developer.gnome.org/glib/stable/glib-Memory-Allocation.html#glib-Memory-Allocation.description

Please address all the comments and make this four patches again when
you repost.

> +        *val = lookup;
> +
> +        /* new unique inode and device combo */
> +        val->path = pdu->s->qp_fullpath_next++;
> +        pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
> +        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
> +    }
> +
> +    *path = val->path;
> +    return 0;
> +}
> +
> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> + * to a unique QID path (64 bits). To avoid having to map and keep track
> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
> + * of the QID path equal to the lowest bits of the inode number.
> + *
> + * This takes advantage of the fact that inode number are usually not
> + * random but allocated sequentially, so we have fewer items to keep
> + * track of.
> + */
> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> +                                uint64_t *path)
> +{
> +    QppEntry lookup = {
> +        .dev = stbuf->st_dev,
> +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> +    }, *val;
> +    uint32_t hash = qpp_hash(lookup);
> +
> +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> +
> +    if (!val) {
> +        if (pdu->s->qp_prefix_next == 0) {
> +            /* we ran out of prefixes */
> +            return -ENFILE;
> +        }
> +
> +        val = g_malloc0(sizeof(QppEntry));
> +        if (!val) {
> +            return -ENOMEM;
> +        }
> +        *val = lookup;
> +
> +        /* new unique inode prefix and device combo */
> +        val->qp_prefix = pdu->s->qp_prefix_next++;
> +        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
> +    }
> +
> +    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> +    return 0;
> +}
> +
> +static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
> +{
> +    int err;
> +
> +    /* map inode+device to qid path (fast path) */
> +    err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> +    if (err == -ENFILE) {
> +        /* fast path didn't work, fal back to full map */
> +        err = qid_path_fullmap(pdu, stbuf, &qidp->path);
> +    }
> +    if (err) {
> +        return err;
> +    }
>  
> -    memset(&qidp->path, 0, sizeof(qidp->path));
> -    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> -    memcpy(&qidp->path, &stbuf->st_ino, size);
>      qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
>      qidp->type = 0;
>      if (S_ISDIR(stbuf->st_mode)) {
> @@ -587,6 +709,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
>      if (S_ISLNK(stbuf->st_mode)) {
>          qidp->type |= P9_QID_TYPE_SYMLINK;
>      }
> +
> +    return 0;
>  }
>  
>  static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
> @@ -599,7 +723,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
>      if (err < 0) {
>          return err;
>      }
> -    stat_to_qid(&stbuf, qidp);
> +    err = stat_to_qid(pdu, &stbuf, qidp);
> +    if (err < 0) {
> +        return err;
> +    }
>      return 0;
>  }
>  
> @@ -830,7 +957,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
>  
>      memset(v9stat, 0, sizeof(*v9stat));
>  
> -    stat_to_qid(stbuf, &v9stat->qid);
> +    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
> +    if (err < 0) {
> +        return err;
> +    }
>      v9stat->mode = stat_to_v9mode(stbuf);
>      v9stat->atime = stbuf->st_atime;
>      v9stat->mtime = stbuf->st_mtime;
> @@ -891,7 +1021,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
>  #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
>  
>  
> -static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
> +static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
>                                  V9fsStatDotl *v9lstat)
>  {
>      memset(v9lstat, 0, sizeof(*v9lstat));
> @@ -913,7 +1043,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
>      /* Currently we only support BASIC fields in stat */
>      v9lstat->st_result_mask = P9_STATS_BASIC;
>  
> -    stat_to_qid(stbuf, &v9lstat->qid);
> +    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
>  }
>  
>  static void print_sg(struct iovec *sg, int cnt)
> @@ -1115,7 +1245,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
>      uint64_t request_mask;
>      V9fsStatDotl v9stat_dotl;
>      V9fsPDU *pdu = opaque;
> -    V9fsState *s = pdu->s;
>  
>      retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
>      if (retval < 0) {
> @@ -1136,7 +1265,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
>      if (retval < 0) {
>          goto out;
>      }
> -    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
> +    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
> +    if (retval < 0) {
> +        goto out;
> +    }
>  
>      /*  fill st_gen if requested and supported by underlying fs */
>      if (request_mask & P9_STATS_GEN) {
> @@ -1381,7 +1513,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
>              if (err < 0) {
>                  goto out;
>              }
> -            stat_to_qid(&stbuf, &qid);
> +            err = stat_to_qid(pdu, &stbuf, &qid);
> +            if (err < 0) {
> +                goto out;
> +            }
>              v9fs_path_copy(&dpath, &path);
>          }
>          memcpy(&qids[name_idx], &qid, sizeof(qid));
> @@ -1483,7 +1618,10 @@ static void coroutine_fn v9fs_open(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      if (S_ISDIR(stbuf.st_mode)) {
>          err = v9fs_co_opendir(pdu, fidp);
>          if (err < 0) {
> @@ -1593,7 +1731,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
>          fidp->flags |= FID_NON_RECLAIMABLE;
>      }
>      iounit =  get_iounit(pdu, &fidp->path);
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
>      if (err < 0) {
>          goto out;
> @@ -2327,7 +2468,10 @@ static void coroutine_fn v9fs_create(void *opaque)
>          }
>      }
>      iounit = get_iounit(pdu, &fidp->path);
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
>      if (err < 0) {
>          goto out;
> @@ -2384,7 +2528,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err =  pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3064,7 +3211,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3222,7 +3372,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3633,6 +3786,11 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>          goto out;
>      }
>  
> +    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
> +    qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
> +    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
> +    s->qp_fullpath_next = 1;
> +
>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
>  
> @@ -3646,6 +3804,8 @@ out:
>          }
>          g_free(s->tag);
>          g_free(s->ctx.fs_root);
> +        qp_table_destroy(&s->qpp_table);
> +        qp_table_destroy(&s->qpf_table);
>          v9fs_path_free(&path);
>      }
>      return rc;
> @@ -3658,6 +3818,8 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>      }
>      fsdev_throttle_cleanup(s->ctx.fst);
>      g_free(s->tag);
> +    qp_table_destroy(&s->qpp_table);
> +    qp_table_destroy(&s->qpf_table);
>      g_free(s->ctx.fs_root);
>  }
>  
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 8883761b2c..44112ea97f 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -8,6 +8,7 @@
>  #include "fsdev/9p-iov-marshal.h"
>  #include "qemu/thread.h"
>  #include "qemu/coroutine.h"
> +#include "qemu/qht.h"
>  
>  enum {
>      P9_TLERROR = 6,
> @@ -235,6 +236,22 @@ struct V9fsFidState
>      V9fsFidState *rclm_lst;
>  };
>  
> +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> +
> +/* QID path prefix entry, see stat_to_qid */
> +typedef struct {
> +    dev_t dev;
> +    uint16_t ino_prefix;
> +    uint16_t qp_prefix;
> +} QppEntry;
> +
> +/* QID path full entry, as above */
> +typedef struct {
> +    dev_t dev;
> +    ino_t ino;
> +    uint64_t path;
> +} QpfEntry;
> +
>  struct V9fsState
>  {
>      QLIST_HEAD(, V9fsPDU) free_list;
> @@ -256,6 +273,10 @@ struct V9fsState
>      Error *migration_blocker;
>      V9fsConf fsconf;
>      V9fsQID root_qid;
> +    struct qht qpp_table;
> +    struct qht qpf_table;
> +    uint16_t qp_prefix_next;
> +    uint64_t qp_fullpath_next;
>  };
>  
>  /* 9p2000.L open flags */



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH v2 2/4] 9P: trivial cleanup of QID path collision mitigation
  2019-05-03 16:22   ` Christian Schoenebeck via Qemu-devel
  (?)
@ 2019-05-07  9:36   ` Greg Kurz
  -1 siblings, 0 replies; 11+ messages in thread
From: Greg Kurz @ 2019-05-07  9:36 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel, Antonios Motakis

On Fri, 03 May 2019 18:22:12 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> Addresses trivial changes regarding the previous patch as requested
> on the mailing list a while ago.
> 

Ah... so that explains why I couldn't find the changes in the other
patch.

> * Removed unneccessary parantheses:
>   https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02661.html
> 
> * Removed unneccessary g_malloc() result checks:
>   https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02814.html
> 
> * Unsigned type changes:
>   https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02581.html
> 

I'd prefer to see these applied to Antonio's original patches
along with your rebase changes. It will be easier for me to
review, ie. more likely for your contribution to be merged fast. :)

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  fsdev/9p-marshal.h   |  2 +-
>  hw/9pfs/9p.c         | 16 +++++-----------
>  hw/9pfs/trace-events | 14 +++++++-------
>  3 files changed, 13 insertions(+), 19 deletions(-)
> 
> diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
> index d1ad3645c4..8f3babb60a 100644
> --- a/fsdev/9p-marshal.h
> +++ b/fsdev/9p-marshal.h
> @@ -9,7 +9,7 @@ typedef struct V9fsString
>  
>  typedef struct V9fsQID
>  {
> -    int8_t type;
> +    uint8_t type;
>      uint32_t version;
>      uint64_t path;
>  } V9fsQID;
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index b9bbdcbaee..2b893e25a1 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -587,13 +587,13 @@ static uint32_t qpf_hash(QpfEntry e)
>  static bool qpp_cmp_func(const void *obj, const void *userp)
>  {
>      const QppEntry *e1 = obj, *e2 = userp;
> -    return (e1->dev == e2->dev) && (e1->ino_prefix == e2->ino_prefix);
> +    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
>  }
>  
>  static bool qpf_cmp_func(const void *obj, const void *userp)
>  {
>      const QpfEntry *e1 = obj, *e2 = userp;
> -    return (e1->dev == e2->dev) && (e1->ino == e2->ino);
> +    return e1->dev == e2->dev && e1->ino == e2->ino;
>  }
>  
>  static void qp_table_remove(void *p, uint32_t h, void *up)
> @@ -630,9 +630,6 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>          }
>  
>          val = g_malloc0(sizeof(QppEntry));
> -        if (!val) {
> -            return -ENOMEM;
> -        }
>          *val = lookup;
>  
>          /* new unique inode and device combo */
> @@ -673,9 +670,6 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
>          }
>  
>          val = g_malloc0(sizeof(QppEntry));
> -        if (!val) {
> -            return -ENOMEM;
> -        }
>          *val = lookup;
>  
>          /* new unique inode prefix and device combo */
> @@ -870,9 +864,9 @@ static int donttouch_stat(V9fsStat *stat)
>  {
>      if (stat->type == -1 &&
>          stat->dev == -1 &&
> -        stat->qid.type == -1 &&
> -        stat->qid.version == -1 &&
> -        stat->qid.path == -1 &&
> +        stat->qid.type == 0xff &&
> +        stat->qid.version == (uint32_t) -1 &&
> +        stat->qid.path == (uint64_t) -1 &&
>          stat->mode == -1 &&
>          stat->atime == -1 &&
>          stat->mtime == -1 &&
> diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
> index c0a0a4ab5d..6964756922 100644
> --- a/hw/9pfs/trace-events
> +++ b/hw/9pfs/trace-events
> @@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
>  v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
>  v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
>  v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
> -v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
> +v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
>  v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
>  v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
> @@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
>  v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
>  v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
>  v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
> -v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
> -v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
>  v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
> @@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
>  v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
>  v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
>  v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
> -v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid,  char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
> -v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
> +v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
>  v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
>  v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
>  v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
>  v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
> -v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
> +v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
>  v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
>  v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
>  v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
>  v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
>  v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
> -v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
> +v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
>  v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
>  v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
>  v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH v2 3/4] 9p: persistency of QID path beyond reboots / suspensions
  2019-05-03 16:23   ` Christian Schoenebeck via Qemu-devel
  (?)
@ 2019-05-07  9:41   ` Greg Kurz
  -1 siblings, 0 replies; 11+ messages in thread
From: Greg Kurz @ 2019-05-07  9:41 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel, Antonios Motakis

On Fri, 03 May 2019 18:23:03 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> This patch aims to keep QID path identical beyond the scope of reboots and
> guest suspensions. With the 1st patch alone the QID path of the same files
> might change after reboots / suspensions, since 9p would restart with
> empty qpp_table and the resulting QID path depends on the precise sequence
> of files being accessed on guest.
> 
> The first patch should already avoid the vast majority of potential QID
> path collisions. However especially network services running on guest
> would still be prone to QID path issues when just using the 1st patch.
> For instance Samba is exporting file IDs to clients in the network and
> SMB cliens in the network will use those IDs to access and request
> changes on the file server. If guest is now interrupted in between, like
> it commonly happens on maintenance, e.g. software updates on host, then
> SMB clients in the network will continue working with old file IDs, which
> in turn leads to data corruption and data loss on the file server.
> Furthermore on SMB client side I also encountered severe misbehaviours in
> this case, for instance Macs accessing the file server would either
> start to hang or die with a kernel panic within seconds, since the smbx
> implementation on macOS heavily relies on file IDs being unique (within
> the context of a connection that is).
> 
> So this patch here mitigates the remaining problem described above by
> storing the qpp_table persistently as extended attribute(s) on the
> exported root of the file system and automatically tries to restore the
> qpp_table i.e. after reboots / resumptions.
> 
> This patch is aimed at real world scenarios, in which qpp_table will only
> ever get few dozens of entries (and none ever in qpf_table). So it is e.g.
> intentionally limited to only store qpp_table, not qpf_table; and so far
> I have not made optimizations, since in practice the qpf_table is really
> just tiny.
> 
> Since there is currently no callback in qemu yet that would reliably be
> called on guest shutdowns, the table is stored on every new insertion for
> now.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/9pfs/9p.h |  33 +++++++
>  2 files changed, 343 insertions(+), 5 deletions(-)
> 

Yikes... both the changelog and the diffstat have an impressive size. Since
I'm likely the only QEMU developer who will review this, please expect some
delay before I get down to it... :-\ Of course, if you can split this into
smaller patches, it will probably help :)

Same remark for patch 4.

> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 2b893e25a1..29c6dfc68a 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -26,6 +26,19 @@
>  #include "migration/blocker.h"
>  #include "sysemu/qtest.h"
>  #include "qemu/xxhash.h"
> +#include "qemu/crc32c.h"
> +#if defined(__linux__) /* TODO: This should probably go into osdep.h instead */
> +# include <linux/limits.h> /* for XATTR_SIZE_MAX */
> +#endif
> +
> +/*
> + * How many bytes may we store to fs per extended attribute value?
> + */
> +#ifdef XATTR_SIZE_MAX
> +# define ATTR_MAX_SIZE XATTR_SIZE_MAX /* Linux only: 64kB limit in kernel */
> +#else
> +# define ATTR_MAX_SIZE 65536 /* Most systems allow a bit more, so we take this as basis.  */
> +#endif
>  
>  int open_fd_hw;
>  int total_open_fd;
> @@ -642,6 +655,285 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>      return 0;
>  }
>  
> +static inline bool is_ro_export(FsContext *ctx)
> +{
> +    return ctx->export_flags & V9FS_RDONLY;
> +}
> +
> +/*
> + * Once qpp_table size exceeds this value, we no longer save
> + * the table persistently. See comment in v9fs_store_qpp_table()
> + */
> +#define QPP_TABLE_PERSISTENCY_LIMIT 32768
> +
> +/* Remove all user.virtfs.system.qidp.* xattrs from export root. */
> +static void remove_qidp_xattr(FsContext *ctx)
> +{
> +    V9fsString name;
> +    int i;
> +
> +    /* just for a paranoid endless recursion sanity check */
> +    const ssize_t max_size =
> +        sizeof(QppSrlzHeader) +
> +        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
> +
> +    v9fs_string_init(&name);
> +    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
> +        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
> +        if (lremovexattr(ctx->fs_root, name.data) < 0)
> +            break;
> +    }
> +    v9fs_string_free(&name);
> +}
> +
> +/* Used to convert qpp hash table into continuous stream. */
> +static void qpp_table_serialize(void *p, uint32_t h, void *up)
> +{
> +    const QppEntry *entry = (const QppEntry*) p;
> +    QppSerialize *ser = (QppSerialize*) up;
> +
> +    if (ser->error)
> +        return;
> +
> +    /* safety first */
> +    if (entry->qp_prefix - 1 >= ser->count) {
> +        ser->error = -1;
> +        return;
> +    }
> +
> +    ser->elements[entry->qp_prefix - 1] = (QppEntryS) {
> +        .dev = entry->dev,
> +        .ino_prefix = entry->ino_prefix
> +    };
> +    ser->done++;
> +}
> +
> +/*
> + * Tries to store the current qpp_table as extended attribute(s) on the
> + * exported file system root with the goal to preserve identical qids
> + * beyond the scope of reboots.
> + */
> +static void v9fs_store_qpp_table(V9fsState *s)
> +{
> +    FsContext *ctx = &s->ctx;
> +    V9fsString name;
> +    int i, res;
> +    size_t size;
> +    QppSrlzStream* stream;
> +    QppSerialize ser;
> +
> +    if (is_ro_export(ctx))
> +        return;
> +
> +    /*
> +     * Whenever we exceeded some certain (arbitrary) high qpp_table size we
> +     * delete the stored table from the file system to get rid of old device
> +     * ids / inodes that might no longer exist with the goal to potentially
> +     * yield in a smaller table size after next reboot.
> +     */
> +    if (!s->qp_prefix_next || s->qp_prefix_next >= QPP_TABLE_PERSISTENCY_LIMIT) {
> +        if (s->qp_prefix_next == QPP_TABLE_PERSISTENCY_LIMIT) {
> +            remove_qidp_xattr(ctx);
> +        }
> +        return;
> +    }
> +
> +    /* Convert qpp hash table into continuous array. */
> +    size = sizeof(QppSrlzHeader) +
> +           ( (s->qp_prefix_next - 1) /* qpp_table entry count */ * sizeof(QppEntryS) );
> +    stream = g_malloc0(size);
> +    ser = (QppSerialize) {
> +        .elements = &stream->elements[0],
> +        .count = s->qp_prefix_next - 1,
> +        .done  = 0,
> +        .error = 0,
> +    };
> +    qht_iter(&s->qpp_table, qpp_table_serialize, &ser);
> +    if (ser.error || ser.done != ser.count)
> +        goto out;
> +
> +    /* initialize header and calculate CRC32 checksum */
> +    stream->header = (QppSrlzHeader) {
> +        .version = 1,
> +        .reserved = 0,
> +        .crc32 = crc32c(
> +            0xffffffff,
> +            (const uint8_t*) &stream->elements[0],
> +            (ser.count * sizeof(QppEntryS))
> +        ),
> +    };
> +
> +    /*
> +     * Actually just required if the qpp_table size decreased, or if the
> +     * previous xattr size limit increased on OS (kernel/fs) level.
> +     */
> +    remove_qidp_xattr(ctx);
> +
> +    /*
> +     * Subdivide (if required) the data stream into individual xattrs
> +     * to cope with the system's max. supported xattr value size.
> +     */
> +    v9fs_string_init(&name);
> +    for (i = 0; size > (i * ATTR_MAX_SIZE); ++i) {
> +        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
> +        res = lsetxattr(
> +            ctx->fs_root,
> +            name.data,
> +            ((const uint8_t*)stream) + i * ATTR_MAX_SIZE,
> +            MIN(ATTR_MAX_SIZE, size - i * ATTR_MAX_SIZE),
> +            0/*flags*/
> +        );
> +        if (res < 0) {
> +            if (i > 0)
> +                remove_qidp_xattr(ctx);
> +            break;
> +        }
> +    }
> +    v9fs_string_free(&name);
> +out:
> +    g_free(stream);
> +}
> +
> +/* Frees the entire chain of passed nodes from memory. */
> +static void destroy_xattr_nodes(XAttrNode **first)
> +{
> +    XAttrNode *prev;
> +    if (!first)
> +        return;
> +    while (*first) {
> +        if ((*first)->value)
> +            g_free((*first)->value);
> +        prev = *first;
> +        *first = (*first)->next;
> +        g_free(prev);
> +    }
> +}
> +
> +/*
> + * Loads all user.virtfs.system.qidp.* xattrs from exported fs root and
> + * returns a linked list with one node per xattr.
> + */
> +static XAttrNode* v9fs_load_qidp_xattr_nodes(V9fsState *s)
> +{
> +    FsContext *ctx = &s->ctx;
> +    XAttrNode *first = NULL, *current = NULL;
> +    V9fsString name;
> +    ssize_t size;
> +    int i;
> +
> +    const ssize_t max_size =
> +        sizeof(QppSrlzHeader) +
> +        QPP_TABLE_PERSISTENCY_LIMIT * sizeof(QppEntryS);
> +
> +    v9fs_string_init(&name);
> +
> +    for (i = 0; i * ATTR_MAX_SIZE < max_size; ++i) {
> +        v9fs_string_sprintf(&name, "user.virtfs.system.qidp.%d", i);
> +        size = lgetxattr(ctx->fs_root, name.data, NULL, 0);
> +        if (size <= 0)
> +            break;
> +        if (!first) {
> +            first = current = g_malloc0(sizeof(XAttrNode));
> +        } else {
> +            current = current->next = g_malloc0(sizeof(XAttrNode));
> +        }
> +        current->value = g_malloc0(size);
> +        current->length = lgetxattr(
> +            ctx->fs_root, name.data, current->value, size
> +        );
> +        if (current->length <= 0) {
> +            goto out_w_err;
> +        }
> +    }
> +    goto out;
> +
> +out_w_err:
> +    destroy_xattr_nodes(&first);
> +out:
> +    v9fs_string_free(&name);
> +    return first;
> +}
> +
> +/*
> + * Try to load previously stored qpp_table from file system. Calling this
> + * function assumes that qpp_table is yet empty.
> + *
> + * @see v9fs_store_qpp_table()
> + */
> +static void v9fs_load_qpp_table(V9fsState *s)
> +{
> +    ssize_t size, count;
> +    XAttrNode *current, *first;
> +    QppSrlzStream* stream = NULL;
> +    uint32_t crc32;
> +    int i;
> +    QppEntry *val;
> +    uint32_t hash;
> +
> +    if (s->qp_prefix_next != 1)
> +        return;
> +
> +    first = v9fs_load_qidp_xattr_nodes(s);
> +    if (!first)
> +        return;
> +
> +    /* convert nodes into continuous stream */
> +    size = 0;
> +    for (current = first; current; current = current->next) {
> +        size += current->length;
> +    }
> +    if (size <= 0) {
> +        goto out;
> +    }
> +    stream = g_malloc0(size);
> +    size = 0;
> +    for (current = first; current; current = current->next) {
> +        memcpy(((uint8_t*)stream) + size, current->value, current->length);
> +        size += current->length;
> +    }
> +
> +    if (stream->header.version != 1) {
> +        goto out;
> +    }
> +
> +    count = (size - sizeof(QppSrlzHeader)) / sizeof(QppEntryS);
> +    if (count <= 0) {
> +        goto out;
> +    }
> +
> +    /* verify CRC32 checksum of stream */
> +    crc32 = crc32c(
> +        0xffffffff,
> +        (const uint8_t*) &stream->elements[0],
> +        (count * sizeof(QppEntryS))
> +    );
> +    if (crc32 != stream->header.crc32) {
> +        goto out;
> +    }
> +
> +    /* fill qpp_table with the retrieved elements */
> +    for (i = 0; i < count; ++i) {
> +        val = g_malloc0(sizeof(QppEntry));
> +        *val = (QppEntry) {
> +            .dev = stream->elements[i].dev,
> +            .ino_prefix = stream->elements[i].ino_prefix,
> +        };
> +        hash = qpp_hash(*val);
> +        if (qht_lookup(&s->qpp_table, val, hash)) {
> +            /* should never happen: duplicate entry detected */
> +            g_free(val);
> +            goto out;
> +        }
> +        val->qp_prefix = s->qp_prefix_next++;
> +        qht_insert(&s->qpp_table, val, hash, NULL);
> +    }
> +
> +out:
> +    destroy_xattr_nodes(&first);
> +    if (stream)
> +        g_free(stream);
> +}
> +
>  /* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
>   * to a unique QID path (64 bits). To avoid having to map and keep track
>   * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> @@ -675,6 +967,14 @@ static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
>          /* new unique inode prefix and device combo */
>          val->qp_prefix = pdu->s->qp_prefix_next++;
>          qht_insert(&pdu->s->qpp_table, val, hash, NULL);
> +
> +        /*
> +         * Store qpp_table as extended attribute(s) to file system.
> +         *
> +         * TODO: This should better only be called from a guest shutdown and
> +         * suspend handler.
> +         */
> +        v9fs_store_qpp_table(pdu->s);
>      }
>  
>      *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> @@ -1064,11 +1364,6 @@ static void v9fs_fix_path(V9fsPath *dst, V9fsPath *src, int len)
>      v9fs_path_free(&str);
>  }
>  
> -static inline bool is_ro_export(FsContext *ctx)
> -{
> -    return ctx->export_flags & V9FS_RDONLY;
> -}
> -
>  static void coroutine_fn v9fs_version(void *opaque)
>  {
>      ssize_t err;
> @@ -3784,6 +4079,8 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>      qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
>      s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
>      s->qp_fullpath_next = 1;
> +    /* try to load and restore previous qpp_table */
> +    v9fs_load_qpp_table(s);
>  
>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
> @@ -3807,6 +4104,14 @@ out:
>  
>  void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>  {
> +    /*
> +     * Store qpp_table as extended attribute(s) to file system.
> +     *
> +     * This was actually plan A, but unfortunately unserialize is not called
> +     * reliably on guest shutdowns and suspensions.
> +     */
> +    v9fs_store_qpp_table(s);
> +
>      if (s->ops->cleanup) {
>          s->ops->cleanup(&s->ctx);
>      }
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 44112ea97f..54ce039969 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -245,6 +245,13 @@ typedef struct {
>      uint16_t qp_prefix;
>  } QppEntry;
>  
> +/* Small version of QppEntry for serialization as xattr. */
> +struct QppEntryS {
> +    dev_t dev;
> +    uint16_t ino_prefix;
> +} __attribute__((packed));
> +typedef struct QppEntryS QppEntryS;
> +
>  /* QID path full entry, as above */
>  typedef struct {
>      dev_t dev;
> @@ -252,6 +259,32 @@ typedef struct {
>      uint64_t path;
>  } QpfEntry;
>  
> +typedef struct {
> +    QppEntryS *elements;
> +    uint count; /* In: QppEntryS count in @a elements */
> +    uint done; /* Out: how many QppEntryS did we actually fill in @a elements */
> +    int error; /* Out: zero on success */
> +} QppSerialize;
> +
> +struct QppSrlzHeader {
> +    uint16_t version;
> +    uint16_t reserved; /* might be used e.g. for flags in future */
> +    uint32_t crc32;
> +} __attribute__((packed));
> +typedef struct QppSrlzHeader QppSrlzHeader;
> +
> +struct QppSrlzStream {
> +    QppSrlzHeader header;
> +    QppEntryS elements[0];
> +} __attribute__((packed));
> +typedef struct QppSrlzStream QppSrlzStream;
> +
> +typedef struct XAttrNode {
> +    uint8_t* value;
> +    ssize_t length;
> +    struct XAttrNode* next;
> +} XAttrNode;
> +
>  struct V9fsState
>  {
>      QLIST_HEAD(, V9fsPDU) free_list;



^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-05-07  9:42 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1556896526.git.qemu_oss@crudebyte.com>
2019-05-03 16:21 ` [Qemu-devel] [PATCH v2 1/4] 9p: mitigates most QID path collisions Christian Schoenebeck
2019-05-03 16:21   ` Christian Schoenebeck via Qemu-devel
2019-05-07  9:29   ` Greg Kurz
2019-05-03 16:22 ` [Qemu-devel] [PATCH v2 2/4] 9P: trivial cleanup of QID path collision mitigation Christian Schoenebeck
2019-05-03 16:22   ` Christian Schoenebeck via Qemu-devel
2019-05-07  9:36   ` Greg Kurz
2019-05-03 16:23 ` [Qemu-devel] [PATCH v2 3/4] 9p: persistency of QID path beyond reboots / suspensions Christian Schoenebeck
2019-05-03 16:23   ` Christian Schoenebeck via Qemu-devel
2019-05-07  9:41   ` Greg Kurz
2019-05-03 16:24 ` [Qemu-devel] [PATCH v2 4/4] 9p: use variable length suffixes for inode mapping Christian Schoenebeck
2019-05-03 16:24   ` Christian Schoenebeck via Qemu-devel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.