qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path
  2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
@ 2019-06-26 18:25 ` Christian Schoenebeck via Qemu-devel
  2019-06-27 16:12   ` Greg Kurz
  2019-06-26 18:30 ` [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error Christian Schoenebeck via Qemu-devel
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:25 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

There is no need for signedness on these QID fields for 9p.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/9p-marshal.h   |  6 +++---
 hw/9pfs/9p.c         |  6 +++---
 hw/9pfs/trace-events | 14 +++++++-------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
index c8823d878f..8f3babb60a 100644
--- a/fsdev/9p-marshal.h
+++ b/fsdev/9p-marshal.h
@@ -9,9 +9,9 @@ typedef struct V9fsString
 
 typedef struct V9fsQID
 {
-    int8_t type;
-    int32_t version;
-    int64_t path;
+    uint8_t type;
+    uint32_t version;
+    uint64_t path;
 } V9fsQID;
 
 typedef struct V9fsStat
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 55821343e5..586a6dccba 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -743,9 +743,9 @@ static int donttouch_stat(V9fsStat *stat)
 {
     if (stat->type == -1 &&
         stat->dev == -1 &&
-        stat->qid.type == -1 &&
-        stat->qid.version == -1 &&
-        stat->qid.path == -1 &&
+        stat->qid.type == 0xff &&
+        stat->qid.version == (uint32_t) -1 &&
+        stat->qid.path == (uint64_t) -1 &&
         stat->mode == -1 &&
         stat->atime == -1 &&
         stat->mtime == -1 &&
diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
index c0a0a4ab5d..6964756922 100644
--- a/hw/9pfs/trace-events
+++ b/hw/9pfs/trace-events
@@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
 v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
 v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
-v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
+v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
 v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
 v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
@@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
 v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
 v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
 v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
-v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
-v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
 v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
@@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
 v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
 v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
 v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
-v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
+v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
 v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid,  char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
-v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
 v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
 v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
 v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
-v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
+v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
 v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
 v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
 v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
 v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
-v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
+v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
 v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
 v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
 v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"
-- 
2.11.0



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error
  2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
  2019-06-26 18:25 ` [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path Christian Schoenebeck via Qemu-devel
@ 2019-06-26 18:30 ` Christian Schoenebeck via Qemu-devel
  2019-06-27 17:26   ` Greg Kurz
  2019-06-26 18:42 ` [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes" Christian Schoenebeck via Qemu-devel
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

The QID path should uniquely identify a file. However, the
inode of a file is currently used as the QID path, which
on its own only uniquely identifies wiles within a device.
Here we track the device hosting the 9pfs share, in order
to prevent security issues with QID path collisions from
other devices.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 hw/9pfs/9p.h |  1 +
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 586a6dccba..cbaa212625 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -572,10 +572,20 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_SOCKET)
 
 /* This is the algorithm from ufs in spfs */
-static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
+static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
 {
     size_t size;
 
+    if (pdu->s->dev_id == 0) {
+        pdu->s->dev_id = stbuf->st_dev;
+    } else if (pdu->s->dev_id != stbuf->st_dev) {
+        error_report_once(
+            "9p: Multiple devices detected in same VirtFS export. "
+            "You must use a separate export for each device."
+        );
+        return -ENOSYS;
+    }
+
     memset(&qidp->path, 0, sizeof(qidp->path));
     size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
     memcpy(&qidp->path, &stbuf->st_ino, size);
@@ -587,6 +597,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
     if (S_ISLNK(stbuf->st_mode)) {
         qidp->type |= P9_QID_TYPE_SYMLINK;
     }
+
+    return 0;
 }
 
 static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
@@ -599,7 +611,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
     if (err < 0) {
         return err;
     }
-    stat_to_qid(&stbuf, qidp);
+    err = stat_to_qid(pdu, &stbuf, qidp);
+    if (err < 0) {
+        return err;
+    }
     return 0;
 }
 
@@ -830,7 +845,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 
     memset(v9stat, 0, sizeof(*v9stat));
 
-    stat_to_qid(stbuf, &v9stat->qid);
+    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
+    if (err < 0) {
+        return err;
+    }
     v9stat->mode = stat_to_v9mode(stbuf);
     v9stat->atime = stbuf->st_atime;
     v9stat->mtime = stbuf->st_mtime;
@@ -891,7 +909,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
 #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
 
 
-static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
+static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
                                 V9fsStatDotl *v9lstat)
 {
     memset(v9lstat, 0, sizeof(*v9lstat));
@@ -913,7 +931,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
     /* Currently we only support BASIC fields in stat */
     v9lstat->st_result_mask = P9_STATS_BASIC;
 
-    stat_to_qid(stbuf, &v9lstat->qid);
+    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
 }
 
 static void print_sg(struct iovec *sg, int cnt)
@@ -1115,7 +1133,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     uint64_t request_mask;
     V9fsStatDotl v9stat_dotl;
     V9fsPDU *pdu = opaque;
-    V9fsState *s = pdu->s;
 
     retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
     if (retval < 0) {
@@ -1136,7 +1153,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
     if (retval < 0) {
         goto out;
     }
-    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
+    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
+    if (retval < 0) {
+        goto out;
+    }
 
     /*  fill st_gen if requested and supported by underlying fs */
     if (request_mask & P9_STATS_GEN) {
@@ -1381,7 +1401,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
             if (err < 0) {
                 goto out;
             }
-            stat_to_qid(&stbuf, &qid);
+            err = stat_to_qid(pdu, &stbuf, &qid);
+            if (err < 0) {
+                goto out;
+            }
             v9fs_path_copy(&dpath, &path);
         }
         memcpy(&qids[name_idx], &qid, sizeof(qid));
@@ -1483,7 +1506,10 @@ static void coroutine_fn v9fs_open(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     if (S_ISDIR(stbuf.st_mode)) {
         err = v9fs_co_opendir(pdu, fidp);
         if (err < 0) {
@@ -1593,7 +1619,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
         fidp->flags |= FID_NON_RECLAIMABLE;
     }
     iounit =  get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2327,7 +2356,10 @@ static void coroutine_fn v9fs_create(void *opaque)
         }
     }
     iounit = get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
     if (err < 0) {
         goto out;
@@ -2384,7 +2416,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err =  pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3064,7 +3099,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3222,7 +3260,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
     if (err < 0) {
         goto out;
     }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
     err = pdu_marshal(pdu, offset, "Q", &qid);
     if (err < 0) {
         goto out;
@@ -3633,6 +3674,8 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
         goto out;
     }
 
+    s->dev_id = 0;
+
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 8883761b2c..5e316178d5 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -256,6 +256,7 @@ struct V9fsState
     Error *migration_blocker;
     V9fsConf fsconf;
     V9fsQID root_qid;
+    dev_t dev_id;
 };
 
 /* 9p2000.L open flags */
-- 
2.11.0



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
  2019-06-26 18:25 ` [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path Christian Schoenebeck via Qemu-devel
  2019-06-26 18:30 ` [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error Christian Schoenebeck via Qemu-devel
@ 2019-06-26 18:42 ` Christian Schoenebeck via Qemu-devel
  2019-06-28 10:09   ` Greg Kurz
  2019-06-26 18:46 ` [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path Christian Schoenebeck via Qemu-devel
  2019-06-26 18:52 ` [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping Christian Schoenebeck via Qemu-devel
  4 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

To support multiple devices on the 9p share, and avoid
qid path collisions we take the device id as input
to generate a unique QID path. The lowest 48 bits of
the path will be set equal to the file inode, and the
top bits will be uniquely assigned based on the top
16 bits of the inode and the device id.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 fsdev/file-op-9p.h      |   1 +
 fsdev/qemu-fsdev-opts.c |   7 +++-
 fsdev/qemu-fsdev.c      |   6 +++
 hw/9pfs/9p.c            | 105 ++++++++++++++++++++++++++++++++++++++++++------
 hw/9pfs/9p.h            |  12 ++++++
 qemu-options.hx         |  17 +++++++-
 vl.c                    |   3 ++
 7 files changed, 135 insertions(+), 16 deletions(-)

diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
index c757c8099f..6c1663c252 100644
--- a/fsdev/file-op-9p.h
+++ b/fsdev/file-op-9p.h
@@ -59,6 +59,7 @@ typedef struct ExtendedOps {
 #define V9FS_RDONLY                 0x00000040
 #define V9FS_PROXY_SOCK_FD          0x00000080
 #define V9FS_PROXY_SOCK_NAME        0x00000100
+#define V9FS_REMAP_INODES           0x00000200
 
 #define V9FS_SEC_MASK               0x0000003C
 
diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
index 7c31ffffaf..64a8052266 100644
--- a/fsdev/qemu-fsdev-opts.c
+++ b/fsdev/qemu-fsdev-opts.c
@@ -31,7 +31,9 @@ static QemuOptsList qemu_fsdev_opts = {
         }, {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
-
+        }, {
+            .name = "remap_inodes",
+            .type = QEMU_OPT_BOOL,
         }, {
             .name = "socket",
             .type = QEMU_OPT_STRING,
@@ -76,6 +78,9 @@ static QemuOptsList qemu_virtfs_opts = {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
         }, {
+            .name = "remap_inodes",
+            .type = QEMU_OPT_BOOL,
+        }, {
             .name = "socket",
             .type = QEMU_OPT_STRING,
         }, {
diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c
index 077a8c4e2b..b6fa9799be 100644
--- a/fsdev/qemu-fsdev.c
+++ b/fsdev/qemu-fsdev.c
@@ -121,6 +121,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
     const char *fsdev_id = qemu_opts_id(opts);
     const char *fsdriver = qemu_opt_get(opts, "fsdriver");
     const char *writeout = qemu_opt_get(opts, "writeout");
+    bool remap_inodes = qemu_opt_get_bool(opts, "remap_inodes", 0);
     bool ro = qemu_opt_get_bool(opts, "readonly", 0);
 
     if (!fsdev_id) {
@@ -161,6 +162,11 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
     } else {
         fsle->fse.export_flags &= ~V9FS_RDONLY;
     }
+    if (remap_inodes) {
+        fsle->fse.export_flags |= V9FS_REMAP_INODES;
+    } else {
+        fsle->fse.export_flags &= ~V9FS_REMAP_INODES;
+    }
 
     if (fsle->fse.ops->parse_opts) {
         if (fsle->fse.ops->parse_opts(opts, &fsle->fse, errp)) {
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index cbaa212625..7ccc68a829 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
+#include "qemu/xxhash.h"
 
 int open_fd_hw;
 int total_open_fd;
@@ -571,24 +572,96 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
-/* This is the algorithm from ufs in spfs */
+
+/* creative abuse of tb_hash_func7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
+{
+    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static bool qpp_lookup_func(const void *obj, const void *userp)
+{
+    const QppEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
+}
+
+static void qpp_table_remove(void *p, uint32_t h, void *up)
+{
+    g_free(p);
+}
+
+static void qpp_table_destroy(struct qht *ht)
+{
+    qht_iter(ht, qpp_table_remove, NULL);
+    qht_destroy(ht);
+}
+
+/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+ * to a unique QID path (64 bits). To avoid having to map and keep track
+ * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
+ * the device id to the 16 highest bits of the QID path. The 48 lowest bits
+ * of the QID path equal to the lowest bits of the inode number.
+ *
+ * This takes advantage of the fact that inode number are usually not
+ * random but allocated sequentially, so we have fewer items to keep
+ * track of.
+ */
+static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
+                                uint64_t *path)
+{
+    QppEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+    }, *val;
+    uint32_t hash = qpp_hash(lookup);
+
+    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_prefix_next == 0) {
+            /* we ran out of prefixes */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        *val = lookup;
+
+        /* new unique inode prefix and device combo */
+        val->qp_prefix = pdu->s->qp_prefix_next++;
+        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+    }
+
+    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+    return 0;
+}
+
 static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
 {
-    size_t size;
+    int err;
 
-    if (pdu->s->dev_id == 0) {
-        pdu->s->dev_id = stbuf->st_dev;
-    } else if (pdu->s->dev_id != stbuf->st_dev) {
-        error_report_once(
-            "9p: Multiple devices detected in same VirtFS export. "
-            "You must use a separate export for each device."
-        );
-        return -ENOSYS;
+    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
+        /* map inode+device to qid path (fast path) */
+        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+        if (err) {
+            return err;
+        }
+    } else {
+        if (pdu->s->dev_id == 0) {
+            pdu->s->dev_id = stbuf->st_dev;
+        } else if (pdu->s->dev_id != stbuf->st_dev) {
+            error_report_once(
+                "9p: Multiple devices detected in same VirtFS export. "
+                "You must either use a separate export for each device "
+                "shared from host or enable virtfs option 'remap_inodes'."
+            );
+            return -ENOSYS;
+        }
+        size_t size;
+        memset(&qidp->path, 0, sizeof(qidp->path));
+        size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
+        memcpy(&qidp->path, &stbuf->st_ino, size);
     }
 
-    memset(&qidp->path, 0, sizeof(qidp->path));
-    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
-    memcpy(&qidp->path, &stbuf->st_ino, size);
     qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
     qidp->type = 0;
     if (S_ISDIR(stbuf->st_mode)) {
@@ -3676,6 +3749,10 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
 
     s->dev_id = 0;
 
+    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
+    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
+    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
 
@@ -3689,6 +3766,7 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+        qpp_table_destroy(&s->qpp_table);
         v9fs_path_free(&path);
     }
     return rc;
@@ -3701,6 +3779,7 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+    qpp_table_destroy(&s->qpp_table);
     g_free(s->ctx.fs_root);
 }
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 5e316178d5..0200e04176 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -8,6 +8,7 @@
 #include "fsdev/9p-iov-marshal.h"
 #include "qemu/thread.h"
 #include "qemu/coroutine.h"
+#include "qemu/qht.h"
 
 enum {
     P9_TLERROR = 6,
@@ -235,6 +236,15 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
+#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+
+/* QID path prefix entry, see stat_to_qid */
+typedef struct {
+    dev_t dev;
+    uint16_t ino_prefix;
+    uint16_t qp_prefix;
+} QppEntry;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -257,6 +267,8 @@ struct V9fsState
     V9fsConf fsconf;
     V9fsQID root_qid;
     dev_t dev_id;
+    struct qht qpp_table;
+    uint16_t qp_prefix_next;
 };
 
 /* 9p2000.L open flags */
diff --git a/qemu-options.hx b/qemu-options.hx
index 0d8beb4afd..e7ea136da1 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1334,7 +1334,7 @@ ETEXI
 
 DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
     "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n"
-    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n"
+    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,remap_inodes]\n"
     "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n"
     "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n"
     "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n",
@@ -1342,7 +1342,7 @@ DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
 
 STEXI
 
-@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}]
+@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,remap_inodes]
 @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
 @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
 @itemx -virtfs synth,mount_tag=@var{mount_tag}
@@ -1398,6 +1398,19 @@ Specifies the default mode for newly created directories on the host. Works
 only with security models "mapped-xattr" and "mapped-file".
 @item mount_tag=@var{mount_tag}
 Specifies the tag name to be used by the guest to mount this export point.
+@item remap_inodes
+By default virtfs 9p supports only one device per export in order to avoid
+file ID collisions on guest which may otherwise happen because the original
+device IDs from host are not passed and exposed on guest. Instead all files
+of an export shared with virtfs do have the same device id on guest. So two
+files with identical inode numbers but from actually different devices on
+host would otherwise cause a file ID collision and hence potential
+misbehaviours on guest. For that reason it is recommended to create a
+separate virtfs export for each device to be shared with guests. However
+you may also enable "remap_inodes" which allows you to share multiple
+devices with only one export instead, which is achieved by remapping the
+original inode numbers from host to guest in a way that would prevent such
+collisions.
 @end table
 ETEXI
 
diff --git a/vl.c b/vl.c
index 99a56b5556..de9d7b846c 100644
--- a/vl.c
+++ b/vl.c
@@ -3457,6 +3457,9 @@ int main(int argc, char **argv, char **envp)
                 qemu_opt_set_bool(fsdev, "readonly",
                                   qemu_opt_get_bool(opts, "readonly", 0),
                                   &error_abort);
+                qemu_opt_set_bool(fsdev, "remap_inodes",
+                                  qemu_opt_get_bool(opts, "remap_inodes", 0),
+                                  &error_abort);
                 device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
                                           &error_abort);
                 qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);
-- 
2.11.0



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path
  2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
                   ` (2 preceding siblings ...)
  2019-06-26 18:42 ` [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes" Christian Schoenebeck via Qemu-devel
@ 2019-06-26 18:46 ` Christian Schoenebeck via Qemu-devel
  2019-06-28 10:21   ` Greg Kurz
  2019-06-26 18:52 ` [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping Christian Schoenebeck via Qemu-devel
  4 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:46 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

stat_to_qid attempts via qid_path_prefixmap to map unique files (which are
identified by 64 bit inode nr and 32 bit device id) to a 64 QID path value.
However this implementation makes some assumptions about inode number
generation on the host.

If qid_path_prefixmap fails, we still have 48 bits available in the QID
path to fall back to a less memory efficient full mapping.

Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h |  9 +++++++++
 2 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 7ccc68a829..e6e410972f 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -579,23 +579,69 @@ static uint32_t qpp_hash(QppEntry e)
     return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
 }
 
+static uint32_t qpf_hash(QpfEntry e)
+{
+    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
+}
+
 static bool qpp_lookup_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
     return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
 }
 
-static void qpp_table_remove(void *p, uint32_t h, void *up)
+static bool qpf_lookup_func(const void *obj, const void *userp)
+{
+    const QpfEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev && e1->ino == e2->ino;
+}
+
+static void qp_table_remove(void *p, uint32_t h, void *up)
 {
     g_free(p);
 }
 
-static void qpp_table_destroy(struct qht *ht)
+static void qp_table_destroy(struct qht *ht)
 {
-    qht_iter(ht, qpp_table_remove, NULL);
+    qht_iter(ht, qp_table_remove, NULL);
     qht_destroy(ht);
 }
 
+static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
+                            uint64_t *path)
+{
+    QpfEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino = stbuf->st_ino
+    }, *val;
+    uint32_t hash = qpf_hash(lookup);
+
+    /* most users won't need the fullmap, so init the table lazily */
+    if (!pdu->s->qpf_table.map) {
+        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
+    }
+
+    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_fullpath_next == 0) {
+            /* no more files can be mapped :'( */
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        *val = lookup;
+
+        /* new unique inode and device combo */
+        val->path = pdu->s->qp_fullpath_next++;
+        pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
+    }
+
+    *path = val->path;
+    return 0;
+}
+
 /* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
@@ -642,6 +688,10 @@ static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
     if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
         /* map inode+device to qid path (fast path) */
         err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
+        if (err == -ENFILE) {
+            /* fast path didn't work, fall back to full map */
+            err = qid_path_fullmap(pdu, stbuf, &qidp->path);
+        }
         if (err) {
             return err;
         }
@@ -3752,6 +3802,7 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
     /* QID path hash table. 1 entry ought to be enough for anybody ;) */
     qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
     s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+    s->qp_fullpath_next = 1;
 
     s->ctx.fst = &fse->fst;
     fsdev_throttle_init(s->ctx.fst);
@@ -3766,7 +3817,8 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
-        qpp_table_destroy(&s->qpp_table);
+        qp_table_destroy(&s->qpp_table);
+        qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
     }
     return rc;
@@ -3779,7 +3831,8 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
-    qpp_table_destroy(&s->qpp_table);
+    qp_table_destroy(&s->qpp_table);
+    qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
 }
 
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 0200e04176..2b74561030 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -245,6 +245,13 @@ typedef struct {
     uint16_t qp_prefix;
 } QppEntry;
 
+/* QID path full entry, as above */
+typedef struct {
+    dev_t dev;
+    ino_t ino;
+    uint64_t path;
+} QpfEntry;
+
 struct V9fsState
 {
     QLIST_HEAD(, V9fsPDU) free_list;
@@ -268,7 +275,9 @@ struct V9fsState
     V9fsQID root_qid;
     dev_t dev_id;
     struct qht qpp_table;
+    struct qht qpf_table;
     uint16_t qp_prefix_next;
+    uint64_t qp_fullpath_next;
 };
 
 /* 9p2000.L open flags */
-- 
2.11.0



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping
  2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
                   ` (3 preceding siblings ...)
  2019-06-26 18:46 ` [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path Christian Schoenebeck via Qemu-devel
@ 2019-06-26 18:52 ` Christian Schoenebeck via Qemu-devel
  2019-06-28 11:50   ` Greg Kurz
  4 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:52 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

Use variable length suffixes for inode remapping instead of the fixed 16
bit size prefixes before. With this change the inode numbers on guest will
typically be much smaller (e.g. around >2^1 .. >2^7 instead of >2^48 with
the previous fixed size inode remapping.

Additionally this solution should be more efficient, since inode numbers in
practice can take almost their entire 64 bit range on guest as well. Which
might also be beneficial for nested virtualization.

The "Exponential Golomb" algorithm is used as basis for generating the
variable length suffixes. The algorithm has a paramter k which controls the
distribution of bits on increasing indeces (minimum bits at low index vs.
maximum bits at high index). With k=0 the generated suffixes look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [1] (1 bits)
2 [10] -> [010] (3 bits)
3 [11] -> [110] (3 bits)
4 [100] -> [00100] (5 bits)
5 [101] -> [10100] (5 bits)
6 [110] -> [01100] (5 bits)
7 [111] -> [11100] (5 bits)
8 [1000] -> [0001000] (7 bits)
9 [1001] -> [1001000] (7 bits)
10 [1010] -> [0101000] (7 bits)
11 [1011] -> [1101000] (7 bits)
12 [1100] -> [0011000] (7 bits)
...
65533 [1111111111111101] ->  [1011111111111111000000000000000] (31 bits)
65534 [1111111111111110] ->  [0111111111111111000000000000000] (31 bits)
65535 [1111111111111111] ->  [1111111111111111000000000000000] (31 bits)
Hence minBits=1 maxBits=31

And with k=5 they would look like:

Index Dec/Bin -> Generated Suffix Bin
1 [1] -> [000001] (6 bits)
2 [10] -> [100001] (6 bits)
3 [11] -> [010001] (6 bits)
4 [100] -> [110001] (6 bits)
5 [101] -> [001001] (6 bits)
6 [110] -> [101001] (6 bits)
7 [111] -> [011001] (6 bits)
8 [1000] -> [111001] (6 bits)
9 [1001] -> [000101] (6 bits)
10 [1010] -> [100101] (6 bits)
11 [1011] -> [010101] (6 bits)
12 [1100] -> [110101] (6 bits)
...
65533 [1111111111111101] -> [0011100000000000100000000000] (28 bits)
65534 [1111111111111110] -> [1011100000000000100000000000] (28 bits)
65535 [1111111111111111] -> [0111100000000000100000000000] (28 bits)
Hence minBits=6 maxBits=28

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 hw/9pfs/9p.h |  67 ++++++++++++++-
 2 files changed, 312 insertions(+), 22 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index e6e410972f..46c9f11384 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -26,6 +26,7 @@
 #include "migration/blocker.h"
 #include "sysemu/qtest.h"
 #include "qemu/xxhash.h"
+#include <math.h>
 
 int open_fd_hw;
 int total_open_fd;
@@ -572,6 +573,123 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                 P9_STAT_MODE_NAMED_PIPE |   \
                                 P9_STAT_MODE_SOCKET)
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
+static inline uint8_t mirror8bit(uint8_t byte) {
+    return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
+
+/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
+static inline uint64_t mirror64bit(uint64_t value) {
+    return ((uint64_t)mirror8bit( value        & 0xff) << 56) |
+           ((uint64_t)mirror8bit((value >> 8)  & 0xff) << 48) |
+           ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
+           ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
+           ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
+           ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
+           ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8 ) |
+           ((uint64_t)mirror8bit((value >> 56) & 0xff)      ) ;
+}
+
+/* Parameter k for the Exponential Golomb algorihm to be used.
+ *
+ * The smaller this value, the smaller the minimum bit count for the Exp.
+ * Golomb generated affixes will be (at lowest index) however for the
+ * price of having higher maximum bit count of generated affixes (at highest
+ * index). Likewise increasing this parameter yields in smaller maximum bit
+ * count for the price of having higher minimum bit count.
+ */
+#define EXP_GOLOMB_K    0
+
+# if !EXP_GOLOMB_K
+
+/** @brief Exponential Golomb algorithm limited to the case k=0.
+ *
+ * See expGolombEncode() below for details.
+ *
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix expGolombEncodeK0(uint64_t n) {
+    const int bits = (int) log2(n) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = n,
+        .bits = bits + bits - 1
+    };
+}
+
+# else
+
+/** @brief Exponential Golomb algorithm for arbitrary k (including k=0).
+ *
+ * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
+ * with growing length and with the mathematical property of being
+ * "prefix-free". The latter means the generated prefixes can be prepended
+ * in front of arbitrary numbers and the resulting concatenated numbers are
+ * guaranteed to be always unique.
+ *
+ * This is a minor adjustment to the original Exp. Golomb algorithm in the
+ * sense that lowest allowed index (@param n) starts with 1, not with zero.
+ *
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ * @param k - parameter k of Exp. Golomb algorithm to be used
+ *            (see comment on EXP_GOLOMB_K macro for details about k)
+ */
+static VariLenAffix expGolombEncode(uint64_t n, int k) {
+    const uint64_t value = n + (1 << k) - 1;
+    const int bits = (int) log2(value) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = value,
+        .bits = bits + MAX((bits - 1 - k), 0)
+    };
+}
+
+# endif /* !EXP_GOLOMB_K */
+
+/** @brief Converts a suffix into a prefix, or a prefix into a suffix.
+ *
+ * What this function does is actually mirroring all bits of the affix value,
+ * with the purpose to preserve respectively the mathematical "prefix-free"
+ * or "suffix-free" property after the conversion.
+ *
+ * In other words: if a passed prefix is suitable to create unique numbers,
+ * then the returned suffix is suitable to create unique numbers as well
+ * (and vice versa).
+ */
+static VariLenAffix invertAffix(const VariLenAffix* affix) {
+    return (VariLenAffix) {
+        .type = (affix->type == AffixType_Suffix) ? AffixType_Prefix : AffixType_Suffix,
+        .value =  mirror64bit(affix->value) >> ((sizeof(affix->value) * 8) - affix->bits),
+        .bits = affix->bits
+    };
+}
+
+/** @brief Generates suffix numbers with "suffix-free" property.
+ *
+ * This is just a wrapper function on top of the Exp. Golomb algorithm.
+ *
+ * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
+ * this function converts the Exp. Golomb prefixes into appropriate suffixes
+ * which are still suitable for generating unique numbers.
+ *
+ * @param n - natural number (or index) of the suffix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix affixForIndex(uint64_t index) {
+    VariLenAffix prefix;
+    #if EXP_GOLOMB_K
+    prefix = expGolombEncode(index, EXP_GOLOMB_K);
+    #else
+    prefix = expGolombEncodeK0(index);
+    #endif
+    return invertAffix(&prefix); /* convert prefix to suffix */
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* creative abuse of tb_hash_func7, which is based on xxhash */
 static uint32_t qpp_hash(QppEntry e)
@@ -584,13 +702,23 @@ static uint32_t qpf_hash(QpfEntry e)
     return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
 }
 
-static bool qpp_lookup_func(const void *obj, const void *userp)
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+static bool qpd_cmp_func(const void *obj, const void *userp)
+{
+    const QpdEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+static bool qpp_cmp_func(const void *obj, const void *userp)
 {
     const QppEntry *e1 = obj, *e2 = userp;
     return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
 }
 
-static bool qpf_lookup_func(const void *obj, const void *userp)
+static bool qpf_cmp_func(const void *obj, const void *userp)
 {
     const QpfEntry *e1 = obj, *e2 = userp;
     return e1->dev == e2->dev && e1->ino == e2->ino;
@@ -607,6 +735,58 @@ static void qp_table_destroy(struct qht *ht)
     qht_destroy(ht);
 }
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/*
+ * Returns how many (high end) bits of inode numbers of the passed fs
+ * device shall be used (in combination with the device number) to
+ * generate hash values for qpp_table entries.
+ *
+ * This function is required if variable length suffixes are used for inode
+ * number mapping on guest level. Since a device may end up having multiple
+ * entries in qpp_table, each entry most probably with a different suffix
+ * length, we thus need this function in conjunction with qpd_table to
+ * "agree" about a fix amount of bits (per device) to be always used for
+ * generating hash values for the purpose of accessing qpp_table in order
+ * get consistent behaviour when accessing qpp_table.
+ */
+static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
+{
+    QpdEntry lookup = {
+        .dev = dev
+    }, *val;
+    uint32_t hash = dev;
+    VariLenAffix affix;
+
+    val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
+    if (!val) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = lookup;
+        affix = affixForIndex(pdu->s->qp_affix_next);
+        val->prefix_bits = affix.bits;
+        qht_insert(&pdu->s->qpd_table, val, hash, NULL);
+        pdu->s->qp_ndevices++;
+    }
+    return val->prefix_bits;
+}
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+/** @brief Slow / full mapping host inode nr -> guest inode nr.
+ *
+ * This function performs a slower and much more costly remapping of an
+ * original file inode number on host to an appropriate different inode
+ * number on guest. For every (dev, inode) combination on host a new
+ * sequential number is generated, cached and exposed as inode number on
+ * guest.
+ *
+ * This is just a "last resort" fallback solution if the much faster/cheaper
+ * qid_path_prefixmap() failed. In practice this slow / full mapping is not
+ * expected ever to be used at all though.
+ *
+ * @see qid_path_prefixmap() for details
+ *
+ */
 static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
                             uint64_t *path)
 {
@@ -615,11 +795,9 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         .ino = stbuf->st_ino
     }, *val;
     uint32_t hash = qpf_hash(lookup);
-
-    /* most users won't need the fullmap, so init the table lazily */
-    if (!pdu->s->qpf_table.map) {
-        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
-    }
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    VariLenAffix affix;
+#endif
 
     val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
 
@@ -633,8 +811,16 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
         *val = lookup;
 
         /* new unique inode and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        affix = affixForIndex(
+            1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
+        );
+        val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
+        pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
+#else
         val->path = pdu->s->qp_fullpath_next++;
         pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
+#endif
         qht_insert(&pdu->s->qpf_table, val, hash, NULL);
     }
 
@@ -642,42 +828,71 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
     return 0;
 }
 
-/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
+/** @brief Quick mapping host inode nr -> guest inode nr.
+ *
+ * This function performs quick remapping of an original file inode number
+ * on host to an appropriate different inode number on guest. This remapping
+ * of inodes is required to avoid inode nr collisions on guest which would
+ * happen if the 9p export contains more than 1 exported file system (or
+ * more than 1 file system data set), because unlike on host level where the
+ * files would have different device nrs, all files exported by 9p would
+ * share the same device nr on guest (the device nr of the virtual 9p device
+ * that is).
+ *
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 0 :
+ * stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
  * to a unique QID path (64 bits). To avoid having to map and keep track
  * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
  * the device id to the 16 highest bits of the QID path. The 48 lowest bits
  * of the QID path equal to the lowest bits of the inode number.
  *
- * This takes advantage of the fact that inode number are usually not
- * random but allocated sequentially, so we have fewer items to keep
- * track of.
+ * if P9_VARI_LENGTH_INODE_SUFFIXES == 1 :
+ * Instead of fixed size (16 bit) generated prefix, we use variable size
+ * suffixes instead. See comment on P9_VARI_LENGTH_INODE_SUFFIXES for
+ * details.
  */
 static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
                                 uint64_t *path)
 {
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
+#endif
     QppEntry lookup = {
         .dev = stbuf->st_dev,
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> (64-ino_hash_bits))
+#else
         .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
+#endif
     }, *val;
     uint32_t hash = qpp_hash(lookup);
 
     val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
 
     if (!val) {
-        if (pdu->s->qp_prefix_next == 0) {
-            /* we ran out of prefixes */
+        if (pdu->s->qp_affix_next == 0) {
+            /* we ran out of affixes */
             return -ENFILE;
         }
 
         val = g_malloc0(sizeof(QppEntry));
         *val = lookup;
 
-        /* new unique inode prefix and device combo */
-        val->qp_prefix = pdu->s->qp_prefix_next++;
+        /* new unique inode affix and device combo */
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        val->qp_affix_index = pdu->s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+#else
+        val->qp_affix = pdu->s->qp_affix_next++;
+#endif
         qht_insert(&pdu->s->qpp_table, val, hash, NULL);
     }
-
-    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    /* assuming generated affix to be suffix type, not prefix */
+    *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
+#else
+    *path = ((uint64_t)val->qp_affix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
+#endif
     return 0;
 }
 
@@ -3799,9 +4014,17 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
 
     s->dev_id = 0;
 
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qht_init(&s->qpd_table, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+#endif
+    /* most users won't need the fullmap, so init the table lazily */
+    qht_init(&s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
     /* QID path hash table. 1 entry ought to be enough for anybody ;) */
-    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
-    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
+    qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    s->qp_ndevices = 0;
+#endif
+    s->qp_affix_next = 1; /* reserve 0 to detect overflow */
     s->qp_fullpath_next = 1;
 
     s->ctx.fst = &fse->fst;
@@ -3817,6 +4040,9 @@ out:
         }
         g_free(s->tag);
         g_free(s->ctx.fs_root);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+        qp_table_destroy(&s->qpd_table);
+#endif
         qp_table_destroy(&s->qpp_table);
         qp_table_destroy(&s->qpf_table);
         v9fs_path_free(&path);
@@ -3831,6 +4057,9 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
     }
     fsdev_throttle_cleanup(s->ctx.fst);
     g_free(s->tag);
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    qp_table_destroy(&s->qpd_table);
+#endif
     qp_table_destroy(&s->qpp_table);
     qp_table_destroy(&s->qpf_table);
     g_free(s->ctx.fs_root);
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 2b74561030..a94272a504 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -236,13 +236,68 @@ struct V9fsFidState
     V9fsFidState *rclm_lst;
 };
 
-#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+/*
+ * Defines how inode numbers from host shall be remapped on guest.
+ *
+ * When this compile time option is disabled then fixed length (16 bit)
+ * prefix values are used for all inode numbers on guest level. Accordingly
+ * guest's inode numbers will be quite large (>2^48).
+ *
+ * If this option is enabled then variable length suffixes will be used for
+ * guest's inode numbers instead which usually yields in much smaller inode
+ * numbers on guest level (typically around >2^1 .. >2^7).
+ */
+#define P9_VARI_LENGTH_INODE_SUFFIXES 1
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+typedef enum AffixType_t {
+    AffixType_Prefix,
+    AffixType_Suffix, /* A.k.a. postfix. */
+} AffixType_t;
+
+/** @brief Unique affix of variable length.
+ *
+ * An affix is (currently) either a suffix or a prefix, which is either
+ * going to be prepended (prefix) or appended (suffix) with some other
+ * number for the goal to generate unique numbers. Accordingly the
+ * suffixes (or prefixes) we generate @b must all have the mathematical
+ * property of being suffix-free (or prefix-free in case of prefixes)
+ * so that no matter what number we concatenate the affix with, that we
+ * always reliably get unique numbers as result after concatenation.
+ */
+typedef struct VariLenAffix {
+    AffixType_t type; /* Whether this affix is a suffix or a prefix. */
+    uint64_t value; /* Actual numerical value of this affix. */
+    int bits; /* Lenght of the affix, that is how many (of the lowest) bits of @c value must be used for appending/prepending this affix to its final resulting, unique number. */
+} VariLenAffix;
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
+
+#if !P9_VARI_LENGTH_INODE_SUFFIXES
+# define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
+#endif
+
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+
+/* See qid_inode_prefix_hash_bits(). */
+typedef struct {
+    dev_t dev; /* FS device on host. */
+    int prefix_bits; /* How many (high) bits of the original inode number shall be used for hashing. */
+} QpdEntry;
+
+#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
 
 /* QID path prefix entry, see stat_to_qid */
 typedef struct {
     dev_t dev;
     uint16_t ino_prefix;
-    uint16_t qp_prefix;
+    #if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint32_t qp_affix_index;
+    VariLenAffix qp_affix;
+    #else
+    uint16_t qp_affix;
+    #endif
 } QppEntry;
 
 /* QID path full entry, as above */
@@ -274,9 +329,15 @@ struct V9fsState
     V9fsConf fsconf;
     V9fsQID root_qid;
     dev_t dev_id;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    struct qht qpd_table;
+#endif
     struct qht qpp_table;
     struct qht qpf_table;
-    uint16_t qp_prefix_next;
+#if P9_VARI_LENGTH_INODE_SUFFIXES
+    uint64_t qp_ndevices; /* Amount of entries in qpd_table. */
+#endif
+    uint16_t qp_affix_next;
     uint64_t qp_fullpath_next;
 };
 
-- 
2.11.0



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions
@ 2019-06-26 18:57 Christian Schoenebeck via Qemu-devel
  2019-06-26 18:25 ` [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path Christian Schoenebeck via Qemu-devel
                   ` (4 more replies)
  0 siblings, 5 replies; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-26 18:57 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

This is v4 of a proposed patch set for fixing file ID collisions with 9pfs.

v3->v4:

  * Rebased to latest git master head.

  * Splitted Antonios' patch set to its original 4 individual patches.
    (was merged previously as only 1 patch).

  * Addressed discussed issues directly on Antonios' patches
    (was a separate patch before).

  * Added virtfs command line option "remap_inodes": Unless this option
    is not enabled, no inode remapping is performed at all, the user
    just gets an error message when trying to use more than 1 device
    per export.

  * Dropped persistency feature of QIDs beyond reboots.

  * Dropped disputed "vii" feature.

Greg, please check if I am doing anything superfluous in patch 3 regarding
the new command line parameter "remap_inodes".

Daniel, I also have a libvirt patch for this new "remap_inodes" command
line parameter, but I guess I wait for this qemu patch set to get through.

Christian Schoenebeck (5):
  9p: unsigned type for type, version, path
  9p: Treat multiple devices on one export as an error
  9p: Added virtfs option "remap_inodes"
  9p: stat_to_qid: implement slow path
  9p: Use variable length suffixes for inode remapping

 fsdev/9p-marshal.h      |   6 +-
 fsdev/file-op-9p.h      |   1 +
 fsdev/qemu-fsdev-opts.c |   7 +-
 fsdev/qemu-fsdev.c      |   6 +
 hw/9pfs/9p.c            | 448 +++++++++++++++++++++++++++++++++++++++++++++---
 hw/9pfs/9p.h            |  83 +++++++++
 hw/9pfs/trace-events    |  14 +-
 qemu-options.hx         |  17 +-
 vl.c                    |   3 +
 9 files changed, 550 insertions(+), 35 deletions(-)

-- 
2.11.0



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path
  2019-06-26 18:25 ` [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path Christian Schoenebeck via Qemu-devel
@ 2019-06-27 16:12   ` Greg Kurz
  2019-06-28 11:42     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-27 16:12 UTC (permalink / raw)
  To: Christian Schoenebeck via Qemu-devel
  Cc: Daniel P. Berrangé, Christian Schoenebeck, Antonios Motakis

On Wed, 26 Jun 2019 20:25:55 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> There is no need for signedness on these QID fields for 9p.
> 
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>

You should mention here the changes you made on top of Antonios
original patch. Something like:

[CS: - also convert path
     - adapted trace-events and donttouch_stat()]

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  fsdev/9p-marshal.h   |  6 +++---
>  hw/9pfs/9p.c         |  6 +++---
>  hw/9pfs/trace-events | 14 +++++++-------
>  3 files changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h
> index c8823d878f..8f3babb60a 100644
> --- a/fsdev/9p-marshal.h
> +++ b/fsdev/9p-marshal.h
> @@ -9,9 +9,9 @@ typedef struct V9fsString
>  
>  typedef struct V9fsQID
>  {
> -    int8_t type;
> -    int32_t version;
> -    int64_t path;
> +    uint8_t type;
> +    uint32_t version;
> +    uint64_t path;
>  } V9fsQID;
>  
>  typedef struct V9fsStat
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 55821343e5..586a6dccba 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -743,9 +743,9 @@ static int donttouch_stat(V9fsStat *stat)
>  {
>      if (stat->type == -1 &&
>          stat->dev == -1 &&
> -        stat->qid.type == -1 &&
> -        stat->qid.version == -1 &&
> -        stat->qid.path == -1 &&
> +        stat->qid.type == 0xff &&
> +        stat->qid.version == (uint32_t) -1 &&
> +        stat->qid.path == (uint64_t) -1 &&
>          stat->mode == -1 &&
>          stat->atime == -1 &&
>          stat->mtime == -1 &&
> diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
> index c0a0a4ab5d..6964756922 100644
> --- a/hw/9pfs/trace-events
> +++ b/hw/9pfs/trace-events
> @@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
>  v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
>  v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
>  v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
> -v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
> +v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64

I was expecting to see PRIu64 for an uint64_t but I now realize that %d
seems to be used all over the place for unsigned types... :-\

At least, please fix the masks of the lines you're changing in this
patch so that unsigned are passed to "u" or PRIu64. The rest of the
mess can be fixed later in a followup.

>  v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
>  v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
> @@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
>  v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
>  v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
>  v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
> -v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
> -v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
>  v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
> @@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
>  v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
>  v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
>  v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
> -v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
> +v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
>  v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid,  char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
> -v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
> +v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
>  v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
>  v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
>  v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
>  v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
>  v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
> -v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
> +v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
>  v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
>  v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
>  v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
>  v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
>  v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
> -v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
> +v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
>  v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
>  v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
>  v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error
  2019-06-26 18:30 ` [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error Christian Schoenebeck via Qemu-devel
@ 2019-06-27 17:26   ` Greg Kurz
  2019-06-28 12:36     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-27 17:26 UTC (permalink / raw)
  To: Christian Schoenebeck via Qemu-devel
  Cc: Daniel P. Berrangé, Christian Schoenebeck, Antonios Motakis

On Wed, 26 Jun 2019 20:30:41 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> The QID path should uniquely identify a file. However, the
> inode of a file is currently used as the QID path, which
> on its own only uniquely identifies wiles within a device.

s/wile/files

> Here we track the device hosting the 9pfs share, in order
> to prevent security issues with QID path collisions from
> other devices.
> 
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>

You should mention here the changes you made to the original patch.

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++------------
>  hw/9pfs/9p.h |  1 +
>  2 files changed, 58 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 586a6dccba..cbaa212625 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -572,10 +572,20 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
>                                  P9_STAT_MODE_SOCKET)
>  
>  /* This is the algorithm from ufs in spfs */
> -static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
> +static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
>  {
>      size_t size;
>  
> +    if (pdu->s->dev_id == 0) {
> +        pdu->s->dev_id = stbuf->st_dev;

st_dev should be captured in v9fs_device_realize_common() since we
lstat() the root there, instead of every request doing the check.

> +    } else if (pdu->s->dev_id != stbuf->st_dev) {
> +        error_report_once(
> +            "9p: Multiple devices detected in same VirtFS export. "
> +            "You must use a separate export for each device."
> +        );
> +        return -ENOSYS;

This error is likely to end up as the return value of a
syscall in the guest and -ENOSYS usually means the syscall
isn't implemented, which is obviously not the case. Maybe
return -EPERM instead ?

> +    }
> +
>      memset(&qidp->path, 0, sizeof(qidp->path));
>      size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
>      memcpy(&qidp->path, &stbuf->st_ino, size);
> @@ -587,6 +597,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
>      if (S_ISLNK(stbuf->st_mode)) {
>          qidp->type |= P9_QID_TYPE_SYMLINK;
>      }
> +
> +    return 0;
>  }
>  
>  static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
> @@ -599,7 +611,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
>      if (err < 0) {
>          return err;
>      }
> -    stat_to_qid(&stbuf, qidp);
> +    err = stat_to_qid(pdu, &stbuf, qidp);
> +    if (err < 0) {
> +        return err;
> +    }
>      return 0;
>  }
>  
> @@ -830,7 +845,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
>  
>      memset(v9stat, 0, sizeof(*v9stat));
>  
> -    stat_to_qid(stbuf, &v9stat->qid);
> +    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
> +    if (err < 0) {
> +        return err;
> +    }
>      v9stat->mode = stat_to_v9mode(stbuf);
>      v9stat->atime = stbuf->st_atime;
>      v9stat->mtime = stbuf->st_mtime;
> @@ -891,7 +909,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
>  #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
>  
>  
> -static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
> +static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
>                                  V9fsStatDotl *v9lstat)
>  {
>      memset(v9lstat, 0, sizeof(*v9lstat));
> @@ -913,7 +931,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
>      /* Currently we only support BASIC fields in stat */
>      v9lstat->st_result_mask = P9_STATS_BASIC;
>  
> -    stat_to_qid(stbuf, &v9lstat->qid);
> +    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
>  }
>  
>  static void print_sg(struct iovec *sg, int cnt)
> @@ -1115,7 +1133,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
>      uint64_t request_mask;
>      V9fsStatDotl v9stat_dotl;
>      V9fsPDU *pdu = opaque;
> -    V9fsState *s = pdu->s;
>  
>      retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
>      if (retval < 0) {
> @@ -1136,7 +1153,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
>      if (retval < 0) {
>          goto out;
>      }
> -    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
> +    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
> +    if (retval < 0) {
> +        goto out;
> +    }
>  
>      /*  fill st_gen if requested and supported by underlying fs */
>      if (request_mask & P9_STATS_GEN) {
> @@ -1381,7 +1401,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
>              if (err < 0) {
>                  goto out;
>              }
> -            stat_to_qid(&stbuf, &qid);
> +            err = stat_to_qid(pdu, &stbuf, &qid);
> +            if (err < 0) {
> +                goto out;
> +            }
>              v9fs_path_copy(&dpath, &path);
>          }
>          memcpy(&qids[name_idx], &qid, sizeof(qid));
> @@ -1483,7 +1506,10 @@ static void coroutine_fn v9fs_open(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      if (S_ISDIR(stbuf.st_mode)) {
>          err = v9fs_co_opendir(pdu, fidp);
>          if (err < 0) {
> @@ -1593,7 +1619,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
>          fidp->flags |= FID_NON_RECLAIMABLE;
>      }
>      iounit =  get_iounit(pdu, &fidp->path);
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
>      if (err < 0) {
>          goto out;
> @@ -2327,7 +2356,10 @@ static void coroutine_fn v9fs_create(void *opaque)
>          }
>      }
>      iounit = get_iounit(pdu, &fidp->path);
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
>      if (err < 0) {
>          goto out;
> @@ -2384,7 +2416,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err =  pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3064,7 +3099,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3222,7 +3260,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
>      if (err < 0) {
>          goto out;
>      }
> -    stat_to_qid(&stbuf, &qid);
> +    err = stat_to_qid(pdu, &stbuf, &qid);
> +    if (err < 0) {
> +        goto out;
> +    }
>      err = pdu_marshal(pdu, offset, "Q", &qid);
>      if (err < 0) {
>          goto out;
> @@ -3633,6 +3674,8 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>          goto out;
>      }
>  
> +    s->dev_id = 0;
> +

Set it to stat->st_dev after lstat() was called later in this function.

>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
>  
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 8883761b2c..5e316178d5 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -256,6 +256,7 @@ struct V9fsState
>      Error *migration_blocker;
>      V9fsConf fsconf;
>      V9fsQID root_qid;
> +    dev_t dev_id;
>  };
>  
>  /* 9p2000.L open flags */



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-26 18:42 ` [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes" Christian Schoenebeck via Qemu-devel
@ 2019-06-28 10:09   ` Greg Kurz
  2019-06-28 13:47     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 10:09 UTC (permalink / raw)
  To: Christian Schoenebeck via Qemu-devel
  Cc: Daniel P. Berrangé, Christian Schoenebeck, Antonios Motakis

On Wed, 26 Jun 2019 20:42:13 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> To support multiple devices on the 9p share, and avoid
> qid path collisions we take the device id as input
> to generate a unique QID path. The lowest 48 bits of
> the path will be set equal to the file inode, and the
> top bits will be uniquely assigned based on the top
> 16 bits of the inode and the device id.
> 
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>

Same remark about changes to the original patch.

BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
generation without calling stat_to_qid().

See discussion here:

https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html

I guess you should ensure in a preliminary patch that QIDs only
come out of stat_to_qid().

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  fsdev/file-op-9p.h      |   1 +
>  fsdev/qemu-fsdev-opts.c |   7 +++-
>  fsdev/qemu-fsdev.c      |   6 +++
>  hw/9pfs/9p.c            | 105 ++++++++++++++++++++++++++++++++++++++++++------
>  hw/9pfs/9p.h            |  12 ++++++
>  qemu-options.hx         |  17 +++++++-
>  vl.c                    |   3 ++
>  7 files changed, 135 insertions(+), 16 deletions(-)
> 
> diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h
> index c757c8099f..6c1663c252 100644
> --- a/fsdev/file-op-9p.h
> +++ b/fsdev/file-op-9p.h
> @@ -59,6 +59,7 @@ typedef struct ExtendedOps {
>  #define V9FS_RDONLY                 0x00000040
>  #define V9FS_PROXY_SOCK_FD          0x00000080
>  #define V9FS_PROXY_SOCK_NAME        0x00000100
> +#define V9FS_REMAP_INODES           0x00000200
>  
>  #define V9FS_SEC_MASK               0x0000003C
>  
> diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c
> index 7c31ffffaf..64a8052266 100644
> --- a/fsdev/qemu-fsdev-opts.c
> +++ b/fsdev/qemu-fsdev-opts.c
> @@ -31,7 +31,9 @@ static QemuOptsList qemu_fsdev_opts = {
>          }, {
>              .name = "readonly",
>              .type = QEMU_OPT_BOOL,
> -
> +        }, {
> +            .name = "remap_inodes",
> +            .type = QEMU_OPT_BOOL,
>          }, {
>              .name = "socket",
>              .type = QEMU_OPT_STRING,
> @@ -76,6 +78,9 @@ static QemuOptsList qemu_virtfs_opts = {
>              .name = "readonly",
>              .type = QEMU_OPT_BOOL,
>          }, {
> +            .name = "remap_inodes",
> +            .type = QEMU_OPT_BOOL,
> +        }, {
>              .name = "socket",
>              .type = QEMU_OPT_STRING,
>          }, {
> diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c
> index 077a8c4e2b..b6fa9799be 100644
> --- a/fsdev/qemu-fsdev.c
> +++ b/fsdev/qemu-fsdev.c
> @@ -121,6 +121,7 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
>      const char *fsdev_id = qemu_opts_id(opts);
>      const char *fsdriver = qemu_opt_get(opts, "fsdriver");
>      const char *writeout = qemu_opt_get(opts, "writeout");
> +    bool remap_inodes = qemu_opt_get_bool(opts, "remap_inodes", 0);
>      bool ro = qemu_opt_get_bool(opts, "readonly", 0);
>  
>      if (!fsdev_id) {
> @@ -161,6 +162,11 @@ int qemu_fsdev_add(QemuOpts *opts, Error **errp)
>      } else {
>          fsle->fse.export_flags &= ~V9FS_RDONLY;
>      }
> +    if (remap_inodes) {
> +        fsle->fse.export_flags |= V9FS_REMAP_INODES;
> +    } else {
> +        fsle->fse.export_flags &= ~V9FS_REMAP_INODES;
> +    }
>  
>      if (fsle->fse.ops->parse_opts) {
>          if (fsle->fse.ops->parse_opts(opts, &fsle->fse, errp)) {
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index cbaa212625..7ccc68a829 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -25,6 +25,7 @@
>  #include "trace.h"
>  #include "migration/blocker.h"
>  #include "sysemu/qtest.h"
> +#include "qemu/xxhash.h"
>  
>  int open_fd_hw;
>  int total_open_fd;
> @@ -571,24 +572,96 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
>                                  P9_STAT_MODE_NAMED_PIPE |   \
>                                  P9_STAT_MODE_SOCKET)
>  
> -/* This is the algorithm from ufs in spfs */
> +
> +/* creative abuse of tb_hash_func7, which is based on xxhash */
> +static uint32_t qpp_hash(QppEntry e)
> +{
> +    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
> +}
> +
> +static bool qpp_lookup_func(const void *obj, const void *userp)
> +{
> +    const QppEntry *e1 = obj, *e2 = userp;
> +    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
> +}
> +
> +static void qpp_table_remove(void *p, uint32_t h, void *up)
> +{
> +    g_free(p);
> +}
> +
> +static void qpp_table_destroy(struct qht *ht)
> +{
> +    qht_iter(ht, qpp_table_remove, NULL);
> +    qht_destroy(ht);
> +}
> +
> +/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> + * to a unique QID path (64 bits). To avoid having to map and keep track
> + * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> + * the device id to the 16 highest bits of the QID path. The 48 lowest bits
> + * of the QID path equal to the lowest bits of the inode number.
> + *
> + * This takes advantage of the fact that inode number are usually not
> + * random but allocated sequentially, so we have fewer items to keep
> + * track of.
> + */
> +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> +                                uint64_t *path)
> +{
> +    QppEntry lookup = {
> +        .dev = stbuf->st_dev,
> +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> +    }, *val;
> +    uint32_t hash = qpp_hash(lookup);
> +
> +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> +
> +    if (!val) {
> +        if (pdu->s->qp_prefix_next == 0) {
> +            /* we ran out of prefixes */

And we won't ever be able to allocate a new one. Maybe worth
adding an error_report_once() to inform the user ?

> +            return -ENFILE;
> +        }
> +
> +        val = g_malloc0(sizeof(QppEntry));
> +        *val = lookup;
> +
> +        /* new unique inode prefix and device combo */
> +        val->qp_prefix = pdu->s->qp_prefix_next++;
> +        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
> +    }
> +
> +    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> +    return 0;
> +}
> +
>  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
>  {
> -    size_t size;
> +    int err;
>  
> -    if (pdu->s->dev_id == 0) {
> -        pdu->s->dev_id = stbuf->st_dev;
> -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> -        error_report_once(
> -            "9p: Multiple devices detected in same VirtFS export. "
> -            "You must use a separate export for each device."
> -        );
> -        return -ENOSYS;
> +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> +        /* map inode+device to qid path (fast path) */
> +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> +        if (err) {
> +            return err;
> +        }
> +    } else {
> +        if (pdu->s->dev_id == 0) {
> +            pdu->s->dev_id = stbuf->st_dev;
> +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> +            error_report_once(
> +                "9p: Multiple devices detected in same VirtFS export. "
> +                "You must either use a separate export for each device "
> +                "shared from host or enable virtfs option 'remap_inodes'."
> +            );
> +            return -ENOSYS;
> +        }
> +        size_t size;

From CODING_STYLE:

5. Declarations

Mixed declarations (interleaving statements and declarations within
blocks) are generally not allowed; declarations should be at the beginning
of blocks.

Please do so for "size" and add an extra blank line.

> +        memset(&qidp->path, 0, sizeof(qidp->path));
> +        size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> +        memcpy(&qidp->path, &stbuf->st_ino, size);
>      }
>  
> -    memset(&qidp->path, 0, sizeof(qidp->path));
> -    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
> -    memcpy(&qidp->path, &stbuf->st_ino, size);
>      qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
>      qidp->type = 0;
>      if (S_ISDIR(stbuf->st_mode)) {
> @@ -3676,6 +3749,10 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>  
>      s->dev_id = 0;
>  
> +    /* QID path hash table. 1 entry ought to be enough for anybody ;) */
> +    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
> +    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
> +
>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
>  
> @@ -3689,6 +3766,7 @@ out:
>          }
>          g_free(s->tag);
>          g_free(s->ctx.fs_root);
> +        qpp_table_destroy(&s->qpp_table);
>          v9fs_path_free(&path);
>      }
>      return rc;
> @@ -3701,6 +3779,7 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>      }
>      fsdev_throttle_cleanup(s->ctx.fst);
>      g_free(s->tag);
> +    qpp_table_destroy(&s->qpp_table);
>      g_free(s->ctx.fs_root);
>  }
>  
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 5e316178d5..0200e04176 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -8,6 +8,7 @@
>  #include "fsdev/9p-iov-marshal.h"
>  #include "qemu/thread.h"
>  #include "qemu/coroutine.h"
> +#include "qemu/qht.h"
>  
>  enum {
>      P9_TLERROR = 6,
> @@ -235,6 +236,15 @@ struct V9fsFidState
>      V9fsFidState *rclm_lst;
>  };
>  
> +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)

This won't give the expected result on a 32-bit host. Since this
is a mask for 64-bit entities, it should rather be:

#define QPATH_INO_MASK        ((1ULL << 48) - 1)

> +
> +/* QID path prefix entry, see stat_to_qid */
> +typedef struct {
> +    dev_t dev;
> +    uint16_t ino_prefix;
> +    uint16_t qp_prefix;
> +} QppEntry;
> +
>  struct V9fsState
>  {
>      QLIST_HEAD(, V9fsPDU) free_list;
> @@ -257,6 +267,8 @@ struct V9fsState
>      V9fsConf fsconf;
>      V9fsQID root_qid;
>      dev_t dev_id;
> +    struct qht qpp_table;
> +    uint16_t qp_prefix_next;
>  };
>  
>  /* 9p2000.L open flags */
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 0d8beb4afd..e7ea136da1 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -1334,7 +1334,7 @@ ETEXI
>  
>  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
>      "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n"
> -    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n"
> +    "        [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,remap_inodes]\n"

This feature applies to all backends IIUC. We don't really care for the
synth backend since it generates non-colliding inode numbers by design,
but the proxy backend has the same issue as local. So...

>      "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n"
>      "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n"

... please update these two ^^ as well.

>      "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n",
> @@ -1342,7 +1342,7 @@ DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
>  
>  STEXI
>  
> -@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}]
> +@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,remap_inodes]
>  @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
>  @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]

Ditto.

>  @itemx -virtfs synth,mount_tag=@var{mount_tag}
> @@ -1398,6 +1398,19 @@ Specifies the default mode for newly created directories on the host. Works
>  only with security models "mapped-xattr" and "mapped-file".
>  @item mount_tag=@var{mount_tag}
>  Specifies the tag name to be used by the guest to mount this export point.
> +@item remap_inodes
> +By default virtfs 9p supports only one device per export in order to avoid
> +file ID collisions on guest which may otherwise happen because the original
> +device IDs from host are not passed and exposed on guest. Instead all files
> +of an export shared with virtfs do have the same device id on guest. So two
> +files with identical inode numbers but from actually different devices on
> +host would otherwise cause a file ID collision and hence potential
> +misbehaviours on guest. For that reason it is recommended to create a
> +separate virtfs export for each device to be shared with guests. However
> +you may also enable "remap_inodes" which allows you to share multiple
> +devices with only one export instead, which is achieved by remapping the
> +original inode numbers from host to guest in a way that would prevent such
> +collisions.
>  @end table
>  ETEXI
>  
> diff --git a/vl.c b/vl.c
> index 99a56b5556..de9d7b846c 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -3457,6 +3457,9 @@ int main(int argc, char **argv, char **envp)
>                  qemu_opt_set_bool(fsdev, "readonly",
>                                    qemu_opt_get_bool(opts, "readonly", 0),
>                                    &error_abort);
> +                qemu_opt_set_bool(fsdev, "remap_inodes",
> +                                  qemu_opt_get_bool(opts, "remap_inodes", 0),
> +                                  &error_abort);
>                  device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
>                                            &error_abort);
>                  qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path
  2019-06-26 18:46 ` [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path Christian Schoenebeck via Qemu-devel
@ 2019-06-28 10:21   ` Greg Kurz
  2019-06-28 14:03     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 10:21 UTC (permalink / raw)
  To: Christian Schoenebeck via Qemu-devel
  Cc: Daniel P. Berrangé, Christian Schoenebeck, Antonios Motakis

On Wed, 26 Jun 2019 20:46:24 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> stat_to_qid attempts via qid_path_prefixmap to map unique files (which are
> identified by 64 bit inode nr and 32 bit device id) to a 64 QID path value.
> However this implementation makes some assumptions about inode number
> generation on the host.
> 
> If qid_path_prefixmap fails, we still have 48 bits available in the QID
> path to fall back to a less memory efficient full mapping.
> 
> Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  hw/9pfs/9p.h |  9 +++++++++
>  2 files changed, 67 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index 7ccc68a829..e6e410972f 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -579,23 +579,69 @@ static uint32_t qpp_hash(QppEntry e)
>      return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
>  }
>  
> +static uint32_t qpf_hash(QpfEntry e)
> +{
> +    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
> +}
> +
>  static bool qpp_lookup_func(const void *obj, const void *userp)
>  {
>      const QppEntry *e1 = obj, *e2 = userp;
>      return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
>  }
>  
> -static void qpp_table_remove(void *p, uint32_t h, void *up)
> +static bool qpf_lookup_func(const void *obj, const void *userp)
> +{
> +    const QpfEntry *e1 = obj, *e2 = userp;
> +    return e1->dev == e2->dev && e1->ino == e2->ino;
> +}
> +
> +static void qp_table_remove(void *p, uint32_t h, void *up)
>  {
>      g_free(p);
>  }
>  
> -static void qpp_table_destroy(struct qht *ht)
> +static void qp_table_destroy(struct qht *ht)
>  {
> -    qht_iter(ht, qpp_table_remove, NULL);
> +    qht_iter(ht, qp_table_remove, NULL);
>      qht_destroy(ht);
>  }
>  
> +static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
> +                            uint64_t *path)
> +{
> +    QpfEntry lookup = {
> +        .dev = stbuf->st_dev,
> +        .ino = stbuf->st_ino
> +    }, *val;
> +    uint32_t hash = qpf_hash(lookup);
> +
> +    /* most users won't need the fullmap, so init the table lazily */
> +    if (!pdu->s->qpf_table.map) {
> +        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
> +    }
> +
> +    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
> +
> +    if (!val) {
> +        if (pdu->s->qp_fullpath_next == 0) {
> +            /* no more files can be mapped :'( */

This would be the place to put the error_report_once() suggested
in the previous patch actually.

> +            return -ENFILE;
> +        }
> +
> +        val = g_malloc0(sizeof(QppEntry));
> +        *val = lookup;
> +
> +        /* new unique inode and device combo */
> +        val->path = pdu->s->qp_fullpath_next++;
> +        pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
> +        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
> +    }
> +
> +    *path = val->path;
> +    return 0;
> +}
> +
>  /* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
>   * to a unique QID path (64 bits). To avoid having to map and keep track
>   * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
> @@ -642,6 +688,10 @@ static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
>      if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
>          /* map inode+device to qid path (fast path) */
>          err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> +        if (err == -ENFILE) {
> +            /* fast path didn't work, fall back to full map */
> +            err = qid_path_fullmap(pdu, stbuf, &qidp->path);
> +        }
>          if (err) {
>              return err;
>          }
> @@ -3752,6 +3802,7 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>      /* QID path hash table. 1 entry ought to be enough for anybody ;) */
>      qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
>      s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
> +    s->qp_fullpath_next = 1;
>  
>      s->ctx.fst = &fse->fst;
>      fsdev_throttle_init(s->ctx.fst);
> @@ -3766,7 +3817,8 @@ out:
>          }
>          g_free(s->tag);
>          g_free(s->ctx.fs_root);
> -        qpp_table_destroy(&s->qpp_table);
> +        qp_table_destroy(&s->qpp_table);
> +        qp_table_destroy(&s->qpf_table);
>          v9fs_path_free(&path);
>      }
>      return rc;
> @@ -3779,7 +3831,8 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>      }
>      fsdev_throttle_cleanup(s->ctx.fst);
>      g_free(s->tag);
> -    qpp_table_destroy(&s->qpp_table);
> +    qp_table_destroy(&s->qpp_table);
> +    qp_table_destroy(&s->qpf_table);

I'm starting to think v9fs_device_unrealize_common() should be made
idempotent, so that it can be used to handle rollback of a partially
realized device, and thus avoid the code duplication. But this is
out-of-scope for this series.

LGTM.

>      g_free(s->ctx.fs_root);
>  }
>  
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 0200e04176..2b74561030 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -245,6 +245,13 @@ typedef struct {
>      uint16_t qp_prefix;
>  } QppEntry;
>  
> +/* QID path full entry, as above */
> +typedef struct {
> +    dev_t dev;
> +    ino_t ino;
> +    uint64_t path;
> +} QpfEntry;
> +
>  struct V9fsState
>  {
>      QLIST_HEAD(, V9fsPDU) free_list;
> @@ -268,7 +275,9 @@ struct V9fsState
>      V9fsQID root_qid;
>      dev_t dev_id;
>      struct qht qpp_table;
> +    struct qht qpf_table;
>      uint16_t qp_prefix_next;
> +    uint64_t qp_fullpath_next;
>  };
>  
>  /* 9p2000.L open flags */



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path
  2019-06-27 16:12   ` Greg Kurz
@ 2019-06-28 11:42     ` Christian Schoenebeck via Qemu-devel
  2019-06-28 12:06       ` Greg Kurz
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-28 11:42 UTC (permalink / raw)
  To: qemu-devel
  Cc: Christian Schoenebeck, Daniel P. Berrangé,
	Greg Kurz, Antonios Motakis

On Donnerstag, 27. Juni 2019 18:12:03 CEST Greg Kurz wrote:
> On Wed, 26 Jun 2019 20:25:55 +0200
> Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > There is no need for signedness on these QID fields for 9p.
> > 
> > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> 
> You should mention here the changes you made on top of Antonios
> original patch. Something like:
> 
> [CS: - also convert path
>      - adapted trace-events and donttouch_stat()]

Haven't seen that comment style in the git logs. Any example hash for that?

> > diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
> > index c0a0a4ab5d..6964756922 100644
> > --- a/hw/9pfs/trace-events
> > +++ b/hw/9pfs/trace-events
> > @@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id
> > %d err %d"> 
> >  v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag
> >  %d id %d msize %d version %s" v9fs_version_return(uint16_t tag, uint8_t
> >  id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
> >  v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char*
> >  uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"> 
> > -v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t
> > version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
> > +v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t
> > version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
> I was expecting to see PRIu64 for an uint64_t but I now realize that %d
> seems to be used all over the place for unsigned types... :-\
> 
> At least, please fix the masks of the lines you're changing in this
> patch so that unsigned are passed to "u" or PRIu64. The rest of the
> mess can be fixed later in a followup.

If you don't mind I will restrict it to your latter suggestion for now, that 
is adjusting it using the short format specifiers e.g. "u", the rest would IMO 
be out of the scope of this patch series.

Too bad that no format specifier warnings are thrown on these.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping
  2019-06-26 18:52 ` [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping Christian Schoenebeck via Qemu-devel
@ 2019-06-28 11:50   ` Greg Kurz
  2019-06-28 14:56     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 11:50 UTC (permalink / raw)
  To: Christian Schoenebeck via Qemu-devel
  Cc: Daniel P. Berrangé, Christian Schoenebeck, Antonios Motakis

On Wed, 26 Jun 2019 20:52:09 +0200
Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:

> Use variable length suffixes for inode remapping instead of the fixed 16
> bit size prefixes before. With this change the inode numbers on guest will
> typically be much smaller (e.g. around >2^1 .. >2^7 instead of >2^48 with
> the previous fixed size inode remapping.
> 
> Additionally this solution should be more efficient, since inode numbers in
> practice can take almost their entire 64 bit range on guest as well. Which
> might also be beneficial for nested virtualization.
> 
> The "Exponential Golomb" algorithm is used as basis for generating the
> variable length suffixes. The algorithm has a paramter k which controls the
> distribution of bits on increasing indeces (minimum bits at low index vs.
> maximum bits at high index). With k=0 the generated suffixes look like:
> 
> Index Dec/Bin -> Generated Suffix Bin
> 1 [1] -> [1] (1 bits)
> 2 [10] -> [010] (3 bits)
> 3 [11] -> [110] (3 bits)
> 4 [100] -> [00100] (5 bits)
> 5 [101] -> [10100] (5 bits)
> 6 [110] -> [01100] (5 bits)
> 7 [111] -> [11100] (5 bits)
> 8 [1000] -> [0001000] (7 bits)
> 9 [1001] -> [1001000] (7 bits)
> 10 [1010] -> [0101000] (7 bits)
> 11 [1011] -> [1101000] (7 bits)
> 12 [1100] -> [0011000] (7 bits)
> ...
> 65533 [1111111111111101] ->  [1011111111111111000000000000000] (31 bits)
> 65534 [1111111111111110] ->  [0111111111111111000000000000000] (31 bits)
> 65535 [1111111111111111] ->  [1111111111111111000000000000000] (31 bits)
> Hence minBits=1 maxBits=31
> 
> And with k=5 they would look like:
> 
> Index Dec/Bin -> Generated Suffix Bin
> 1 [1] -> [000001] (6 bits)
> 2 [10] -> [100001] (6 bits)
> 3 [11] -> [010001] (6 bits)
> 4 [100] -> [110001] (6 bits)
> 5 [101] -> [001001] (6 bits)
> 6 [110] -> [101001] (6 bits)
> 7 [111] -> [011001] (6 bits)
> 8 [1000] -> [111001] (6 bits)
> 9 [1001] -> [000101] (6 bits)
> 10 [1010] -> [100101] (6 bits)
> 11 [1011] -> [010101] (6 bits)
> 12 [1100] -> [110101] (6 bits)
> ...
> 65533 [1111111111111101] -> [0011100000000000100000000000] (28 bits)
> 65534 [1111111111111110] -> [1011100000000000100000000000] (28 bits)
> 65535 [1111111111111111] -> [0111100000000000100000000000] (28 bits)
> Hence minBits=6 maxBits=28
> 

IIUC, this k control parameter should be constant for the
lifetime of QIDs. So it all boils down to choose a _good_
value that would cover most scenarios, right ?

For now, I just have some _cosmetic_ remarks.

> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  hw/9pfs/9p.h |  67 ++++++++++++++-
>  2 files changed, 312 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index e6e410972f..46c9f11384 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -26,6 +26,7 @@
>  #include "migration/blocker.h"
>  #include "sysemu/qtest.h"
>  #include "qemu/xxhash.h"
> +#include <math.h>
>  
>  int open_fd_hw;
>  int total_open_fd;
> @@ -572,6 +573,123 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
>                                  P9_STAT_MODE_NAMED_PIPE |   \
>                                  P9_STAT_MODE_SOCKET)
>  
> +#if P9_VARI_LENGTH_INODE_SUFFIXES

The numerous locations guarded by P9_VARI_LENGTH_INODE_SUFFIXES
really obfuscate the code, and don't ease review (for me at least).
And anyway, if we go for variable length suffixes, we won't keep
the fixed length prefix code.

> +
> +/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
> +static inline uint8_t mirror8bit(uint8_t byte) {

From CODING_STYLE:

4. Block structure

[...]

for reasons of tradition and clarity it comes on a line by itself:

    void a_function(void)
    {
        do_something();
    }

> +    return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
> +}
> +
> +/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
> +static inline uint64_t mirror64bit(uint64_t value) {

Ditto.

> +    return ((uint64_t)mirror8bit( value        & 0xff) << 56) |
> +           ((uint64_t)mirror8bit((value >> 8)  & 0xff) << 48) |
> +           ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
> +           ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
> +           ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
> +           ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
> +           ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8 ) |
> +           ((uint64_t)mirror8bit((value >> 56) & 0xff)      ) ;
> +}
> +
> +/* Parameter k for the Exponential Golomb algorihm to be used.
> + *
> + * The smaller this value, the smaller the minimum bit count for the Exp.
> + * Golomb generated affixes will be (at lowest index) however for the
> + * price of having higher maximum bit count of generated affixes (at highest
> + * index). Likewise increasing this parameter yields in smaller maximum bit
> + * count for the price of having higher minimum bit count.

Forgive my laziness but what are the benefits of a smaller or larger
value, in term of user experience ?

> + */
> +#define EXP_GOLOMB_K    0
> +
> +# if !EXP_GOLOMB_K
> +
> +/** @brief Exponential Golomb algorithm limited to the case k=0.
> + *

This doesn't really help to have a special implementation for k=0. The
resulting function is nearly identical to the general case. It is likely
that the compiler can optimize it and generate the same code.

> + * See expGolombEncode() below for details.
> + *
> + * @param n - natural number (or index) of the prefix to be generated
> + *            (1, 2, 3, ...)
> + */
> +static VariLenAffix expGolombEncodeK0(uint64_t n) {
> +    const int bits = (int) log2(n) + 1;
> +    return (VariLenAffix) {
> +        .type = AffixType_Prefix,
> +        .value = n,
> +        .bits = bits + bits - 1
> +    };
> +}
> +
> +# else
> +
> +/** @brief Exponential Golomb algorithm for arbitrary k (including k=0).
> + *
> + * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
> + * with growing length and with the mathematical property of being
> + * "prefix-free". The latter means the generated prefixes can be prepended
> + * in front of arbitrary numbers and the resulting concatenated numbers are
> + * guaranteed to be always unique.
> + *
> + * This is a minor adjustment to the original Exp. Golomb algorithm in the
> + * sense that lowest allowed index (@param n) starts with 1, not with zero.
> + *
> + * @param n - natural number (or index) of the prefix to be generated
> + *            (1, 2, 3, ...)
> + * @param k - parameter k of Exp. Golomb algorithm to be used
> + *            (see comment on EXP_GOLOMB_K macro for details about k)
> + */
> +static VariLenAffix expGolombEncode(uint64_t n, int k) {

Function.

> +    const uint64_t value = n + (1 << k) - 1;
> +    const int bits = (int) log2(value) + 1;
> +    return (VariLenAffix) {
> +        .type = AffixType_Prefix,
> +        .value = value,
> +        .bits = bits + MAX((bits - 1 - k), 0)
> +    };
> +}
> +
> +# endif /* !EXP_GOLOMB_K */
> +
> +/** @brief Converts a suffix into a prefix, or a prefix into a suffix.
> + *
> + * What this function does is actually mirroring all bits of the affix value,

Drop the "What this function does..." wording and use an imperative style
instead, ie. "Mirror all bits of..."

> + * with the purpose to preserve respectively the mathematical "prefix-free"
> + * or "suffix-free" property after the conversion.
> + *
> + * In other words: if a passed prefix is suitable to create unique numbers,
> + * then the returned suffix is suitable to create unique numbers as well
> + * (and vice versa).
> + */
> +static VariLenAffix invertAffix(const VariLenAffix* affix) {

Function.

> +    return (VariLenAffix) {
> +        .type = (affix->type == AffixType_Suffix) ? AffixType_Prefix : AffixType_Suffix,
> +        .value =  mirror64bit(affix->value) >> ((sizeof(affix->value) * 8) - affix->bits),
> +        .bits = affix->bits
> +    };
> +}
> +
> +/** @brief Generates suffix numbers with "suffix-free" property.
> + *
> + * This is just a wrapper function on top of the Exp. Golomb algorithm.
> + *
> + * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
> + * this function converts the Exp. Golomb prefixes into appropriate suffixes
> + * which are still suitable for generating unique numbers.
> + *
> + * @param n - natural number (or index) of the suffix to be generated
> + *            (1, 2, 3, ...)
> + */
> +static VariLenAffix affixForIndex(uint64_t index) {

Function.

> +    VariLenAffix prefix;
> +    #if EXP_GOLOMB_K
> +    prefix = expGolombEncode(index, EXP_GOLOMB_K);
> +    #else
> +    prefix = expGolombEncodeK0(index);
> +    #endif
> +    return invertAffix(&prefix); /* convert prefix to suffix */
> +}
> +
> +#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
>  
>  /* creative abuse of tb_hash_func7, which is based on xxhash */
>  static uint32_t qpp_hash(QppEntry e)
> @@ -584,13 +702,23 @@ static uint32_t qpf_hash(QpfEntry e)
>      return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
>  }
>  
> -static bool qpp_lookup_func(const void *obj, const void *userp)
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +
> +static bool qpd_cmp_func(const void *obj, const void *userp)
> +{
> +    const QpdEntry *e1 = obj, *e2 = userp;
> +    return e1->dev == e2->dev;
> +}
> +
> +#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
> +
> +static bool qpp_cmp_func(const void *obj, const void *userp)
>  {
>      const QppEntry *e1 = obj, *e2 = userp;
>      return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
>  }
>  
> -static bool qpf_lookup_func(const void *obj, const void *userp)
> +static bool qpf_cmp_func(const void *obj, const void *userp)
>  {
>      const QpfEntry *e1 = obj, *e2 = userp;
>      return e1->dev == e2->dev && e1->ino == e2->ino;
> @@ -607,6 +735,58 @@ static void qp_table_destroy(struct qht *ht)
>      qht_destroy(ht);
>  }
>  
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +
> +/*
> + * Returns how many (high end) bits of inode numbers of the passed fs
> + * device shall be used (in combination with the device number) to
> + * generate hash values for qpp_table entries.
> + *
> + * This function is required if variable length suffixes are used for inode
> + * number mapping on guest level. Since a device may end up having multiple
> + * entries in qpp_table, each entry most probably with a different suffix
> + * length, we thus need this function in conjunction with qpd_table to
> + * "agree" about a fix amount of bits (per device) to be always used for
> + * generating hash values for the purpose of accessing qpp_table in order
> + * get consistent behaviour when accessing qpp_table.
> + */
> +static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
> +{
> +    QpdEntry lookup = {
> +        .dev = dev
> +    }, *val;
> +    uint32_t hash = dev;
> +    VariLenAffix affix;
> +
> +    val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
> +    if (!val) {
> +        val = g_malloc0(sizeof(QpdEntry));
> +        *val = lookup;
> +        affix = affixForIndex(pdu->s->qp_affix_next);
> +        val->prefix_bits = affix.bits;
> +        qht_insert(&pdu->s->qpd_table, val, hash, NULL);
> +        pdu->s->qp_ndevices++;
> +    }
> +    return val->prefix_bits;
> +}
> +
> +#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
> +
> +/** @brief Slow / full mapping host inode nr -> guest inode nr.
> + *
> + * This function performs a slower and much more costly remapping of an
> + * original file inode number on host to an appropriate different inode
> + * number on guest. For every (dev, inode) combination on host a new
> + * sequential number is generated, cached and exposed as inode number on
> + * guest.
> + *
> + * This is just a "last resort" fallback solution if the much faster/cheaper
> + * qid_path_prefixmap() failed. In practice this slow / full mapping is not
> + * expected ever to be used at all though.
> + *
> + * @see qid_path_prefixmap() for details
> + *
> + */
>  static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>                              uint64_t *path)
>  {
> @@ -615,11 +795,9 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>          .ino = stbuf->st_ino
>      }, *val;
>      uint32_t hash = qpf_hash(lookup);
> -
> -    /* most users won't need the fullmap, so init the table lazily */
> -    if (!pdu->s->qpf_table.map) {
> -        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
> -    }
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    VariLenAffix affix;
> +#endif
>  

Move this declaration to the beginning of the block.

>      val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
>  
> @@ -633,8 +811,16 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>          *val = lookup;
>  
>          /* new unique inode and device combo */
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +        affix = affixForIndex(
> +            1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
> +        );
> +        val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
> +        pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
> +#else
>          val->path = pdu->s->qp_fullpath_next++;
>          pdu->s->qp_fullpath_next &= QPATH_INO_MASK;
> +#endif
>          qht_insert(&pdu->s->qpf_table, val, hash, NULL);
>      }
>  
> @@ -642,42 +828,71 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
>      return 0;
>  }
>  
> -/* stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
> +/** @brief Quick mapping host inode nr -> guest inode nr.
> + *
> + * This function performs quick remapping of an original file inode number
> + * on host to an appropriate different inode number on guest. This remapping
> + * of inodes is required to avoid inode nr collisions on guest which would
> + * happen if the 9p export contains more than 1 exported file system (or
> + * more than 1 file system data set), because unlike on host level where the
> + * files would have different device nrs, all files exported by 9p would
> + * share the same device nr on guest (the device nr of the virtual 9p device
> + * that is).
> + *
> + * if P9_VARI_LENGTH_INODE_SUFFIXES == 0 :
> + * stat_to_qid needs to map inode number (64 bits) and device id (32 bits)
>   * to a unique QID path (64 bits). To avoid having to map and keep track
>   * of up to 2^64 objects, we map only the 16 highest bits of the inode plus
>   * the device id to the 16 highest bits of the QID path. The 48 lowest bits
>   * of the QID path equal to the lowest bits of the inode number.
>   *
> - * This takes advantage of the fact that inode number are usually not
> - * random but allocated sequentially, so we have fewer items to keep
> - * track of.

Hmm... why dropping this comment ?

> + * if P9_VARI_LENGTH_INODE_SUFFIXES == 1 :
> + * Instead of fixed size (16 bit) generated prefix, we use variable size
> + * suffixes instead. See comment on P9_VARI_LENGTH_INODE_SUFFIXES for
> + * details.
>   */
>  static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
>                                  uint64_t *path)
>  {
> +#if P9_VARI_LENGTH_INODE_SUFFIXES

Starting from here, the patch has too many P9_VARI_LENGTH_INODE_SUFFIXES
guards for my laziness and available time... I'll rather wait for the next
round where both approaches don't coexist.

Cheers,

--
Greg


> +    const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
> +#endif
>      QppEntry lookup = {
>          .dev = stbuf->st_dev,
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +        .ino_prefix = (uint16_t) (stbuf->st_ino >> (64-ino_hash_bits))
> +#else
>          .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> +#endif
>      }, *val;
>      uint32_t hash = qpp_hash(lookup);
>  
>      val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
>  
>      if (!val) {
> -        if (pdu->s->qp_prefix_next == 0) {
> -            /* we ran out of prefixes */
> +        if (pdu->s->qp_affix_next == 0) {
> +            /* we ran out of affixes */
>              return -ENFILE;
>          }
>  
>          val = g_malloc0(sizeof(QppEntry));
>          *val = lookup;
>  
> -        /* new unique inode prefix and device combo */
> -        val->qp_prefix = pdu->s->qp_prefix_next++;
> +        /* new unique inode affix and device combo */
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +        val->qp_affix_index = pdu->s->qp_affix_next++;
> +        val->qp_affix = affixForIndex(val->qp_affix_index);
> +#else
> +        val->qp_affix = pdu->s->qp_affix_next++;
> +#endif
>          qht_insert(&pdu->s->qpp_table, val, hash, NULL);
>      }
> -
> -    *path = ((uint64_t)val->qp_prefix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    /* assuming generated affix to be suffix type, not prefix */
> +    *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
> +#else
> +    *path = ((uint64_t)val->qp_affix << 48) | (stbuf->st_ino & QPATH_INO_MASK);
> +#endif
>      return 0;
>  }
>  
> @@ -3799,9 +4014,17 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
>  
>      s->dev_id = 0;
>  
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    qht_init(&s->qpd_table, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
> +#endif
> +    /* most users won't need the fullmap, so init the table lazily */
> +    qht_init(&s->qpf_table, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
>      /* QID path hash table. 1 entry ought to be enough for anybody ;) */
> -    qht_init(&s->qpp_table, qpp_lookup_func, 1, QHT_MODE_AUTO_RESIZE);
> -    s->qp_prefix_next = 1; /* reserve 0 to detect overflow */
> +    qht_init(&s->qpp_table, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    s->qp_ndevices = 0;
> +#endif
> +    s->qp_affix_next = 1; /* reserve 0 to detect overflow */
>      s->qp_fullpath_next = 1;
>  
>      s->ctx.fst = &fse->fst;
> @@ -3817,6 +4040,9 @@ out:
>          }
>          g_free(s->tag);
>          g_free(s->ctx.fs_root);
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +        qp_table_destroy(&s->qpd_table);
> +#endif
>          qp_table_destroy(&s->qpp_table);
>          qp_table_destroy(&s->qpf_table);
>          v9fs_path_free(&path);
> @@ -3831,6 +4057,9 @@ void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
>      }
>      fsdev_throttle_cleanup(s->ctx.fst);
>      g_free(s->tag);
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    qp_table_destroy(&s->qpd_table);
> +#endif
>      qp_table_destroy(&s->qpp_table);
>      qp_table_destroy(&s->qpf_table);
>      g_free(s->ctx.fs_root);
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 2b74561030..a94272a504 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -236,13 +236,68 @@ struct V9fsFidState
>      V9fsFidState *rclm_lst;
>  };
>  
> -#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> +/*
> + * Defines how inode numbers from host shall be remapped on guest.
> + *
> + * When this compile time option is disabled then fixed length (16 bit)
> + * prefix values are used for all inode numbers on guest level. Accordingly
> + * guest's inode numbers will be quite large (>2^48).
> + *
> + * If this option is enabled then variable length suffixes will be used for
> + * guest's inode numbers instead which usually yields in much smaller inode
> + * numbers on guest level (typically around >2^1 .. >2^7).
> + */
> +#define P9_VARI_LENGTH_INODE_SUFFIXES 1
> +
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +
> +typedef enum AffixType_t {
> +    AffixType_Prefix,
> +    AffixType_Suffix, /* A.k.a. postfix. */
> +} AffixType_t;
> +
> +/** @brief Unique affix of variable length.
> + *
> + * An affix is (currently) either a suffix or a prefix, which is either
> + * going to be prepended (prefix) or appended (suffix) with some other
> + * number for the goal to generate unique numbers. Accordingly the
> + * suffixes (or prefixes) we generate @b must all have the mathematical
> + * property of being suffix-free (or prefix-free in case of prefixes)
> + * so that no matter what number we concatenate the affix with, that we
> + * always reliably get unique numbers as result after concatenation.
> + */
> +typedef struct VariLenAffix {
> +    AffixType_t type; /* Whether this affix is a suffix or a prefix. */
> +    uint64_t value; /* Actual numerical value of this affix. */
> +    int bits; /* Lenght of the affix, that is how many (of the lowest) bits of @c value must be used for appending/prepending this affix to its final resulting, unique number. */
> +} VariLenAffix;
> +
> +#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
> +
> +#if !P9_VARI_LENGTH_INODE_SUFFIXES
> +# define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> +#endif
> +
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +
> +/* See qid_inode_prefix_hash_bits(). */
> +typedef struct {
> +    dev_t dev; /* FS device on host. */
> +    int prefix_bits; /* How many (high) bits of the original inode number shall be used for hashing. */
> +} QpdEntry;
> +
> +#endif /* P9_VARI_LENGTH_INODE_SUFFIXES */
>  
>  /* QID path prefix entry, see stat_to_qid */
>  typedef struct {
>      dev_t dev;
>      uint16_t ino_prefix;
> -    uint16_t qp_prefix;
> +    #if P9_VARI_LENGTH_INODE_SUFFIXES
> +    uint32_t qp_affix_index;
> +    VariLenAffix qp_affix;
> +    #else
> +    uint16_t qp_affix;
> +    #endif
>  } QppEntry;
>  
>  /* QID path full entry, as above */
> @@ -274,9 +329,15 @@ struct V9fsState
>      V9fsConf fsconf;
>      V9fsQID root_qid;
>      dev_t dev_id;
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    struct qht qpd_table;
> +#endif
>      struct qht qpp_table;
>      struct qht qpf_table;
> -    uint16_t qp_prefix_next;
> +#if P9_VARI_LENGTH_INODE_SUFFIXES
> +    uint64_t qp_ndevices; /* Amount of entries in qpd_table. */
> +#endif
> +    uint16_t qp_affix_next;
>      uint64_t qp_fullpath_next;
>  };
>  



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path
  2019-06-28 11:42     ` Christian Schoenebeck via Qemu-devel
@ 2019-06-28 12:06       ` Greg Kurz
  0 siblings, 0 replies; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 12:06 UTC (permalink / raw)
  To: Christian Schoenebeck
  Cc: Daniel P. Berrangé, qemu-devel, Antonios Motakis

On Fri, 28 Jun 2019 13:42:43 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 27. Juni 2019 18:12:03 CEST Greg Kurz wrote:
> > On Wed, 26 Jun 2019 20:25:55 +0200
> > Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > > There is no need for signedness on these QID fields for 9p.
> > > 
> > > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> > 
> > You should mention here the changes you made on top of Antonios
> > original patch. Something like:
> > 
> > [CS: - also convert path
> >      - adapted trace-events and donttouch_stat()]
> 
> Haven't seen that comment style in the git logs. Any example hash for that?
> 

$ git log | egrep '^[[:space:]]*\[' | head -15
    [Commit message tweaked]
    [Superfluous #include dropped]
    [Comment reformatted to make checkpatch.pl happy, #include <dirent.h>
    [monitor_is_qmp() tidied up to make checkpatch.pl happy,
    [Header guard symbol tidied up, superfluous #include dropped, FIXME in
    [sortcmdlist() cleaned up to make checkpatch.pl happy]
    [Superfluous variable in monitor_data_destroy() eliminated, whitespace
    [Superfluous variable in monitor_data_destroy() eliminated]
    [Zero initialization of Monitor moved from monitor_data_init() to
        [ ... ]
        [ ... ]
    [mreitz: Dropped superfluous printf from _filter_offsets, as suggested
    [mreitz: Adjusted commit message as per John's proposal]
    [mreitz: Moved from 250 to 256]
    [AJB: fix conflicts with tests/vm: Port basevm to Python 3]

This is something you should do when re-posting someone else's patch
with modifications.

> > > diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
> > > index c0a0a4ab5d..6964756922 100644
> > > --- a/hw/9pfs/trace-events
> > > +++ b/hw/9pfs/trace-events
> > > @@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id
> > > %d err %d"> 
> > >  v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag
> > >  %d id %d msize %d version %s" v9fs_version_return(uint16_t tag, uint8_t
> > >  id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
> > >  v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char*
> > >  uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"> 
> > > -v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t
> > > version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
> > > +v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t
> > > version, uint64_t path) "tag %d id %d type %d version %d path %"PRId64
> > I was expecting to see PRIu64 for an uint64_t but I now realize that %d
> > seems to be used all over the place for unsigned types... :-\
> > 
> > At least, please fix the masks of the lines you're changing in this
> > patch so that unsigned are passed to "u" or PRIu64. The rest of the
> > mess can be fixed later in a followup.
> 
> If you don't mind I will restrict it to your latter suggestion for now, that 
> is adjusting it using the short format specifiers e.g. "u", the rest would IMO 
> be out of the scope of this patch series.
> 

Sure.

> Too bad that no format specifier warnings are thrown on these.
> 

Yeah :-\

> Best regards,
> Christian Schoenebeck



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error
  2019-06-27 17:26   ` Greg Kurz
@ 2019-06-28 12:36     ` Christian Schoenebeck via Qemu-devel
  2019-06-28 12:47       ` Greg Kurz
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-28 12:36 UTC (permalink / raw)
  To: qemu-devel
  Cc: Christian Schoenebeck, Daniel P. Berrangé,
	Greg Kurz, Antonios Motakis

On Donnerstag, 27. Juni 2019 19:26:22 CEST Greg Kurz wrote:
> On Wed, 26 Jun 2019 20:30:41 +0200
> 
> Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > The QID path should uniquely identify a file. However, the
> > inode of a file is currently used as the QID path, which
> > on its own only uniquely identifies wiles within a device.
> 
> s/wile/files

Ah right. :)

> > Here we track the device hosting the 9pfs share, in order
> > to prevent security issues with QID path collisions from
> > other devices.
> > 
> > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> 
> You should mention here the changes you made to the original patch.

Got it. Will do for the other cases as well of course.

> > -static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
> > +static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> > *qidp)> 
> >  {
> >  
> >      size_t size;
> > 
> > +    if (pdu->s->dev_id == 0) {
> > +        pdu->s->dev_id = stbuf->st_dev;
> 
> st_dev should be captured in v9fs_device_realize_common() since we
> lstat() the root there, instead of every request doing the check.

Ok.

> > +    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > +        error_report_once(
> > +            "9p: Multiple devices detected in same VirtFS export. "
> > +            "You must use a separate export for each device."
> > +        );
> > +        return -ENOSYS;
> 
> This error is likely to end up as the return value of a
> syscall in the guest and -ENOSYS usually means the syscall
> isn't implemented, which is obviously not the case. Maybe
> return -EPERM instead ?

I would rather suggest -ENODEV. The entire device of the requested file/dir is 
not available on guest.

-EPERM IMO rather motivates users looking for file system permission settings 
on individual files intead, and probably not checking the host's logs for the 
detailled error message.

> > @@ -3633,6 +3674,8 @@ int v9fs_device_realize_common(V9fsState *s, const
> > V9fsTransport *t,> 
> >          goto out;
> >      
> >      }
> > 
> > +    s->dev_id = 0;
> > +
> 
> Set it to stat->st_dev after lstat() was called later in this function.

I guesst you mean "earlier" not "later". The lstat() call is just before that 
dev_id initalization line. But in general your suggestion makes sense of 
course.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error
  2019-06-28 12:36     ` Christian Schoenebeck via Qemu-devel
@ 2019-06-28 12:47       ` Greg Kurz
  0 siblings, 0 replies; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 12:47 UTC (permalink / raw)
  To: Christian Schoenebeck
  Cc: Daniel P. Berrangé, qemu-devel, Antonios Motakis

On Fri, 28 Jun 2019 14:36:41 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 27. Juni 2019 19:26:22 CEST Greg Kurz wrote:
> > On Wed, 26 Jun 2019 20:30:41 +0200
> > 
> > Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > > The QID path should uniquely identify a file. However, the
> > > inode of a file is currently used as the QID path, which
> > > on its own only uniquely identifies wiles within a device.
> > 
> > s/wile/files
> 
> Ah right. :)
> 
> > > Here we track the device hosting the 9pfs share, in order
> > > to prevent security issues with QID path collisions from
> > > other devices.
> > > 
> > > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> > 
> > You should mention here the changes you made to the original patch.
> 
> Got it. Will do for the other cases as well of course.
> 

Cool.

> > > -static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
> > > +static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> > > *qidp)> 
> > >  {
> > >  
> > >      size_t size;
> > > 
> > > +    if (pdu->s->dev_id == 0) {
> > > +        pdu->s->dev_id = stbuf->st_dev;
> > 
> > st_dev should be captured in v9fs_device_realize_common() since we
> > lstat() the root there, instead of every request doing the check.
> 
> Ok.
> 
> > > +    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > > +        error_report_once(
> > > +            "9p: Multiple devices detected in same VirtFS export. "
> > > +            "You must use a separate export for each device."
> > > +        );
> > > +        return -ENOSYS;
> > 
> > This error is likely to end up as the return value of a
> > syscall in the guest and -ENOSYS usually means the syscall
> > isn't implemented, which is obviously not the case. Maybe
> > return -EPERM instead ?
> 
> I would rather suggest -ENODEV. The entire device of the requested file/dir is 
> not available on guest.
> 
> -EPERM IMO rather motivates users looking for file system permission settings 
> on individual files intead, and probably not checking the host's logs for the 
> detailled error message.
> 

-ENODEV is ok with me then.

> > > @@ -3633,6 +3674,8 @@ int v9fs_device_realize_common(V9fsState *s, const
> > > V9fsTransport *t,> 
> > >          goto out;
> > >      
> > >      }
> > > 
> > > +    s->dev_id = 0;
> > > +
> > 
> > Set it to stat->st_dev after lstat() was called later in this function.
> 
> I guesst you mean "earlier" not "later". The lstat() call is just before that 
> dev_id initalization line. But in general your suggestion makes sense of 
> course.
> 

Oops... "earlier" indeed :)

> Best regards,
> Christian Schoenebeck



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-28 10:09   ` Greg Kurz
@ 2019-06-28 13:47     ` Christian Schoenebeck via Qemu-devel
  2019-06-28 14:23       ` Greg Kurz
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-28 13:47 UTC (permalink / raw)
  To: qemu-devel
  Cc: Christian Schoenebeck, Daniel P. Berrangé,
	Greg Kurz, Antonios Motakis

On Freitag, 28. Juni 2019 12:09:31 CEST Greg Kurz wrote:
> On Wed, 26 Jun 2019 20:42:13 +0200
> 
> Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > To support multiple devices on the 9p share, and avoid
> > qid path collisions we take the device id as input
> > to generate a unique QID path. The lowest 48 bits of
> > the path will be set equal to the file inode, and the
> > top bits will be uniquely assigned based on the top
> > 16 bits of the inode and the device id.
> > 
> > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> 
> Same remark about changes to the original patch.

ack_once();   :)

> BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
> generation without calling stat_to_qid().
> 
> See discussion here:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html
> 
> I guess you should ensure in a preliminary patch that QIDs only
> come out of stat_to_qid().

Mja, actually I first omitted your suggestion consciously, because I first 
thought it was an overkill pure visibility issue lmited to the default case 
remap_inodes==false, but now that I look at it again, it is actually an issue 
even when remap_inodes==true since dirent would expose wrong inode numbers on 
guest as well.

I will see what to do about it. However about your other concern here, quote:

	"Also, if we hit a collision while reading the directory, I'm
	 afraid the remaining entries won't be read at all. I'm not
	 sure this is really what we want."

That's however still a concern here that I would consider overkill to address. 
I mean if a user gets into that situation then because of a configuration error 
that must be corrected by user; the point of this patch set is to prevent 
undefined behaviour and to make the user aware about the root cause of the 
overall issue; the purpose is not to address all possible issues while there 
is still a configuration error.

> > +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> > +                                uint64_t *path)
> > +{
> > +    QppEntry lookup = {
> > +        .dev = stbuf->st_dev,
> > +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> > +    }, *val;
> > +    uint32_t hash = qpp_hash(lookup);
> > +
> > +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> > +
> > +    if (!val) {
> > +        if (pdu->s->qp_prefix_next == 0) {
> > +            /* we ran out of prefixes */
> 
> And we won't ever be able to allocate a new one. Maybe worth
> adding an error_report_once() to inform the user ?

Yeah, I thought about that as well. Will do.

> >  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> >  *qidp) {
> > 
> > -    size_t size;
> > +    int err;
> > 
> > -    if (pdu->s->dev_id == 0) {
> > -        pdu->s->dev_id = stbuf->st_dev;
> > -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > -        error_report_once(
> > -            "9p: Multiple devices detected in same VirtFS export. "
> > -            "You must use a separate export for each device."
> > -        );
> > -        return -ENOSYS;
> > +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> > +        /* map inode+device to qid path (fast path) */
> > +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> > +        if (err) {
> > +            return err;
> > +        }
> > +    } else {
> > +        if (pdu->s->dev_id == 0) {
> > +            pdu->s->dev_id = stbuf->st_dev;
> > +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> > +            error_report_once(
> > +                "9p: Multiple devices detected in same VirtFS export. "
> > +                "You must either use a separate export for each device "
> > +                "shared from host or enable virtfs option
> > 'remap_inodes'."
> > +            );
> > +            return -ENOSYS;
> > +        }
> > +        size_t size;
> 
> From CODING_STYLE:
> 
> 5. Declarations
> 
> Mixed declarations (interleaving statements and declarations within
> blocks) are generally not allowed; declarations should be at the beginning
> of blocks.
> 
> Please do so for "size" and add an extra blank line.

Ok.

> > +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> 
> This won't give the expected result on a 32-bit host. Since this
> is a mask for 64-bit entities, it should rather be:
> 
> #define QPATH_INO_MASK        ((1ULL << 48) - 1)

Correct, will fix it.

> > diff --git a/qemu-options.hx b/qemu-options.hx
> > index 0d8beb4afd..e7ea136da1 100644
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
> > @@ -1334,7 +1334,7 @@ ETEXI
> > 
> >  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
> >  
> >      "-virtfs
> >      local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-fil
> >      e|passthrough|none\n"> 
> > -    "       
> > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" +
> >    "       
> > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,rem
> > ap_inodes]\n"
> This feature applies to all backends IIUC. We don't really care for the
> synth backend since it generates non-colliding inode numbers by design,
> but the proxy backend has the same issue as local. So...

Yeah, I was not sure about these, because I did not even know what these two 
were for exactly. :)  [ lazyness disclaimer end]

Will do for the other manual locations you mentioned as well.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path
  2019-06-28 10:21   ` Greg Kurz
@ 2019-06-28 14:03     ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-28 14:03 UTC (permalink / raw)
  To: qemu-devel
  Cc: Christian Schoenebeck, Daniel P. Berrangé,
	Greg Kurz, Antonios Motakis

On Freitag, 28. Juni 2019 12:21:20 CEST Greg Kurz wrote:
> > +static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
> > +                            uint64_t *path)
> > +{
> > +    QpfEntry lookup = {
> > +        .dev = stbuf->st_dev,
> > +        .ino = stbuf->st_ino
> > +    }, *val;
> > +    uint32_t hash = qpf_hash(lookup);
> > +
> > +    /* most users won't need the fullmap, so init the table lazily */
> > +    if (!pdu->s->qpf_table.map) {
> > +        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16,
> > QHT_MODE_AUTO_RESIZE); +    }
> > +
> > +    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
> > +
> > +    if (!val) {
> > +        if (pdu->s->qp_fullpath_next == 0) {
> > +            /* no more files can be mapped :'( */
> 
> This would be the place to put the error_report_once() suggested
> in the previous patch actually.

I will add the suggested error message to qid_path_prefixmap() in patch 3 and 
then will move over that error message to qid_path_fullmap() in patch 4.

Or if you want I can also leave an error_report_once() in qid_path_prefixmap() 
in patch 4 about potential degraded performance.

> > @@ -3779,7 +3831,8 @@ void v9fs_device_unrealize_common(V9fsState *s,
> > Error **errp)> 
> >      }
> >      fsdev_throttle_cleanup(s->ctx.fst);
> >      g_free(s->tag);
> > 
> > -    qpp_table_destroy(&s->qpp_table);
> > +    qp_table_destroy(&s->qpp_table);
> > +    qp_table_destroy(&s->qpf_table);
> 
> I'm starting to think v9fs_device_unrealize_common() should be made
> idempotent, so that it can be used to handle rollback of a partially
> realized device, and thus avoid the code duplication. But this is
> out-of-scope for this series.

Well, I can also make that e.g.:

	if (s->qpf_table.map)
		qp_table_destroy(&s->qpf_table);

if you prefer the occurrence amount to be reduced.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-28 13:47     ` Christian Schoenebeck via Qemu-devel
@ 2019-06-28 14:23       ` Greg Kurz
  2019-06-29 10:20         ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Greg Kurz @ 2019-06-28 14:23 UTC (permalink / raw)
  To: Christian Schoenebeck
  Cc: Daniel P. Berrangé, qemu-devel, Antonios Motakis

On Fri, 28 Jun 2019 15:47:52 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Freitag, 28. Juni 2019 12:09:31 CEST Greg Kurz wrote:
> > On Wed, 26 Jun 2019 20:42:13 +0200
> > 
> > Christian Schoenebeck via Qemu-devel <qemu-devel@nongnu.org> wrote:
> > > To support multiple devices on the 9p share, and avoid
> > > qid path collisions we take the device id as input
> > > to generate a unique QID path. The lowest 48 bits of
> > > the path will be set equal to the file inode, and the
> > > top bits will be uniquely assigned based on the top
> > > 16 bits of the inode and the device id.
> > > 
> > > Signed-off-by: Antonios Motakis <antonios.motakis@huawei.com>
> > 
> > Same remark about changes to the original patch.
> 
> ack_once();   :)
> 

:)

> > BTW, I had a concern with the way v9fs_do_readdir() open-codes QID
> > generation without calling stat_to_qid().
> > 
> > See discussion here:
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2018-02/msg02724.html
> > 
> > I guess you should ensure in a preliminary patch that QIDs only
> > come out of stat_to_qid().
> 
> Mja, actually I first omitted your suggestion consciously, because I first 
> thought it was an overkill pure visibility issue lmited to the default case 
> remap_inodes==false, but now that I look at it again, it is actually an issue 
> even when remap_inodes==true since dirent would expose wrong inode numbers on 
> guest as well.
> 
> I will see what to do about it. However about your other concern here, quote:
> 
> 	"Also, if we hit a collision while reading the directory, I'm
> 	 afraid the remaining entries won't be read at all. I'm not
> 	 sure this is really what we want."
> 
> That's however still a concern here that I would consider overkill to address. 
> I mean if a user gets into that situation then because of a configuration error 
> that must be corrected by user; the point of this patch set is to prevent 
> undefined behaviour and to make the user aware about the root cause of the 
> overall issue; the purpose is not to address all possible issues while there 
> is still a configuration error.
> 

Fair enough. And anyway, if we really need to address that, it can be done
later.

> > > +static int qid_path_prefixmap(V9fsPDU *pdu, const struct stat *stbuf,
> > > +                                uint64_t *path)
> > > +{
> > > +    QppEntry lookup = {
> > > +        .dev = stbuf->st_dev,
> > > +        .ino_prefix = (uint16_t) (stbuf->st_ino >> 48)
> > > +    }, *val;
> > > +    uint32_t hash = qpp_hash(lookup);
> > > +
> > > +    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
> > > +
> > > +    if (!val) {
> > > +        if (pdu->s->qp_prefix_next == 0) {
> > > +            /* we ran out of prefixes */
> > 
> > And we won't ever be able to allocate a new one. Maybe worth
> > adding an error_report_once() to inform the user ?
> 
> Yeah, I thought about that as well. Will do.
> 
> > >  static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID
> > >  *qidp) {
> > > 
> > > -    size_t size;
> > > +    int err;
> > > 
> > > -    if (pdu->s->dev_id == 0) {
> > > -        pdu->s->dev_id = stbuf->st_dev;
> > > -    } else if (pdu->s->dev_id != stbuf->st_dev) {
> > > -        error_report_once(
> > > -            "9p: Multiple devices detected in same VirtFS export. "
> > > -            "You must use a separate export for each device."
> > > -        );
> > > -        return -ENOSYS;
> > > +    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
> > > +        /* map inode+device to qid path (fast path) */
> > > +        err = qid_path_prefixmap(pdu, stbuf, &qidp->path);
> > > +        if (err) {
> > > +            return err;
> > > +        }
> > > +    } else {
> > > +        if (pdu->s->dev_id == 0) {
> > > +            pdu->s->dev_id = stbuf->st_dev;
> > > +        } else if (pdu->s->dev_id != stbuf->st_dev) {
> > > +            error_report_once(
> > > +                "9p: Multiple devices detected in same VirtFS export. "
> > > +                "You must either use a separate export for each device "
> > > +                "shared from host or enable virtfs option
> > > 'remap_inodes'."
> > > +            );
> > > +            return -ENOSYS;
> > > +        }
> > > +        size_t size;
> > 
> > From CODING_STYLE:
> > 
> > 5. Declarations
> > 
> > Mixed declarations (interleaving statements and declarations within
> > blocks) are generally not allowed; declarations should be at the beginning
> > of blocks.
> > 
> > Please do so for "size" and add an extra blank line.
> 
> Ok.
> 
> > > +#define QPATH_INO_MASK        (((unsigned long)1 << 48) - 1)
> > 
> > This won't give the expected result on a 32-bit host. Since this
> > is a mask for 64-bit entities, it should rather be:
> > 
> > #define QPATH_INO_MASK        ((1ULL << 48) - 1)
> 
> Correct, will fix it.
> 
> > > diff --git a/qemu-options.hx b/qemu-options.hx
> > > index 0d8beb4afd..e7ea136da1 100644
> > > --- a/qemu-options.hx
> > > +++ b/qemu-options.hx
> > > @@ -1334,7 +1334,7 @@ ETEXI
> > > 
> > >  DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
> > >  
> > >      "-virtfs
> > >      local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-fil
> > >      e|passthrough|none\n"> 
> > > -    "       
> > > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" +
> > >    "       
> > > [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,rem
> > > ap_inodes]\n"
> > This feature applies to all backends IIUC. We don't really care for the
> > synth backend since it generates non-colliding inode numbers by design,
> > but the proxy backend has the same issue as local. So...
> 
> Yeah, I was not sure about these, because I did not even know what these two 
> were for exactly. :)  [ lazyness disclaimer end]
> 

"proxy" is a backend where all I/O accesses are performed by a separate
process running the virtfs-proxy-helper command. It runs with root
privileges, which provides the same level of functionality as "local"
with security_model=passthrough. It also chroot() into the shared
folder for extra security. But it is slower since it all requests
still go through the virtio-9p device in QEMU. This would call
for a vhost-9p implementation, but it's yet another story.

"synth" is a software pseudo-backend, currently used to test 9pfs
with QTest (see tests/virtio-9p-test.c).

> Will do for the other manual locations you mentioned as well.
> 
> Best regards,
> Christian Schoenebeck



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping
  2019-06-28 11:50   ` Greg Kurz
@ 2019-06-28 14:56     ` Christian Schoenebeck via Qemu-devel
  2019-06-29 11:01       ` Christian Schoenebeck via Qemu-devel
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-28 14:56 UTC (permalink / raw)
  To: qemu-devel
  Cc: Christian Schoenebeck, Daniel P. Berrangé,
	Greg Kurz, Antonios Motakis

On Freitag, 28. Juni 2019 13:50:15 CEST Greg Kurz wrote:
> > And with k=5 they would look like:
> > 
> > Index Dec/Bin -> Generated Suffix Bin
> > 1 [1] -> [000001] (6 bits)
> > 2 [10] -> [100001] (6 bits)
> > 3 [11] -> [010001] (6 bits)
> > 4 [100] -> [110001] (6 bits)
> > 5 [101] -> [001001] (6 bits)
> > 6 [110] -> [101001] (6 bits)
> > 7 [111] -> [011001] (6 bits)
> > 8 [1000] -> [111001] (6 bits)
> > 9 [1001] -> [000101] (6 bits)
> > 10 [1010] -> [100101] (6 bits)
> > 11 [1011] -> [010101] (6 bits)
> > 12 [1100] -> [110101] (6 bits)
> > ...
> > 65533 [1111111111111101] -> [0011100000000000100000000000] (28 bits)
> > 65534 [1111111111111110] -> [1011100000000000100000000000] (28 bits)
> > 65535 [1111111111111111] -> [0111100000000000100000000000] (28 bits)
> > Hence minBits=6 maxBits=28
> 
> IIUC, this k control parameter should be constant for the
> lifetime of QIDs. So it all boils down to choose a _good_
> value that would cover most scenarios, right ?

That's correct. In the end it's just a matter of how many devices do you 
expect to be exposed with the same 9p export for choosing an appropriate value 
for k. That parameter k must be constant at least until guest is rebooted, 
otherwise you would end up with completely different inode numbers if you 
would change that parameter k while guest is still running.

Like I mentioned before, I can send a short C file if you want to play around 
with that parameter k to see how the generated suffixes would look like. The 
console output is actually like shown above. Additionally the C demo also 
checks and proofs that all generated suffixes really generate unique numbers for 
the entire possible 64 bit range, so that should take away the scare about 
what this algorithm does.

Since you said before that you find it already exotic to have more than 1 
device being exported by 9p, I hence did not even bother to make that 
parameter "k" a runtime parameter. I *think* k=0 should be fine in practice.

> For now, I just have some _cosmetic_ remarks.
> 
> > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > ---
> > 
> >  hw/9pfs/9p.c | 267
> >  ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- hw/9pfs/9p.h
> >  |  67 ++++++++++++++-
> >  2 files changed, 312 insertions(+), 22 deletions(-)
> > 
> > diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> > index e6e410972f..46c9f11384 100644
> > --- a/hw/9pfs/9p.c
> > +++ b/hw/9pfs/9p.c
> > @@ -26,6 +26,7 @@
> > 
> >  #include "migration/blocker.h"
> >  #include "sysemu/qtest.h"
> >  #include "qemu/xxhash.h"
> > 
> > +#include <math.h>
> > 
> >  int open_fd_hw;
> >  int total_open_fd;
> > 
> > @@ -572,6 +573,123 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
> > 
> >                                  P9_STAT_MODE_NAMED_PIPE |   \
> >                                  P9_STAT_MODE_SOCKET)
> > 
> > +#if P9_VARI_LENGTH_INODE_SUFFIXES
> 
> The numerous locations guarded by P9_VARI_LENGTH_INODE_SUFFIXES
> really obfuscate the code, and don't ease review (for me at least).
> And anyway, if we go for variable length suffixes, we won't keep
> the fixed length prefix code.

I just did that to provide a direct comparison between the fixed size prefix vs. 
variable size suffix code, since the fixed size prefix code is easier to 
understand.

If you want I can add a 6th "cleanup" patch that would remove the fixed size 
prefix code and all the #ifs ?

> > +
> > +/* Mirrors all bits of a byte. So e.g. binary 10100000 would become
> > 00000101. */ +static inline uint8_t mirror8bit(uint8_t byte) {
> 
> From CODING_STYLE:
> 
> 4. Block structure
> 
> [...]
> 
> for reasons of tradition and clarity it comes on a line by itself:
> 
>     void a_function(void)
>     {
>         do_something();
>     }

Got it.

> > +/* Parameter k for the Exponential Golomb algorihm to be used.
> > + *
> > + * The smaller this value, the smaller the minimum bit count for the Exp.
> > + * Golomb generated affixes will be (at lowest index) however for the
> > + * price of having higher maximum bit count of generated affixes (at
> > highest + * index). Likewise increasing this parameter yields in smaller
> > maximum bit + * count for the price of having higher minimum bit count.
> 
> Forgive my laziness but what are the benefits of a smaller or larger
> value, in term of user experience ?

Well, the goal in general is too keep the generated suffix numbers (or their bit 
count actually) as small as possible, because you are cutting away the same 
amount of bits of the orginal inode number from host. So the smaller the 
generated suffix number is, the more bits you can simply directly use from the 
original inode number from host, and hence the cheaper this remap code becomes 
in practice. Because if you have e.g. a suffix number of just 3 bits, then in 
practice you will very likely only have exactly 1 entry in that hash table 
*ever*. So hash lookup will be very cheap, etc.

If you had a suffix number of 32 bit instead that would mean you would need to 
cut away 32 bits from the original inode numbers, and hence you would likely 
end up with multiple entries in the hash table and the remap code becomes more 
expensive due to the hash table lookups, and even worse, you might even end up 
getting into that "full map" code which generates a hash entry for every next 
inode.

In practice this issue becomes probably more relevant when nested 
virtualization becomes more popular OR if you are using a large number of 
devices on the same export.

> > + */
> > +#define EXP_GOLOMB_K    0
> > +
> > +# if !EXP_GOLOMB_K
> > +
> > +/** @brief Exponential Golomb algorithm limited to the case k=0.
> > + *
> 
> This doesn't really help to have a special implementation for k=0. The
> resulting function is nearly identical to the general case. It is likely
> that the compiler can optimize it and generate the same code.

I don't think the compiler's optimizer would really drop all the instructions 
automatically of the generalized variant of that particular function. Does it 
matter in practice? No, I actually just provided that optimized variant for 
the special case k=0 because I think k=0 will be the default value for that 
parameter and because you were already picky about a simple

	if (pdu->s->dev_id == 0)

to be optimized. In practice users won't feel the runtime difference either 
one of the two optimization scenarios.

> > +/** @brief Exponential Golomb algorithm for arbitrary k (including k=0).
> > + *
> > + * The Exponential Golomb algorithm generates @b prefixes (@b not
> > suffixes!) + * with growing length and with the mathematical property of
> > being + * "prefix-free". The latter means the generated prefixes can be
> > prepended + * in front of arbitrary numbers and the resulting
> > concatenated numbers are + * guaranteed to be always unique.
> > + *
> > + * This is a minor adjustment to the original Exp. Golomb algorithm in
> > the
> > + * sense that lowest allowed index (@param n) starts with 1, not with
> > zero. + *
> > + * @param n - natural number (or index) of the prefix to be generated
> > + *            (1, 2, 3, ...)
> > + * @param k - parameter k of Exp. Golomb algorithm to be used
> > + *            (see comment on EXP_GOLOMB_K macro for details about k)
> > + */
> > +static VariLenAffix expGolombEncode(uint64_t n, int k) {
> 
> Function.

ack_once();

> > +    const uint64_t value = n + (1 << k) - 1;
> > +    const int bits = (int) log2(value) + 1;
> > +    return (VariLenAffix) {
> > +        .type = AffixType_Prefix,
> > +        .value = value,
> > +        .bits = bits + MAX((bits - 1 - k), 0)
> > +    };
> > +}
> > +
> > +# endif /* !EXP_GOLOMB_K */
> > +
> > +/** @brief Converts a suffix into a prefix, or a prefix into a suffix.
> > + *
> > + * What this function does is actually mirroring all bits of the affix
> > value,
> Drop the "What this function does..." wording and use an imperative style
> instead, ie. "Mirror all bits of..."

Ok.

> >  static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
> >  
> >                              uint64_t *path)
> >  
> >  {
> > 
> > @@ -615,11 +795,9 @@ static int qid_path_fullmap(V9fsPDU *pdu, const
> > struct stat *stbuf,> 
> >          .ino = stbuf->st_ino
> >      
> >      }, *val;
> >      uint32_t hash = qpf_hash(lookup);
> > 
> > -
> > -    /* most users won't need the fullmap, so init the table lazily */
> > -    if (!pdu->s->qpf_table.map) {
> > -        qht_init(&pdu->s->qpf_table, qpf_lookup_func, 1 << 16,
> > QHT_MODE_AUTO_RESIZE); -    }
> > +#if P9_VARI_LENGTH_INODE_SUFFIXES
> > +    VariLenAffix affix;
> > +#endif
> 
> Move this declaration to the beginning of the block.

Ok.

> > -/* stat_to_qid needs to map inode number (64 bits) and device id (32
> > bits)
> > +/** @brief Quick mapping host inode nr -> guest inode nr.
> > + *
> > + * This function performs quick remapping of an original file inode
> > number
> > + * on host to an appropriate different inode number on guest. This
> > remapping + * of inodes is required to avoid inode nr collisions on guest
> > which would + * happen if the 9p export contains more than 1 exported
> > file system (or + * more than 1 file system data set), because unlike on
> > host level where the + * files would have different device nrs, all files
> > exported by 9p would + * share the same device nr on guest (the device nr
> > of the virtual 9p device + * that is).
> > + *
> > + * if P9_VARI_LENGTH_INODE_SUFFIXES == 0 :
> > + * stat_to_qid needs to map inode number (64 bits) and device id (32
> > bits)
> > 
> >   * to a unique QID path (64 bits). To avoid having to map and keep track
> >   * of up to 2^64 objects, we map only the 16 highest bits of the inode
> >   plus
> >   * the device id to the 16 highest bits of the QID path. The 48 lowest
> >   bits
> >   * of the QID path equal to the lowest bits of the inode number.
> >   *
> > 
> > - * This takes advantage of the fact that inode number are usually not
> > - * random but allocated sequentially, so we have fewer items to keep
> > - * track of.
> 
> Hmm... why dropping this comment ?

Because I found the entire function comment block became already too verbose 
and that particular sentence to be not relevant. At least IMO.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-28 14:23       ` Greg Kurz
@ 2019-06-29 10:20         ` Christian Schoenebeck via Qemu-devel
  2019-07-02  8:01           ` Greg Kurz
  0 siblings, 1 reply; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-29 10:20 UTC (permalink / raw)
  To: qemu-devel
  Cc: Daniel P. Berrangé,
	Christian Schoenebeck, Greg Kurz, Antonios Motakis

On Freitag, 28. Juni 2019 16:23:08 CEST Greg Kurz wrote:
> > > This feature applies to all backends IIUC. We don't really care for the
> > > synth backend since it generates non-colliding inode numbers by design,
> > > but the proxy backend has the same issue as local. So...
> > 
> > Yeah, I was not sure about these, because I did not even know what these
> > two were for exactly. :)  [ lazyness disclaimer end]
> 
> "proxy" is a backend where all I/O accesses are performed by a separate
> process running the virtfs-proxy-helper command. It runs with root
> privileges, which provides the same level of functionality as "local"
> with security_model=passthrough. It also chroot() into the shared
> folder for extra security. But it is slower since it all requests
> still go through the virtio-9p device in QEMU. This would call
> for a vhost-9p implementation, but it's yet another story.
> 
> "synth" is a software pseudo-backend, currently used to test 9pfs
> with QTest (see tests/virtio-9p-test.c).

Thanks for the clarification!

So the proxy backend sounds like an idea that has not been implemented fully 
to its end. I guess it is not really used in production environments, right? 
What is the actual motivation for this proxy backend?

And now that I look at it, I am a bit surprised that there is this pure Unix 
pipe socket based proxy variant, but no TCPIP network socket variant. I mean 
the latter is AFAIK the original idea behind the 9p protocol and IMO might be 
interesting to physical separate pure storage backends that way.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping
  2019-06-28 14:56     ` Christian Schoenebeck via Qemu-devel
@ 2019-06-29 11:01       ` Christian Schoenebeck via Qemu-devel
  0 siblings, 0 replies; 22+ messages in thread
From: Christian Schoenebeck via Qemu-devel @ 2019-06-29 11:01 UTC (permalink / raw)
  To: qemu-devel, Christian Schoenebeck
  Cc: Daniel P. Berrangé, Greg Kurz, Antonios Motakis

On Freitag, 28. Juni 2019 16:56:15 CEST Christian Schoenebeck via Qemu-devel 
> > > + */
> > > +#define EXP_GOLOMB_K    0
> > > +
> > > +# if !EXP_GOLOMB_K
> > > +
> > > +/** @brief Exponential Golomb algorithm limited to the case k=0.
> > > + *
> > 
> > This doesn't really help to have a special implementation for k=0. The
> > resulting function is nearly identical to the general case. It is likely
> > that the compiler can optimize it and generate the same code.
> 
> I don't think the compiler's optimizer would really drop all the
> instructions automatically of the generalized variant of that particular
> function. Does it matter in practice? No, I actually just provided that
> optimized variant for the special case k=0 because I think k=0 will be the
> default value for that parameter and because you were already picky about a
> simple
> 
> 	if (pdu->s->dev_id == 0)
> 
> to be optimized. In practice users won't feel the runtime difference either
> one of the two optimization scenarios.

I just checked my assmption made here with a simple C test unit:

	// Use manually optimized function for case k=0.
	VariLenAffix implK0(uint64_t n) {
	    return expGolombEncodeK0(n);
	}

	// Rely on compiler to optimize generalized function for k=0
	VariLenAffix implGenK(uint64_t n) {
	    return expGolombEncode(n, 0);
	}

And :   gcc -S -O3 -ffast-math k.c

Like expected the generated 2 functions are almost identical, except that the 
manually optimized variant saves the following 3 instructions:

	movl	$0, %eax
	...
	testl	%edx, %edx
	cmovns	%edx, %eax

But like I said, it is a tiny difference of course. Not really relevant in 
practice.

Best regards,
Christian Schoenebeck


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes"
  2019-06-29 10:20         ` Christian Schoenebeck via Qemu-devel
@ 2019-07-02  8:01           ` Greg Kurz
  0 siblings, 0 replies; 22+ messages in thread
From: Greg Kurz @ 2019-07-02  8:01 UTC (permalink / raw)
  To: Christian Schoenebeck
  Cc: Daniel P. Berrangé, qemu-devel, Antonios Motakis

On Sat, 29 Jun 2019 12:20:49 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Freitag, 28. Juni 2019 16:23:08 CEST Greg Kurz wrote:
> > > > This feature applies to all backends IIUC. We don't really care for the
> > > > synth backend since it generates non-colliding inode numbers by design,
> > > > but the proxy backend has the same issue as local. So...
> > > 
> > > Yeah, I was not sure about these, because I did not even know what these
> > > two were for exactly. :)  [ lazyness disclaimer end]
> > 
> > "proxy" is a backend where all I/O accesses are performed by a separate
> > process running the virtfs-proxy-helper command. It runs with root
> > privileges, which provides the same level of functionality as "local"
> > with security_model=passthrough. It also chroot() into the shared
> > folder for extra security. But it is slower since it all requests
> > still go through the virtio-9p device in QEMU. This would call
> > for a vhost-9p implementation, but it's yet another story.
> > 
> > "synth" is a software pseudo-backend, currently used to test 9pfs
> > with QTest (see tests/virtio-9p-test.c).
> 
> Thanks for the clarification!
> 
> So the proxy backend sounds like an idea that has not been implemented fully 
> to its end. I guess it is not really used in production environments, right? 

I don't have any feedback unfortunately... The biggest problem with proxy is
likely it's poor performance : every request needs to go through many hops.

guest->QEMU->proxy->QEMU->guest 

> What is the actual motivation for this proxy backend?
> 

The motivation is security: only the proxy helper runs with privileges (we
generally don't want to run QEMU as root), the helper can chroot() and thus
prevent the guest to access anything outside the shared folder.

> And now that I look at it, I am a bit surprised that there is this pure Unix 
> pipe socket based proxy variant, but no TCPIP network socket variant. I mean 

The Unix socket is required in order to pass open file descriptors between
QEMU and the proxy, using SCM_RIGHTS ancillary messages. There's no such
thing with TCPIP sockets.

> the latter is AFAIK the original idea behind the 9p protocol and IMO might be 
> interesting to physical separate pure storage backends that way.
> 

The right thing to do would be to have the "proxy" process to directly
access the rings of the virtio-9p device (ie, vhost), so that requests
only go through:

guest->proxy->guest

> Best regards,
> Christian Schoenebeck



^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2019-07-02  8:05 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-26 18:57 [Qemu-devel] [PATCH v4 0/5] 9p: Fix file ID collisions Christian Schoenebeck via Qemu-devel
2019-06-26 18:25 ` [Qemu-devel] [PATCH v4 1/5] 9p: unsigned type for type, version, path Christian Schoenebeck via Qemu-devel
2019-06-27 16:12   ` Greg Kurz
2019-06-28 11:42     ` Christian Schoenebeck via Qemu-devel
2019-06-28 12:06       ` Greg Kurz
2019-06-26 18:30 ` [Qemu-devel] [PATCH v4 2/5] 9p: Treat multiple devices on one export as an error Christian Schoenebeck via Qemu-devel
2019-06-27 17:26   ` Greg Kurz
2019-06-28 12:36     ` Christian Schoenebeck via Qemu-devel
2019-06-28 12:47       ` Greg Kurz
2019-06-26 18:42 ` [Qemu-devel] [PATCH v4 3/5] 9p: Added virtfs option "remap_inodes" Christian Schoenebeck via Qemu-devel
2019-06-28 10:09   ` Greg Kurz
2019-06-28 13:47     ` Christian Schoenebeck via Qemu-devel
2019-06-28 14:23       ` Greg Kurz
2019-06-29 10:20         ` Christian Schoenebeck via Qemu-devel
2019-07-02  8:01           ` Greg Kurz
2019-06-26 18:46 ` [Qemu-devel] [PATCH v4 4/5] 9p: stat_to_qid: implement slow path Christian Schoenebeck via Qemu-devel
2019-06-28 10:21   ` Greg Kurz
2019-06-28 14:03     ` Christian Schoenebeck via Qemu-devel
2019-06-26 18:52 ` [Qemu-devel] [PATCH v4 5/5] 9p: Use variable length suffixes for inode remapping Christian Schoenebeck via Qemu-devel
2019-06-28 11:50   ` Greg Kurz
2019-06-28 14:56     ` Christian Schoenebeck via Qemu-devel
2019-06-29 11:01       ` Christian Schoenebeck via Qemu-devel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).