* [PATCH v6 1/5] tests/virtio-9p: added split readdir tests
  2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
@ 2020-04-19 15:00 ` Christian Schoenebeck
  2020-04-19 15:00 ` [PATCH v6 2/5] 9pfs: make v9fs_readdir_response_size() public Christian Schoenebeck
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

The already existing 'basic' readdir test simply uses a 'count' parameter
large enough to retrieve all directory entries with a single Treaddir
request.

In the 3 new 'split' readdir tests added by this patch, directory
entries are retrieved split over several Treaddir requests, by picking
small 'count' parameters which force the server to truncate the
response. The test client therefore sends as many Treaddir requests as
necessary to get all directory entries.

The following 3 new tests are added (executed in this sequence):

1. Split readdir test with count=512
2. Split readdir test with count=256
3. Split readdir test with count=128

This test case sequence is chosen because the smaller the 'count' value,
the higher the chance of errors in case of implementation bugs on the
server side.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 tests/qtest/virtio-9p-test.c | 108 +++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/tests/qtest/virtio-9p-test.c b/tests/qtest/virtio-9p-test.c
index 2167322985..de30b717b6 100644
--- a/tests/qtest/virtio-9p-test.c
+++ b/tests/qtest/virtio-9p-test.c
@@ -578,6 +578,7 @@ static bool fs_dirents_contain_name(struct V9fsDirent *e, const char* name)
     return false;
 }
 
+/* basic readdir test where reply fits into a single response message */
 static void fs_readdir(void *obj, void *data, QGuestAllocator *t_alloc)
 {
     QVirtio9P *v9p = obj;
@@ -631,6 +632,89 @@ static void fs_readdir(void *obj, void *data, QGuestAllocator *t_alloc)
     g_free(wnames[0]);
 }
 
+/* readdir test where overall request is split over several messages */
+static void fs_readdir_split(void *obj, void *data, QGuestAllocator *t_alloc,
+                             uint32_t count)
+{
+    QVirtio9P *v9p = obj;
+    alloc = t_alloc;
+    char *const wnames[] = { g_strdup(QTEST_V9FS_SYNTH_READDIR_DIR) };
+    uint16_t nqid;
+    v9fs_qid qid;
+    uint32_t nentries, npartialentries;
+    struct V9fsDirent *entries, *tail, *partialentries;
+    P9Req *req;
+    int fid;
+    uint64_t offset;
+
+    fs_attach(v9p, NULL, t_alloc);
+
+    fid = 1;
+    offset = 0;
+    entries = NULL;
+    nentries = 0;
+    tail = NULL;
+
+    req = v9fs_twalk(v9p, 0, fid, 1, wnames, 0);
+    v9fs_req_wait_for_reply(req, NULL);
+    v9fs_rwalk(req, &nqid, NULL);
+    g_assert_cmpint(nqid, ==, 1);
+
+    req = v9fs_tlopen(v9p, fid, O_DIRECTORY, 0);
+    v9fs_req_wait_for_reply(req, NULL);
+    v9fs_rlopen(req, &qid, NULL);
+
+    /*
+     * send as many Treaddir requests as required to get all directory
+     * entries
+     */
+    while (true) {
+        npartialentries = 0;
+        partialentries = NULL;
+
+        req = v9fs_treaddir(v9p, fid, offset, count, 0);
+        v9fs_req_wait_for_reply(req, NULL);
+        v9fs_rreaddir(req, &count, &npartialentries, &partialentries);
+        if (npartialentries > 0 && partialentries) {
+            if (!entries) {
+                entries = partialentries;
+                nentries = npartialentries;
+                tail = partialentries;
+            } else {
+                tail->next = partialentries;
+                nentries += npartialentries;
+            }
+            while (tail->next) {
+                tail = tail->next;
+            }
+            offset = tail->offset;
+        } else {
+            break;
+        }
+    }
+
+    g_assert_cmpint(
+        nentries, ==,
+        QTEST_V9FS_SYNTH_READDIR_NFILES + 2 /* "." and ".." */
+    );
+
+    /*
+     * Check all file names exist in returned entries, ignore their order
+     * though.
+     */
+    g_assert_cmpint(fs_dirents_contain_name(entries, "."), ==, true);
+    g_assert_cmpint(fs_dirents_contain_name(entries, ".."), ==, true);
+    for (int i = 0; i < QTEST_V9FS_SYNTH_READDIR_NFILES; ++i) {
+        char *name = g_strdup_printf(QTEST_V9FS_SYNTH_READDIR_FILE, i);
+        g_assert_cmpint(fs_dirents_contain_name(entries, name), ==, true);
+        g_free(name);
+    }
+
+    v9fs_free_dirents(entries);
+
+    g_free(wnames[0]);
+}
+
 static void fs_walk_no_slash(void *obj, void *data, QGuestAllocator *t_alloc)
 {
     QVirtio9P *v9p = obj;
@@ -793,6 +877,24 @@ static void fs_flush_ignored(void *obj, void *data, QGuestAllocator *t_alloc)
     g_free(wnames[0]);
 }
 
+static void fs_readdir_split_128(void *obj, void *data,
+                                 QGuestAllocator *t_alloc)
+{
+    fs_readdir_split(obj, data, t_alloc, 128);
+}
+
+static void fs_readdir_split_256(void *obj, void *data,
+                                 QGuestAllocator *t_alloc)
+{
+    fs_readdir_split(obj, data, t_alloc, 256);
+}
+
+static void fs_readdir_split_512(void *obj, void *data,
+                                 QGuestAllocator *t_alloc)
+{
+    fs_readdir_split(obj, data, t_alloc, 512);
+}
+
 static void register_virtio_9p_test(void)
 {
     qos_add_test("config", "virtio-9p", pci_config, NULL);
@@ -810,6 +912,12 @@ static void register_virtio_9p_test(void)
     qos_add_test("fs/flush/ignored", "virtio-9p", fs_flush_ignored,
                  NULL);
     qos_add_test("fs/readdir/basic", "virtio-9p", fs_readdir, NULL);
+    qos_add_test("fs/readdir/split_512", "virtio-9p",
+                 fs_readdir_split_512, NULL);
+    qos_add_test("fs/readdir/split_256", "virtio-9p",
+                 fs_readdir_split_256, NULL);
+    qos_add_test("fs/readdir/split_128", "virtio-9p",
+                 fs_readdir_split_128, NULL);
 }
 
 libqos_init(register_virtio_9p_test);
-- 
2.20.1


* [PATCH v6 2/5] 9pfs: make v9fs_readdir_response_size() public
  2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
  2020-04-19 15:00 ` [PATCH v6 1/5] tests/virtio-9p: added split readdir tests Christian Schoenebeck
@ 2020-04-19 15:00 ` Christian Schoenebeck
  2020-04-19 15:02 ` [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many() Christian Schoenebeck
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

Rename function v9fs_readdir_data_size() -> v9fs_readdir_response_size()
and make it callable from other units. So far this function is only used
by 9p.c; however, subsequent patches require it to be callable from
another 9pfs unit. And while we are at it, also clarify what this
function is used for.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 10 ++++++++--
 hw/9pfs/9p.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 9e046f7acb..43584aca41 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -2322,7 +2322,13 @@ out_nofid:
     pdu_complete(pdu, err);
 }
 
-static size_t v9fs_readdir_data_size(V9fsString *name)
+/**
+ * Returns size required in Rreaddir response for the passed dirent @p name.
+ *
+ * @param name - directory entry's name (i.e. file name, directory name)
+ * @returns required size in bytes
+ */
+size_t v9fs_readdir_response_size(V9fsString *name)
 {
     /*
      * Size of each dirent on the wire: size of qid (13) + size of offset (8)
@@ -2357,7 +2363,7 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
         }
         v9fs_string_init(&name);
         v9fs_string_sprintf(&name, "%s", dent->d_name);
-        if ((count + v9fs_readdir_data_size(&name)) > max_count) {
+        if ((count + v9fs_readdir_response_size(&name)) > max_count) {
             v9fs_readdir_unlock(&fidp->fs.dir);
 
             /* Ran out of buffer. Set dir back to old position and return */
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index b8f72a3bd9..9553700dbb 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -419,6 +419,7 @@ void v9fs_path_init(V9fsPath *path);
 void v9fs_path_free(V9fsPath *path);
 void v9fs_path_sprintf(V9fsPath *path, const char *fmt, ...);
 void v9fs_path_copy(V9fsPath *dst, const V9fsPath *src);
+size_t v9fs_readdir_response_size(V9fsString *name);
 int v9fs_name_to_path(V9fsState *s, V9fsPath *dirpath,
                       const char *name, V9fsPath *path);
 int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
-- 
2.20.1


* [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
  2020-04-19 15:00 ` [PATCH v6 1/5] tests/virtio-9p: added split readdir tests Christian Schoenebeck
  2020-04-19 15:00 ` [PATCH v6 2/5] 9pfs: make v9fs_readdir_response_size() public Christian Schoenebeck
@ 2020-04-19 15:02 ` Christian Schoenebeck
  2020-04-30 11:42   ` Greg Kurz
  2020-04-19 15:06 ` [PATCH v6 4/5] 9pfs: T_readdir latency optimization Christian Schoenebeck
  2020-04-19 15:07 ` [PATCH v6 5/5] 9pfs: clarify latency of v9fs_co_run_in_worker() Christian Schoenebeck
  4 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

The newly added function v9fs_co_readdir_many() retrieves multiple
directory entries with a single fs driver request. It is intended to
replace uses of v9fs_co_readdir(), which only retrieves a single
directory entry per fs driver request.

The reason for this planned replacement is that for every fs driver
request the coroutine is dispatched from the main I/O thread to a
background I/O thread and eventually dispatched back to the main I/O
thread. Hopping between threads adds latency. So if a 9pfs Treaddir
request reads a large number of directory entries, this currently sums
up to huge latencies of several hundred ms or even more. Using
v9fs_co_readdir_many() instead of v9fs_co_readdir() will therefore
provide significant performance improvements.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.h    |  22 ++++++
 hw/9pfs/codir.c | 181 +++++++++++++++++++++++++++++++++++++++++++++---
 hw/9pfs/coth.h  |   3 +
 3 files changed, 195 insertions(+), 11 deletions(-)

diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 9553700dbb..116977939b 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -215,6 +215,28 @@ static inline void v9fs_readdir_init(V9fsDir *dir)
     qemu_mutex_init(&dir->readdir_mutex);
 }
 
+/**
+ * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
+ * which is a chained list of directory entries.
+ */
+typedef struct V9fsDirEnt {
+    /* mandatory (must not be NULL) information for all readdir requests */
+    struct dirent *dent;
+    /*
+     * optional (may be NULL): A full stat of each directory entry is just
+     * done if explicitly told to fs driver.
+     */
+    struct stat *st;
+    /*
+     * instead of an array, directory entries are always returned as
+     * chained list, that's because the amount of entries retrieved by fs
+     * drivers is dependent on the individual entries' name (since response
+     * messages are size limited), so the final amount cannot be estimated
+     * before hand
+     */
+    struct V9fsDirEnt *next;
+} V9fsDirEnt;
+
 /*
  * Filled by fs driver on open and other
  * calls.
diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
index 73f9a751e1..45c65a8f5b 100644
--- a/hw/9pfs/codir.c
+++ b/hw/9pfs/codir.c
@@ -18,28 +18,187 @@
 #include "qemu/main-loop.h"
 #include "coth.h"
 
+/*
+ * This is solely executed on a background IO thread.
+ */
+static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
+{
+    int err = 0;
+    V9fsState *s = pdu->s;
+    struct dirent *entry;
+
+    errno = 0;
+    entry = s->ops->readdir(&s->ctx, &fidp->fs);
+    if (!entry && errno) {
+        *dent = NULL;
+        err = -errno;
+    } else {
+        *dent = entry;
+    }
+    return err;
+}
+
+/*
+ * TODO: This will be removed for performance reasons.
+ * Use v9fs_co_readdir_many() instead.
+ */
 int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
                                  struct dirent **dent)
 {
     int err;
-    V9fsState *s = pdu->s;
 
     if (v9fs_request_cancelled(pdu)) {
         return -EINTR;
     }
-    v9fs_co_run_in_worker(
-        {
-            struct dirent *entry;
+    v9fs_co_run_in_worker({
+        err = do_readdir(pdu, fidp, dent);
+    });
+    return err;
+}
+
+/*
+ * This is solely executed on a background IO thread.
+ *
+ * See v9fs_co_readdir_many() (as its only user) below for details.
+ */
+static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
+                             struct V9fsDirEnt **entries,
+                             int32_t maxsize, bool dostat)
+{
+    V9fsState *s = pdu->s;
+    V9fsString name;
+    int len, err = 0;
+    int32_t size = 0;
+    off_t saved_dir_pos;
+    struct dirent *dent;
+    struct V9fsDirEnt *e = NULL;
+    V9fsPath path;
+    struct stat stbuf;
 
-            errno = 0;
-            entry = s->ops->readdir(&s->ctx, &fidp->fs);
-            if (!entry && errno) {
+    *entries = NULL;
+    v9fs_path_init(&path);
+
+    /*
+     * TODO: Here should be a warn_report_once() if lock failed.
+     *
+     * With a good 9p client we should not get into concurrency here,
+     * because a good client would not use the same fid for concurrent
+     * requests. We do the lock here for safety reasons though. However
+     * the client would then suffer performance issues, so better log that
+     * issue here.
+     */
+    v9fs_readdir_lock(&fidp->fs.dir);
+
+    /* save the directory position */
+    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
+    if (saved_dir_pos < 0) {
+        err = saved_dir_pos;
+        goto out;
+    }
+
+    while (true) {
+        /* get directory entry from fs driver */
+        err = do_readdir(pdu, fidp, &dent);
+        if (err || !dent) {
+            break;
+        }
+
+        /*
+         * stop this loop as soon as it would exceed the allowed maximum
+         * response message size for the directory entries collected so far,
+         * because anything beyond that size would need to be discarded by
+         * 9p controller (main thread / top half) anyway
+         */
+        v9fs_string_init(&name);
+        v9fs_string_sprintf(&name, "%s", dent->d_name);
+        len = v9fs_readdir_response_size(&name);
+        v9fs_string_free(&name);
+        if (size + len > maxsize) {
+            /* this is not an error case actually */
+            break;
+        }
+
+        /* append next node to result chain */
+        if (!e) {
+            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
+        } else {
+            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
+        }
+        e->dent = g_malloc0(sizeof(struct dirent));
+        memcpy(e->dent, dent, sizeof(struct dirent));
+
+        /* perform a full stat() for directory entry if requested by caller */
+        if (dostat) {
+            err = s->ops->name_to_path(
+                &s->ctx, &fidp->path, dent->d_name, &path
+            );
+            if (err < 0) {
                 err = -errno;
-            } else {
-                *dent = entry;
-                err = 0;
+                break;
             }
-        });
+
+            err = s->ops->lstat(&s->ctx, &path, &stbuf);
+            if (err < 0) {
+                err = -errno;
+                break;
+            }
+
+            e->st = g_malloc0(sizeof(struct stat));
+            memcpy(e->st, &stbuf, sizeof(struct stat));
+        }
+
+        size += len;
+        saved_dir_pos = dent->d_off;
+    }
+
+    /* restore (last) saved position */
+    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
+
+out:
+    v9fs_readdir_unlock(&fidp->fs.dir);
+    v9fs_path_free(&path);
+    if (err < 0) {
+        return err;
+    }
+    return size;
+}
+
+/**
+ * @brief Reads multiple directory entries in one rush.
+ *
+ * Retrieves the requested (max. amount of) directory entries from the fs
+ * driver. This function must only be called by the main IO thread (top half).
+ * Internally this function call will be dispatched to a background IO thread
+ * (bottom half) where it is eventually executed by the fs driver.
+ *
+ * Acquiring multiple directory entries in one rush from the fs driver,
+ * instead of retrieving each directory entry individually, is very beneficial
+ * from performance point of view. Because for every fs driver request latency
+ * is added, which in practice could lead to overall latencies of several
+ * hundred ms for reading all entries (of just a single directory) if every
+ * directory entry was individually requested from driver.
+ *
+ * @param pdu - the causing 9p (T_readdir) client request
+ * @param fidp - already opened directory where readdir shall be performed on
+ * @param entries - output for directory entries (must not be NULL)
+ * @param maxsize - maximum result message body size (in bytes)
+ * @param dostat - whether a stat() should be performed and returned for
+ *                 each directory entry
+ * @returns resulting response message body size (in bytes) on success,
+ *          negative error code otherwise
+ */
+int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
+                                      struct V9fsDirEnt **entries,
+                                      int32_t maxsize, bool dostat)
+{
+    int err = 0;
+
+    if (v9fs_request_cancelled(pdu)) {
+        return -EINTR;
+    }
+    v9fs_co_run_in_worker({
+        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
+    });
     return err;
 }
 
diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index c2cdc7a9ea..a6851822d5 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -49,6 +49,9 @@
 void co_run_in_worker_bh(void *);
 int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
 int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
+int coroutine_fn v9fs_co_readdir_many(V9fsPDU *, V9fsFidState *,
+                                      struct V9fsDirEnt **,
+                                      int32_t, bool);
 off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
 void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
 void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);
-- 
2.20.1


* [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
                   ` (2 preceding siblings ...)
  2020-04-19 15:02 ` [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many() Christian Schoenebeck
@ 2020-04-19 15:06 ` Christian Schoenebeck
  2020-06-03 17:16   ` Christian Schoenebeck
  2020-04-19 15:07 ` [PATCH v6 5/5] 9pfs: clarify latency of v9fs_co_run_in_worker() Christian Schoenebeck
  4 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

Make top half really top half and bottom half really bottom half:

The handling of each T_readdir request currently hops between threads
(main I/O thread and background I/O driver threads) several times for
every individual directory entry, which adds up to huge latencies for
handling just a single T_readdir request.

Instead of doing that, now collect all required directory entries
(including all potentially required stat buffers for each entry) in
one rush on a background I/O thread from the fs driver by calling the
previously added function v9fs_co_readdir_many() instead of
v9fs_co_readdir(), then assemble the entire resulting network
response message for the readdir request on the main I/O thread. The
fs driver still aborts the directory entry retrieval loop (on the
background I/O thread inside of v9fs_co_readdir_many()) as soon as it
would exceed the client's requested maximum R_readdir response size,
so this will not introduce a performance penalty on the other end.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/9p.c | 122 +++++++++++++++++++++++----------------------------
 1 file changed, 55 insertions(+), 67 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 43584aca41..8283a3cfbb 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -971,30 +971,6 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
     return 0;
 }
 
-static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
-                                      struct dirent *dent, V9fsQID *qidp)
-{
-    struct stat stbuf;
-    V9fsPath path;
-    int err;
-
-    v9fs_path_init(&path);
-
-    err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path);
-    if (err < 0) {
-        goto out;
-    }
-    err = v9fs_co_lstat(pdu, &path, &stbuf);
-    if (err < 0) {
-        goto out;
-    }
-    err = stat_to_qid(pdu, &stbuf, qidp);
-
-out:
-    v9fs_path_free(&path);
-    return err;
-}
-
 V9fsPDU *pdu_alloc(V9fsState *s)
 {
     V9fsPDU *pdu = NULL;
@@ -2337,6 +2313,18 @@ size_t v9fs_readdir_response_size(V9fsString *name)
     return 24 + v9fs_string_size(name);
 }
 
+static void v9fs_free_dirents(struct V9fsDirEnt *e)
+{
+    struct V9fsDirEnt *next = NULL;
+
+    for (; e; e = next) {
+        next = e->next;
+        g_free(e->dent);
+        g_free(e->st);
+        g_free(e);
+    }
+}
+
 static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
                                         int32_t max_count)
 {
@@ -2345,54 +2333,53 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
     V9fsString name;
     int len, err = 0;
     int32_t count = 0;
-    off_t saved_dir_pos;
     struct dirent *dent;
+    struct stat *st;
+    struct V9fsDirEnt *entries = NULL;
 
-    /* save the directory position */
-    saved_dir_pos = v9fs_co_telldir(pdu, fidp);
-    if (saved_dir_pos < 0) {
-        return saved_dir_pos;
-    }
-
-    while (1) {
-        v9fs_readdir_lock(&fidp->fs.dir);
+    /*
+     * inode remapping requires the device id, which in turn might be
+     * different for different directory entries, so if inode remapping is
+     * enabled we have to make a full stat for each directory entry
+     */
+    const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
 
-        err = v9fs_co_readdir(pdu, fidp, &dent);
-        if (err || !dent) {
-            break;
-        }
-        v9fs_string_init(&name);
-        v9fs_string_sprintf(&name, "%s", dent->d_name);
-        if ((count + v9fs_readdir_response_size(&name)) > max_count) {
-            v9fs_readdir_unlock(&fidp->fs.dir);
+    /*
+     * Fetch all required directory entries altogether on a background IO
+     * thread from fs driver. We don't want to do that for each entry
+     * individually, because hopping between threads (this main IO thread
+     * and background IO driver thread) would sum up to huge latencies.
+     */
+    count = v9fs_co_readdir_many(pdu, fidp, &entries, max_count, dostat);
+    if (count < 0) {
+        err = count;
+        count = 0;
+        goto out;
+    }
+    count = 0;
 
-            /* Ran out of buffer. Set dir back to old position and return */
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return count;
-        }
+    for (struct V9fsDirEnt *e = entries; e; e = e->next) {
+        dent = e->dent;
 
         if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
-            /*
-             * dirent_to_qid() implies expensive stat call for each entry,
-             * we must do that here though since inode remapping requires
-             * the device id, which in turn might be different for
-             * different entries; we cannot make any assumption to avoid
-             * that here.
-             */
-            err = dirent_to_qid(pdu, fidp, dent, &qid);
+            st = e->st;
+            /* e->st should never be NULL, but just to be sure */
+            if (!st) {
+                err = -1;
+                break;
+            }
+
+            /* remap inode */
+            err = stat_to_qid(pdu, st, &qid);
             if (err < 0) {
-                v9fs_readdir_unlock(&fidp->fs.dir);
-                v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-                v9fs_string_free(&name);
-                return err;
+                break;
             }
         } else {
             /*
              * Fill up just the path field of qid because the client uses
              * only that. To fill the entire qid structure we will have
              * to stat each dirent found, which is expensive. For the
-             * latter reason we don't call dirent_to_qid() here. Only drawback
+             * latter reason we don't call stat_to_qid() here. Only drawback
              * is that no multi-device export detection of stat_to_qid()
              * would be done and provided as error to the user here. But
              * user would get that error anyway when accessing those
@@ -2405,25 +2392,26 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
             qid.version = 0;
         }
 
+        v9fs_string_init(&name);
+        v9fs_string_sprintf(&name, "%s", dent->d_name);
+
         /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
         len = pdu_marshal(pdu, 11 + count, "Qqbs",
                           &qid, dent->d_off,
                           dent->d_type, &name);
 
-        v9fs_readdir_unlock(&fidp->fs.dir);
+        v9fs_string_free(&name);
 
         if (len < 0) {
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return len;
+            err = len;
+            break;
         }
+
         count += len;
-        v9fs_string_free(&name);
-        saved_dir_pos = dent->d_off;
     }
 
-    v9fs_readdir_unlock(&fidp->fs.dir);
-
+out:
+    v9fs_free_dirents(entries);
     if (err < 0) {
         return err;
     }
-- 
2.20.1


* [PATCH v6 5/5] 9pfs: clarify latency of v9fs_co_run_in_worker()
  2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
                   ` (3 preceding siblings ...)
  2020-04-19 15:06 ` [PATCH v6 4/5] 9pfs: T_readdir latency optimization Christian Schoenebeck
@ 2020-04-19 15:07 ` Christian Schoenebeck
  4 siblings, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:07 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

As we just fixed a severe performance issue with Treaddir request
handling, clarify this overall issue as a comment on
v9fs_co_run_in_worker() with the intention of hopefully preventing
such performance mistakes in the future (and of fixing other still
outstanding ones).

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/coth.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
index a6851822d5..8b6f76840a 100644
--- a/hw/9pfs/coth.h
+++ b/hw/9pfs/coth.h
@@ -19,7 +19,7 @@
 #include "qemu/coroutine.h"
 #include "9p.h"
 
-/*
+/**
  * we want to use bottom half because we want to make sure the below
  * sequence of events.
  *
@@ -28,6 +28,16 @@
  *   3. Enter the coroutine in the worker thread.
  * we cannot swap step 1 and 2, because that would imply worker thread
  * can enter coroutine while step1 is still running
+ *
+ * @b PERFORMANCE @b CONSIDERATIONS: As a rule of thumb, keep in mind
+ * that hopping between threads adds @b latency! So when handling a
+ * 9pfs request, avoid calling v9fs_co_run_in_worker() too often, because
+ * this might otherwise sum up to a significant, huge overall latency for
+ * providing the response for just a single request. For that reason it
+ * is highly recommended to fetch all data from fs driver with a single
+ * fs driver request on a background I/O thread (bottom half) in one rush
+ * first and then eventually assembling the final response from that data
+ * on main I/O thread (top half).
  */
 #define v9fs_co_run_in_worker(code_block)                               \
     do {                                                                \
-- 
2.20.1


* [PATCH v6 0/5] 9pfs: readdir optimization
@ 2020-04-19 15:10 Christian Schoenebeck
  2020-04-19 15:00 ` [PATCH v6 1/5] tests/virtio-9p: added split readdir tests Christian Schoenebeck
                   ` (4 more replies)
  0 siblings, 5 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-19 15:10 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

As previously mentioned, I was investigating performance issues with 9pfs.
Raw file read/write performance of 9pfs is actually quite good, provided
that the client picked a reasonably high msize (maximum message size). I
would recommend logging a warning on the 9p server side if a client
attaches with a small msize that would cause performance issues for that
reason.
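
Just to illustrate what I mean by that (a rough sketch only; the helper
name and the 4 KiB threshold are made up for this example and are not
part of this series):

/*
 * Sketch: warn once if a client negotiated a suspiciously small msize.
 * Would be called from the Tversion handler after unmarshalling msize.
 * Needs "qemu/error-report.h" for warn_report_once().
 */
static void v9fs_warn_small_msize(uint32_t msize)
{
    if (msize < 4096) {
        warn_report_once("9p: client negotiated small msize (%u bytes), "
                         "expect degraded I/O performance", msize);
    }
}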

However there are other aspects where 9pfs currently performs
suboptimally. Especially readdir handling of 9pfs is extremely slow: a
simple readdir request of a guest typically blocks for several hundred
milliseconds or even several seconds, no matter how powerful the
underlying hardware is. The reason for this performance issue: latency.
Currently 9pfs dispatches a T_readdir request numerous times back and
forth between the main I/O thread and a background I/O thread; in fact
it hops between threads multiple times for every single directory entry
during T_readdir request handling, which in total leads to huge
latencies for a single T_readdir request.

This patch series aims to address this severe performance issue of 9pfs
T_readdir request handling. The actual performance optimization is patch 4.

v5->v6:

  * Rebased to tag: v5.0.0-rc3 (SHA-1 20038cd7).

  * Dropped patch 2 ("9pfs readdir: rename max_count -> maxsize").

Message-ID of previous version (v5):
  cover.1585258105.git.qemu_oss@crudebyte.com

Message-ID of version with performance benchmark (v4):
  cover.1579567019.git.qemu_oss@crudebyte.com

Christian Schoenebeck (5):
  tests/virtio-9p: added split readdir tests
  9pfs: make v9fs_readdir_response_size() public
  9pfs: add new function v9fs_co_readdir_many()
  9pfs: T_readdir latency optimization
  9pfs: clarify latency of v9fs_co_run_in_worker()

 hw/9pfs/9p.c                 | 130 ++++++++++++-------------
 hw/9pfs/9p.h                 |  23 +++++
 hw/9pfs/codir.c              | 181 ++++++++++++++++++++++++++++++++---
 hw/9pfs/coth.h               |  15 ++-
 tests/qtest/virtio-9p-test.c | 108 +++++++++++++++++++++
 5 files changed, 377 insertions(+), 80 deletions(-)

-- 
2.20.1


* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-04-19 15:02 ` [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many() Christian Schoenebeck
@ 2020-04-30 11:42   ` Greg Kurz
  2020-04-30 12:50     ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-04-30 11:42 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Sun, 19 Apr 2020 17:02:27 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> The newly added function v9fs_co_readdir_many() retrieves multiple
> directory entries with a single fs driver request. It is intended to
> replace uses of v9fs_co_readdir(), the latter only retrives a single
> directory entry per fs driver request instead.
> 
> The reason for this planned replacement is that for every fs driver
> request the coroutine is dispatched from main I/O thread to a
> background I/O thread and eventually dispatched back to main I/O
> thread. Hopping between threads adds latency. So if a 9pfs Treaddir
> request reads a large amount of directory entries, this currently
> sums up to huge latencies of several hundred ms or even more. So
> using v9fs_co_readdir_many() instead of v9fs_co_readdir() will
> provide significant performance improvements.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.h    |  22 ++++++
>  hw/9pfs/codir.c | 181 +++++++++++++++++++++++++++++++++++++++++++++---
>  hw/9pfs/coth.h  |   3 +
>  3 files changed, 195 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
> index 9553700dbb..116977939b 100644
> --- a/hw/9pfs/9p.h
> +++ b/hw/9pfs/9p.h
> @@ -215,6 +215,28 @@ static inline void v9fs_readdir_init(V9fsDir *dir)
>      qemu_mutex_init(&dir->readdir_mutex);
>  }
>  
> +/**
> + * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests,
> + * which is a chained list of directory entries.
> + */
> +typedef struct V9fsDirEnt {
> +    /* mandatory (must not be NULL) information for all readdir requests */
> +    struct dirent *dent;
> +    /*
> +     * optional (may be NULL): A full stat of each directory entry is just
> +     * done if explicitly told to fs driver.
> +     */
> +    struct stat *st;
> +    /*
> +     * instead of an array, directory entries are always returned as
> +     * chained list, that's because the amount of entries retrieved by fs
> +     * drivers is dependent on the individual entries' name (since response
> +     * messages are size limited), so the final amount cannot be estimated
> +     * before hand
> +     */
> +    struct V9fsDirEnt *next;
> +} V9fsDirEnt;
> +
>  /*
>   * Filled by fs driver on open and other
>   * calls.
> diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c
> index 73f9a751e1..45c65a8f5b 100644
> --- a/hw/9pfs/codir.c
> +++ b/hw/9pfs/codir.c
> @@ -18,28 +18,187 @@
>  #include "qemu/main-loop.h"
>  #include "coth.h"
>  
> +/*
> + * This is solely executed on a background IO thread.
> + */
> +static int do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, struct dirent **dent)
> +{
> +    int err = 0;
> +    V9fsState *s = pdu->s;
> +    struct dirent *entry;
> +
> +    errno = 0;
> +    entry = s->ops->readdir(&s->ctx, &fidp->fs);
> +    if (!entry && errno) {
> +        *dent = NULL;
> +        err = -errno;
> +    } else {
> +        *dent = entry;
> +    }
> +    return err;
> +}
> +
> +/*
> + * TODO: This will be removed for performance reasons.
> + * Use v9fs_co_readdir_many() instead.
> + */
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
>                                   struct dirent **dent)
>  {
>      int err;
> -    V9fsState *s = pdu->s;
>  
>      if (v9fs_request_cancelled(pdu)) {
>          return -EINTR;
>      }
> -    v9fs_co_run_in_worker(
> -        {
> -            struct dirent *entry;
> +    v9fs_co_run_in_worker({
> +        err = do_readdir(pdu, fidp, dent);
> +    });
> +    return err;
> +}
> +
> +/*
> + * This is solely executed on a background IO thread.
> + *
> + * See v9fs_co_readdir_many() (as its only user) below for details.
> + */
> +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> +                             struct V9fsDirEnt **entries,
> +                             int32_t maxsize, bool dostat)
> +{
> +    V9fsState *s = pdu->s;
> +    V9fsString name;
> +    int len, err = 0;
> +    int32_t size = 0;
> +    off_t saved_dir_pos;
> +    struct dirent *dent;
> +    struct V9fsDirEnt *e = NULL;
> +    V9fsPath path;
> +    struct stat stbuf;
>  
> -            errno = 0;
> -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> -            if (!entry && errno) {
> +    *entries = NULL;
> +    v9fs_path_init(&path);
> +
> +    /*
> +     * TODO: Here should be a warn_report_once() if lock failed.
> +     *
> +     * With a good 9p client we should not get into concurrency here,
> +     * because a good client would not use the same fid for concurrent
> +     * requests. We do the lock here for safety reasons though. However
> +     * the client would then suffer performance issues, so better log that
> +     * issue here.
> +     */
> +    v9fs_readdir_lock(&fidp->fs.dir);

I agree that a client that issues concurrent readdir requests on the
same fid is probably asking for trouble, but this is permitted by the
spec. Whether we should detect such conditions and warn or even fail
is a discussion for another thread.

The locking is only needed to avoid concurrent accesses to the dirent
structure returned by readdir(), otherwise we could return partially
overwritten file names to the client. It must be done for each individual
call to readdir(), but certainly not held across multiple calls.
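
To illustrate the direction (just a sketch under the assumption that the
copy of the returned dirent also happens under the lock; this is not the
patch I'm working on, and the helper name is made up):

/*
 * Sketch: take the lock around one individual readdir() call and copy
 * the returned dirent while still holding it, so a concurrent readdir()
 * on the same directory stream cannot overwrite the name in between.
 * Returns 1 if an entry was copied, 0 on end of directory, -errno on error.
 */
static int do_readdir_one(V9fsPDU *pdu, V9fsFidState *fidp,
                          struct dirent *dent_out)
{
    int ret;
    V9fsState *s = pdu->s;
    struct dirent *entry;

    v9fs_readdir_lock(&fidp->fs.dir);
    errno = 0;
    entry = s->ops->readdir(&s->ctx, &fidp->fs);
    if (!entry) {
        ret = errno ? -errno : 0;
    } else {
        *dent_out = *entry;   /* copy while still holding the lock */
        ret = 1;
    }
    v9fs_readdir_unlock(&fidp->fs.dir);
    return ret;
}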

As discussed privately, I'm working on a patch to better address the
locking and I'd really prefer to merge this before your series. Sorry
for the delay again. I'll try to post ASAP.

Anyway, I have some more remarks.

> +
> +    /* save the directory position */
> +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> +    if (saved_dir_pos < 0) {
> +        err = saved_dir_pos;
> +        goto out;
> +    }
> +
> +    while (true) {
> +        /* get directory entry from fs driver */
> +        err = do_readdir(pdu, fidp, &dent);
> +        if (err || !dent) {
> +            break;
> +        }
> +
> +        /*
> +         * stop this loop as soon as it would exceed the allowed maximum
> +         * response message size for the directory entries collected so far,
> +         * because anything beyond that size would need to be discarded by
> +         * 9p controller (main thread / top half) anyway
> +         */
> +        v9fs_string_init(&name);
> +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> +        len = v9fs_readdir_response_size(&name);
> +        v9fs_string_free(&name);
> +        if (size + len > maxsize) {
> +            /* this is not an error case actually */
> +            break;
> +        }
> +
> +        /* append next node to result chain */
> +        if (!e) {
> +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> +        } else {
> +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> +        }
> +        e->dent = g_malloc0(sizeof(struct dirent));

So we're allocating a bunch of stuff here...

> +        memcpy(e->dent, dent, sizeof(struct dirent));
> +
> +        /* perform a full stat() for directory entry if requested by caller */
> +        if (dostat) {
> +            err = s->ops->name_to_path(
> +                &s->ctx, &fidp->path, dent->d_name, &path
> +            );
> +            if (err < 0) {
>                  err = -errno;
> -            } else {
> -                *dent = entry;
> -                err = 0;
> +                break;

... but we're erroring out there and it seems that we're leaking
all the entries that have been allocated so far.

Also I have the impression that the whole if (dostat) { } block could
be done before chaining a new entry.

So, I think I'll just rebase your series to accommodate the locking
fix I've mentioned earlier, re-post the whole thing and let you
add the missing rollback. Sounds like a plan?

>              }
> -        });
> +
> +            err = s->ops->lstat(&s->ctx, &path, &stbuf);
> +            if (err < 0) {
> +                err = -errno;
> +                break;
> +            }
> +
> +            e->st = g_malloc0(sizeof(struct stat));
> +            memcpy(e->st, &stbuf, sizeof(struct stat));
> +        }
> +
> +        size += len;
> +        saved_dir_pos = dent->d_off;
> +    }
> +
> +    /* restore (last) saved position */
> +    s->ops->seekdir(&s->ctx, &fidp->fs, saved_dir_pos);
> +
> +out:
> +    v9fs_readdir_unlock(&fidp->fs.dir);
> +    v9fs_path_free(&path);
> +    if (err < 0) {
> +        return err;
> +    }
> +    return size;
> +}
> +
> +/**
> + * @brief Reads multiple directory entries in one rush.
> + *
> + * Retrieves the requested (max. amount of) directory entries from the fs
> + * driver. This function must only be called by the main IO thread (top half).
> + * Internally this function call will be dispatched to a background IO thread
> + * (bottom half) where it is eventually executed by the fs driver.
> + *
> + * Acquiring multiple directory entries in one rush from the fs driver,
> + * instead of retrieving each directory entry individually, is very beneficial
> + * from performance point of view. Because for every fs driver request latency
> + * is added, which in practice could lead to overall latencies of several
> + * hundred ms for reading all entries (of just a single directory) if every
> + * directory entry was individually requested from driver.
> + *
> + * @param pdu - the causing 9p (T_readdir) client request
> + * @param fidp - already opened directory where readdir shall be performed on
> + * @param entries - output for directory entries (must not be NULL)
> + * @param maxsize - maximum result message body size (in bytes)
> + * @param dostat - whether a stat() should be performed and returned for
> + *                 each directory entry
> + * @returns resulting response message body size (in bytes) on success,
> + *          negative error code otherwise
> + */
> +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> +                                      struct V9fsDirEnt **entries,
> +                                      int32_t maxsize, bool dostat)
> +{
> +    int err = 0;
> +
> +    if (v9fs_request_cancelled(pdu)) {
> +        return -EINTR;
> +    }
> +    v9fs_co_run_in_worker({
> +        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> +    });
>      return err;
>  }
>  
> diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h
> index c2cdc7a9ea..a6851822d5 100644
> --- a/hw/9pfs/coth.h
> +++ b/hw/9pfs/coth.h
> @@ -49,6 +49,9 @@
>  void co_run_in_worker_bh(void *);
>  int coroutine_fn v9fs_co_readlink(V9fsPDU *, V9fsPath *, V9fsString *);
>  int coroutine_fn v9fs_co_readdir(V9fsPDU *, V9fsFidState *, struct dirent **);
> +int coroutine_fn v9fs_co_readdir_many(V9fsPDU *, V9fsFidState *,
> +                                      struct V9fsDirEnt **,
> +                                      int32_t, bool);
>  off_t coroutine_fn v9fs_co_telldir(V9fsPDU *, V9fsFidState *);
>  void coroutine_fn v9fs_co_seekdir(V9fsPDU *, V9fsFidState *, off_t);
>  void coroutine_fn v9fs_co_rewinddir(V9fsPDU *, V9fsFidState *);


* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-04-30 11:42   ` Greg Kurz
@ 2020-04-30 12:50     ` Christian Schoenebeck
  2020-04-30 13:30       ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-04-30 12:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Donnerstag, 30. April 2020 13:42:35 CEST Greg Kurz wrote:
> > +/*
> > + * This is solely executed on a background IO thread.
> > + *
> > + * See v9fs_co_readdir_many() (as its only user) below for details.
> > + */
> > +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > +                             struct V9fsDirEnt **entries,
> > +                             int32_t maxsize, bool dostat)
> > +{
> > +    V9fsState *s = pdu->s;
> > +    V9fsString name;
> > +    int len, err = 0;
> > +    int32_t size = 0;
> > +    off_t saved_dir_pos;
> > +    struct dirent *dent;
> > +    struct V9fsDirEnt *e = NULL;
> > +    V9fsPath path;
> > +    struct stat stbuf;
> > 
> > -            errno = 0;
> > -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > -            if (!entry && errno) {
> > +    *entries = NULL;
> > +    v9fs_path_init(&path);
> > +
> > +    /*
> > +     * TODO: Here should be a warn_report_once() if lock failed.
> > +     *
> > +     * With a good 9p client we should not get into concurrency here,
> > +     * because a good client would not use the same fid for concurrent
> > +     * requests. We do the lock here for safety reasons though. However
> > +     * the client would then suffer performance issues, so better log
> > that
> > +     * issue here.
> > +     */
> > +    v9fs_readdir_lock(&fidp->fs.dir);
> 
> I agree that a client that issues concurrent readdir requests on the
> same fid is probably asking for troubles, but this permitted by the
> spec. Whether we should detect such conditions and warn or even fail
> is discussion for another thread.
> 
> The locking is only needed to avoid concurrent accesses to the dirent
> structure returned by readdir(), otherwise we could return partially
> overwritten file names to the client. It must be done for each individual
> call to readdir(), but certainly not for multiple calls.

Yeah, that would resolve this issue more appropriately for 9p2000.L, since
Treaddir specifies an offset, but for 9p2000.u the result of a concurrent
read on a directory would still be undefined.

> As discussed privately, I'm working on a patch to better address the
> locking and I'd really prefer to merge this before your series. Sorry
> for the delay again. I'll try to post ASAP.
> 
> Anyway, I have some more remarks.
> 
> > +
> > +    /* save the directory position */
> > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > +    if (saved_dir_pos < 0) {
> > +        err = saved_dir_pos;
> > +        goto out;
> > +    }
> > +
> > +    while (true) {
> > +        /* get directory entry from fs driver */
> > +        err = do_readdir(pdu, fidp, &dent);
> > +        if (err || !dent) {
> > +            break;
> > +        }
> > +
> > +        /*
> > +         * stop this loop as soon as it would exceed the allowed maximum
> > +         * response message size for the directory entries collected so
> > far, +         * because anything beyond that size would need to be
> > discarded by +         * 9p controller (main thread / top half) anyway
> > +         */
> > +        v9fs_string_init(&name);
> > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > +        len = v9fs_readdir_response_size(&name);
> > +        v9fs_string_free(&name);
> > +        if (size + len > maxsize) {
> > +            /* this is not an error case actually */
> > +            break;
> > +        }
> > +
> > +        /* append next node to result chain */
> > +        if (!e) {
> > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > +        } else {
> > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > +        }
> > +        e->dent = g_malloc0(sizeof(struct dirent));
> 
> So we're allocating a bunch of stuff here...
> 
> > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > +
> > +        /* perform a full stat() for directory entry if requested by
> > caller */ +        if (dostat) {
> > +            err = s->ops->name_to_path(
> > +                &s->ctx, &fidp->path, dent->d_name, &path
> > +            );
> > +            if (err < 0) {
> > 
> >                  err = -errno;
> > 
> > -            } else {
> > -                *dent = entry;
> > -                err = 0;
> > +                break;
> 
> ... but we're erroring out there and it seems that we're leaking
> all the entries that have been allocated so far.

No, they are not leaking actually.

You are right that they are not deallocated in do_readdir_many(), but that's
intentional: in the new implementation of v9fs_do_readdir() you can see that
v9fs_free_dirents(entries) is *always* called at the very end of the function,
no matter whether it succeeded or hit an error. That's one of the measures to
simplify the overall code as much as possible.

As you might have noticed, the previous/current v9fs_do_readdir()
implementation had quite a bunch of individual error paths, which is quite
error-prone or at least makes it difficult to maintain. So I think it makes
sense to strip unnecessary branches as much as possible.

> Also I have the impression that all the if (dostat) { } block could
> be done before chaining a new entry.

Yes, you could move it forward, but what would that buy you?

I think you mean the case where there's an error inside the if (dostat) {}
block: the comments on struct V9fsDirEnt already suggest that the "st" member
is optional and may be NULL. So if there's an error inside if (dostat) {},
the caller still has a valid "dent" field at least, and it's up to the caller
whether or not it's a problem for its purpose that "st" is empty. For that
reason I would not move the block forward.

Best regards,
Christian Schoenebeck


* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-04-30 12:50     ` Christian Schoenebeck
@ 2020-04-30 13:30       ` Greg Kurz
  2020-05-01 14:04         ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-04-30 13:30 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Thu, 30 Apr 2020 14:50:31 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 30. April 2020 13:42:35 CEST Greg Kurz wrote:
> > > +/*
> > > + * This is solely executed on a background IO thread.
> > > + *
> > > + * See v9fs_co_readdir_many() (as its only user) below for details.
> > > + */
> > > +static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > > +                             struct V9fsDirEnt **entries,
> > > +                             int32_t maxsize, bool dostat)
> > > +{
> > > +    V9fsState *s = pdu->s;
> > > +    V9fsString name;
> > > +    int len, err = 0;
> > > +    int32_t size = 0;
> > > +    off_t saved_dir_pos;
> > > +    struct dirent *dent;
> > > +    struct V9fsDirEnt *e = NULL;
> > > +    V9fsPath path;
> > > +    struct stat stbuf;
> > > 
> > > -            errno = 0;
> > > -            entry = s->ops->readdir(&s->ctx, &fidp->fs);
> > > -            if (!entry && errno) {
> > > +    *entries = NULL;
> > > +    v9fs_path_init(&path);
> > > +
> > > +    /*
> > > +     * TODO: Here should be a warn_report_once() if lock failed.
> > > +     *
> > > +     * With a good 9p client we should not get into concurrency here,
> > > +     * because a good client would not use the same fid for concurrent
> > > +     * requests. We do the lock here for safety reasons though. However
> > > +     * the client would then suffer performance issues, so better log
> > > that
> > > +     * issue here.
> > > +     */
> > > +    v9fs_readdir_lock(&fidp->fs.dir);
> > 
> > I agree that a client that issues concurrent readdir requests on the
> > same fid is probably asking for troubles, but this permitted by the
> > spec. Whether we should detect such conditions and warn or even fail
> > is discussion for another thread.
> > 
> > The locking is only needed to avoid concurrent accesses to the dirent
> > structure returned by readdir(), otherwise we could return partially
> > overwritten file names to the client. It must be done for each individual
> > call to readdir(), but certainly not for multiple calls.
> 
> Yeah, that would resolve this issue more appropriately for 9p2000.L, since 
> Treaddir specifies an offset, but for 9p2000.u the result of a concurrent read 
> on a directory (9p2000.u) would still be undefined.
> 

The bad client behavior you want to tackle has nothing to do with
the locking itself. Since all the code in 9p.c runs serialized in
the context of the QEMU main loop, concurrent readdir requests could
easily be detected up-front with a simple flag in the fid structure.
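
For example something like this (sketch only; 'readdir_in_flight' is a
made-up field that would need to be added to V9fsFidState, it does not
exist in the current code):

/*
 * Sketch: since the 9p.c code runs serialized in the main loop, a plain
 * bool in the fid is enough to spot a second readdir being issued on the
 * same fid before the first one has completed.
 */
static void coroutine_fn v9fs_readdir_guard_example(V9fsFidState *fidp)
{
    if (fidp->readdir_in_flight) {
        warn_report_once("9p: concurrent readdir requests on the same fid");
    }
    fidp->readdir_in_flight = true;
    /* ... dispatch the actual readdir work to the worker thread ... */
    fidp->readdir_in_flight = false;
}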

> > As discussed privately, I'm working on a patch to better address the
> > locking and I'd really prefer to merge this before your series. Sorry
> > for the delay again. I'll try to post ASAP.
> > 
> > Anyway, I have some more remarks.
> > 
> > > +
> > > +    /* save the directory position */
> > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > +    if (saved_dir_pos < 0) {
> > > +        err = saved_dir_pos;
> > > +        goto out;
> > > +    }
> > > +
> > > +    while (true) {
> > > +        /* get directory entry from fs driver */
> > > +        err = do_readdir(pdu, fidp, &dent);
> > > +        if (err || !dent) {
> > > +            break;
> > > +        }
> > > +
> > > +        /*
> > > +         * stop this loop as soon as it would exceed the allowed maximum
> > > +         * response message size for the directory entries collected so
> > > far, +         * because anything beyond that size would need to be
> > > discarded by +         * 9p controller (main thread / top half) anyway
> > > +         */
> > > +        v9fs_string_init(&name);
> > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > +        len = v9fs_readdir_response_size(&name);
> > > +        v9fs_string_free(&name);
> > > +        if (size + len > maxsize) {
> > > +            /* this is not an error case actually */
> > > +            break;
> > > +        }
> > > +
> > > +        /* append next node to result chain */
> > > +        if (!e) {
> > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > +        } else {
> > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > +        }
> > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > 
> > So we're allocating a bunch of stuff here...
> > 
> > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > +
> > > +        /* perform a full stat() for directory entry if requested by
> > > caller */ +        if (dostat) {
> > > +            err = s->ops->name_to_path(
> > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > +            );
> > > +            if (err < 0) {
> > > 
> > >                  err = -errno;
> > > 
> > > -            } else {
> > > -                *dent = entry;
> > > -                err = 0;
> > > +                break;
> > 
> > ... but we're erroring out there and it seems that we're leaking
> > all the entries that have been allocated so far.
> 
> No, they are not leaking actually.
> 
> You are right that they are not deallocated in do_readdir_many(), but that's 
> intentional: in the new implementation of v9fs_do_readdir() you see that 
> v9fs_free_dirents(entries) is *always* called at the very end of the function, 
> no matter if success or any error. That's one of the measures to simplify 
> overall code as much as possible.
> 

Hmm... I still don't quite like the idea of having an erroring function
asking for extra cleanup. I suggest you come up with an idempotent version
of v9fs_free_dirents(), move it to codir.c (I also prefer locality of calls
to g_malloc and g_free in the same unit), make it extern and call it
both on the error path of v9fs_co_readdir_many() and in v9fs_do_readdir().
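
Something along these lines (sketch only, reusing the v9fs_free_dirents()
body from patch 4; not an actual patch):

/* codir.c: no longer static, safe to call with NULL or a partial chain */
void v9fs_free_dirents(struct V9fsDirEnt *e)
{
    struct V9fsDirEnt *next = NULL;

    for (; e; e = next) {
        next = e->next;
        g_free(e->dent);
        g_free(e->st);    /* g_free(NULL) is a no-op */
        g_free(e);
    }
}

/* ...and on the error path at the end of do_readdir_many(): */
    if (err < 0) {
        v9fs_free_dirents(*entries);   /* free the partial chain on error */
        *entries = NULL;
        return err;
    }
    return size;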

> As you might have noticed, the previous/current v9fs_do_readdir() 
> implementation had quite a bunch of individual error pathes, which is quite 
> error prone or at least makes it difficult to maintain. So I think it makes 
> sense to strip unnecessary branches as much as possible.
> 
> > Also I have the impression that all the if (dostat) { } block could
> > be done before chaining a new entry.
> 
> Yes, you could move it forward, but what would you buy from that?
> 

It just seems a better practice to do the things that can fail up front.

> I think you mean the case when there's an error inside the if (dostat) {} 
> block: The comments on struct V9fsDirEnt already suggest that the "st" member 
> is optional and may be NULL. So if there's an error inside if (dostat) {}
> then caller still has a valid "dent" field at least and it's up to caller 
> whether or not it's a problem for its purpose that "st" is empty. For that 
> reason I would not move the block forward.
> 

Hrm... the comment is:

    /*
     * optional (may be NULL): A full stat of each directory entry is just
     * done if explicitly told to fs driver.
     */

I don't read it as saying that it is optional for the fs driver to populate
"st" when the caller required it. Also, looking at the next patch
I see that the condition for calling stat() is V9FS_REMAP_INODES and
that the code explicitly requires "st" to be available in this case.

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg


* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-04-30 13:30       ` Greg Kurz
@ 2020-05-01 14:04         ` Christian Schoenebeck
  2020-05-04  9:18           ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-05-01 14:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > I agree that a client that issues concurrent readdir requests on the
> > > same fid is probably asking for troubles, but this permitted by the
> > > spec. Whether we should detect such conditions and warn or even fail
> > > is discussion for another thread.
> > > 
> > > The locking is only needed to avoid concurrent accesses to the dirent
> > > structure returned by readdir(), otherwise we could return partially
> > > overwritten file names to the client. It must be done for each
> > > individual
> > > call to readdir(), but certainly not for multiple calls.
> > 
> > Yeah, that would resolve this issue more appropriately for 9p2000.L, since
> > Treaddir specifies an offset, but for 9p2000.u the result of a concurrent
> > read on a directory (9p2000.u) would still be undefined.
> 
> The bad client behavior you want to tackle has nothing to do with
> the locking itself. Since all the code in 9p.c runs serialized in
> the context of the QEMU main loop, concurrent readdir requests could
> easily be detected up-front with a simple flag in the fid structure.

Well, it's fine with me. I don't really see an issue here right now. But the
claim that all the code is serialized is not entirely true. Most of the 9p.c
code still dispatches heavily between the main thread and worker threads, back
and forth, and for that reason the order of request processing might change
quite arbitrarily in between. Just keep that in mind.

> > > > +
> > > > +    /* save the directory position */
> > > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > > +    if (saved_dir_pos < 0) {
> > > > +        err = saved_dir_pos;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    while (true) {
> > > > +        /* get directory entry from fs driver */
> > > > +        err = do_readdir(pdu, fidp, &dent);
> > > > +        if (err || !dent) {
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /*
> > > > +         * stop this loop as soon as it would exceed the allowed maximum
> > > > +         * response message size for the directory entries collected so far,
> > > > +         * because anything beyond that size would need to be discarded by
> > > > +         * 9p controller (main thread / top half) anyway
> > > > +         */
> > > > +        v9fs_string_init(&name);
> > > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > > +        len = v9fs_readdir_response_size(&name);
> > > > +        v9fs_string_free(&name);
> > > > +        if (size + len > maxsize) {
> > > > +            /* this is not an error case actually */
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /* append next node to result chain */
> > > > +        if (!e) {
> > > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > > +        } else {
> > > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > > +        }
> > > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > > 
> > > So we're allocating a bunch of stuff here...
> > > 
> > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > +
> > > > +        /* perform a full stat() for directory entry if requested by caller */
> > > > +        if (dostat) {
> > > > +            err = s->ops->name_to_path(
> > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > +            );
> > > > +            if (err < 0) {
> > > >                  err = -errno;
> > > > -            } else {
> > > > -                *dent = entry;
> > > > -                err = 0;
> > > > +                break;
> > > 
> > > ... but we're erroring out there and it seems that we're leaking
> > > all the entries that have been allocated so far.
> > 
> > No, they are not leaking actually.
> > 
> > You are right that they are not deallocated in do_readdir_many(), but
> > that's intentional: in the new implementation of v9fs_do_readdir() you
> > see that v9fs_free_dirents(entries) is *always* called at the very end of
> > the function, no matter if success or any error. That's one of the
> > measures to simplify overall code as much as possible.
> 
> Hmm... I still don't quite like the idea of having an erroring function
> asking for extra cleanup. I suggest you come up with an idem-potent version
> of v9fs_free_dirents(), move it to codir.c (I also prefer locality of calls
> to g_malloc and g_free in the same unit), make it extern and call it
> both on the error path of v9fs_co_readdir_many() and in v9fs_do_readdir().

I understand your position of course, but I still don't find that to be a good
move.

My veto here has a reason: the change you request would prevent an application
that I have in mind for a future purpose: allowing "greedy" fetching of
directory entries, that is, returning all successfully read entries even though
some of the entries might have errored for some reason. Think of a directory
where one entry is another device that errors for some reason; a user would
probably still want to see the other entries at least. I know that purpose
would still need some minor adjustments, but no changes to the current
structure of this function.

But to also make this clear: there is no "extra" cleanup right now.
v9fs_free_dirents(entries) is *always* called by the caller, no matter whether
success or error, allocated list or NULL. It couldn't be simpler. A user of
this function must call v9fs_free_dirents(entries) at some point anyway.

If you have a bad feeling about it, I can make it clearer with an API doc
comment on v9fs_co_readdir_many() if you want, e.g. "You @b MUST @b ALWAYS
call v9fs_free_dirents() after using this function, on both success and
error.".

> > I think you mean the case when there's an error inside the if (dostat) {}
> > block: The comments on struct V9fsDirEnt already suggest that the "st"
> > member is optional and may be NULL. So if there's an error inside if
> > (dostat) {} then caller still has a valid "dent" field at least and it's
> > up to caller whether or not it's a problem for its purpose that "st" is
> > empty. For that reason I would not move the block forward.
> 
> Hrm... the comment is:
> 
>     /*
>      * optional (may be NULL): A full stat of each directory entry is just
>      * done if explicitly told to fs driver.
>      */
> 
> I don't read that it is optional for the fs driver to populate "st"
> if this was required by the caller.

Well, if you find that ambiguous, I could add an additional sentence: "It might
also be NULL if stat failed.".

> Also, looking at the next patch
> I see that the condition for calling stat() is V9FS_REMAP_INODES and
> the code explicitly requires "st" to be available in this case.

Yes, but there is also an if (!st) { ... } in the subsequent patch already. So 
I don't see an issue here.

Changing the order in readdir_many() would prevent the future applications of
that function that I described. Or let's say, the order would need to be
reverted again later on.

Best regards,
Christian Schoenebeck





* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-05-01 14:04         ` Christian Schoenebeck
@ 2020-05-04  9:18           ` Greg Kurz
  2020-05-04 10:08             ` Christian Schoenebeck
  2020-05-07 12:16             ` Christian Schoenebeck
  0 siblings, 2 replies; 30+ messages in thread
From: Greg Kurz @ 2020-05-04  9:18 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Fri, 01 May 2020 16:04:41 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > I agree that a client that issues concurrent readdir requests on the
> > > > same fid is probably asking for troubles, but this permitted by the
> > > > spec. Whether we should detect such conditions and warn or even fail
> > > > is discussion for another thread.
> > > > 
> > > > The locking is only needed to avoid concurrent accesses to the dirent
> > > > structure returned by readdir(), otherwise we could return partially
> > > > overwritten file names to the client. It must be done for each
> > > > individual
> > > > call to readdir(), but certainly not for multiple calls.
> > > 
> > > Yeah, that would resolve this issue more appropriately for 9p2000.L, since
> > > Treaddir specifies an offset, but for 9p2000.u the result of a concurrent
> > > read on a directory (9p2000.u) would still be undefined.
> > 
> > The bad client behavior you want to tackle has nothing to do with
> > the locking itself. Since all the code in 9p.c runs serialized in
> > the context of the QEMU main loop, concurrent readdir requests could
> > easily be detected up-front with a simple flag in the fid structure.
> 
> Well, it's fine with me. I don't really see an issue here right now. But that 
> all the code was serialized is not fully true. Most of the 9p.c code is still 
> heavily dispatching between main thread and worker threads back and forth. And 
> for that reason the order of request processing might change quite arbitrarily 
> in between. Just keep that in mind.
> 

Just to make things clear. The code in 9p.c is ALWAYS exclusively run by
the main thread. Only the code called under v9fs_co_run_in_worker() is
dispatched on worker threads. So, yes the order of individual backend
operations may change, but the start of a new client request is necessarily
serialized with the completion of pending ones, which is the only thing
we actually care about.

> > > > > +
> > > > > +    /* save the directory position */
> > > > > +    saved_dir_pos = s->ops->telldir(&s->ctx, &fidp->fs);
> > > > > +    if (saved_dir_pos < 0) {
> > > > > +        err = saved_dir_pos;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    while (true) {
> > > > > +        /* get directory entry from fs driver */
> > > > > +        err = do_readdir(pdu, fidp, &dent);
> > > > > +        if (err || !dent) {
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /*
> > > > > +         * stop this loop as soon as it would exceed the allowed maximum
> > > > > +         * response message size for the directory entries collected so far,
> > > > > +         * because anything beyond that size would need to be discarded by
> > > > > +         * 9p controller (main thread / top half) anyway
> > > > > +         */
> > > > > +        v9fs_string_init(&name);
> > > > > +        v9fs_string_sprintf(&name, "%s", dent->d_name);
> > > > > +        len = v9fs_readdir_response_size(&name);
> > > > > +        v9fs_string_free(&name);
> > > > > +        if (size + len > maxsize) {
> > > > > +            /* this is not an error case actually */
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /* append next node to result chain */
> > > > > +        if (!e) {
> > > > > +            *entries = e = g_malloc0(sizeof(V9fsDirEnt));
> > > > > +        } else {
> > > > > +            e = e->next = g_malloc0(sizeof(V9fsDirEnt));
> > > > > +        }
> > > > > +        e->dent = g_malloc0(sizeof(struct dirent));
> > > > 
> > > > So we're allocating a bunch of stuff here...
> > > > 
> > > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > > +
> > > > > +        /* perform a full stat() for directory entry if requested by caller */
> > > > > +        if (dostat) {
> > > > > +            err = s->ops->name_to_path(
> > > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > > +            );
> > > > > +            if (err < 0) {
> > > > >                  err = -errno;
> > > > > -            } else {
> > > > > -                *dent = entry;
> > > > > -                err = 0;
> > > > > +                break;
> > > > 
> > > > ... but we're erroring out there and it seems that we're leaking
> > > > all the entries that have been allocated so far.
> > > 
> > > No, they are not leaking actually.
> > > 
> > > You are right that they are not deallocated in do_readdir_many(), but
> > > that's intentional: in the new implementation of v9fs_do_readdir() you
> > > see that v9fs_free_dirents(entries) is *always* called at the very end of
> > > the function, no matter if success or any error. That's one of the
> > > measures to simplify overall code as much as possible.
> > 
> > Hmm... I still don't quite like the idea of having an erroring function
> > asking for extra cleanup. I suggest you come up with an idem-potent version
> > of v9fs_free_dirents(), move it to codir.c (I also prefer locality of calls
> > to g_malloc and g_free in the same unit), make it extern and call it
> > both on the error path of v9fs_co_readdir_many() and in v9fs_do_readdir().
> 
> I understand your position of course, but I still won't find that to be a good 
> move.
> 
> My veto here has a reason: your requested change would prevent an application 
> that I had in mind for future purpose actually: Allowing "greedy" fetching 

Are you telling me that this series has some kind of hidden agenda related to
a possible future change?!

> directory entries, that is returning all successful read entries while some of 
> the entries might have been errored for some reason. Think about a directory 
> where one entry is another device which errors for some reason, then a user 
> probably still would want to see the other entries at least. I know that 

Hrm... if dostat is such a weak requirement that callers might want to
simply ignore the missing stat, then readdir_many() shouldn't error out
in the first place.

> purpose would still need some minor adjustments, but no changes to the current 
> structure of this function actually.
> 
> But to also make this clear: there is no "extra" cleanup right now. 
> v9fs_free_dirents(entries) is *always* called by caller, no matter if success, 
> error, allocated list or NULL. It couldn't be more simple. A user of this 
> function must call v9fs_free_dirents(entries) at a certain point anyway.
> 
> If you have a bad feeling about it, I can make it more clear by an API doc 
> comment on v9fs_co_readdir_many() if you want, like e.g. "You @b MUST @ ALWAYS 
> call v9fs_free_dirents() after using this function, both on success or 
> error.".
> 

No, I just cannot ack the case of a function that returns partially valid
data and an error at the same time. Doesn't look sane to me.

> > > I think you mean the case when there's an error inside the if (dostat) {}
> > > block: The comments on struct V9fsDirEnt already suggest that the "st"
> > > member is optional and may be NULL. So if there's an error inside if
> > > (dostat) {} then caller still has a valid "dent" field at least and it's
> > > up to caller whether or not it's a problem for its purpose that "st" is
> > > empty. For that reason I would not move the block forward.
> > 
> > Hrm... the comment is:
> > 
> >     /*
> >      * optional (may be NULL): A full stat of each directory entry is just
> >      * done if explicitly told to fs driver.
> >      */
> > 
> > I don't read that it is optional for the fs driver to populate "st"
> > if this was required by the caller.
> 
> Well, if you find that ambigious, I could add an additional sentence "It might 
> also be NULL if stat failed.".
> 
> > Also, looking at the next patch
> > I see that the condition for calling stat() is V9FS_REMAP_INODES and
> > the code explicitly requires "st" to be available in this case.
> 
> Yes, but there is also an if (!st) { ... } in the subsequent patch already. So 
> I don't see an issue here.
> 
> Changing the order in readdir_many() would prevent future applications of that 
> function that I described. Or let's say, order would need to be reverted back 
> again later on.
> 
> Best regards,
> Christian Schoenebeck
> 
> 




* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-05-04  9:18           ` Greg Kurz
@ 2020-05-04 10:08             ` Christian Schoenebeck
  2020-05-07 12:16             ` Christian Schoenebeck
  1 sibling, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-05-04 10:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> > > > > > +        memcpy(e->dent, dent, sizeof(struct dirent));
> > > > > > +
> > > > > > +        /* perform a full stat() for directory entry if requested by caller */
> > > > > > +        if (dostat) {
> > > > > > +            err = s->ops->name_to_path(
> > > > > > +                &s->ctx, &fidp->path, dent->d_name, &path
> > > > > > +            );
> > > > > > +            if (err < 0) {
> > > > > >                  err = -errno;
> > > > > > -            } else {
> > > > > > -                *dent = entry;
> > > > > > -                err = 0;
> > > > > > +                break;
> > > > > 
> > > > > ... but we're erroring out there and it seems that we're leaking
> > > > > all the entries that have been allocated so far.
> > > > 
> > > > No, they are not leaking actually.
> > > > 
> > > > You are right that they are not deallocated in do_readdir_many(), but
> > > > that's intentional: in the new implementation of v9fs_do_readdir() you
> > > > see that v9fs_free_dirents(entries) is *always* called at the very end
> > > > of
> > > > the function, no matter if success or any error. That's one of the
> > > > measures to simplify overall code as much as possible.
> > > 
> > > Hmm... I still don't quite like the idea of having an erroring function
> > > asking for extra cleanup. I suggest you come up with an idem-potent
> > > version
> > > of v9fs_free_dirents(), move it to codir.c (I also prefer locality of
> > > calls
> > > to g_malloc and g_free in the same unit), make it extern and call it
> > > both on the error path of v9fs_co_readdir_many() and in
> > > v9fs_do_readdir().
> > 
> > I understand your position of course, but I still won't find that to be a
> > good move.
> > 
> > My veto here has a reason: your requested change would prevent an
> > application that I had in mind for future purpose actually: Allowing
> > "greedy" fetching
> Are you telling that this series has some kind of hidden agenda related to
> a possible future change ?!?

readdir_many() is intentionally written as a general-purpose directory
retrieval function, that is, with other future purposes in mind, yes.

What I don't do, of course, is add code which is not explicitly needed right
now. That would not make sense and would make the code unnecessarily bloated
and too complicated (e.g. readdir_many() currently simply calls
v9fs_readdir_response_size() directly to decide whether to terminate the loop,
instead of taking some complicated general-purpose loop-end "predicate"
structure or callback as a function argument).

But when it comes to the structure of the code that I have to add NOW, then I
indeed take potential future changes into account, yes! And this applies
specifically to the two changes you requested here inside readdir_many(),
because I already know I would need to revert those two changes later on. And
I don't see any issue whatsoever with retaining the current version concerning
those two.

Best regards,
Christian Schoenebeck





* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-05-04  9:18           ` Greg Kurz
  2020-05-04 10:08             ` Christian Schoenebeck
@ 2020-05-07 12:16             ` Christian Schoenebeck
  2020-05-07 15:59               ` Greg Kurz
  1 sibling, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-05-07 12:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> On Fri, 01 May 2020 16:04:41 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > > I agree that a client that issues concurrent readdir requests on the
> > > > > same fid is probably asking for troubles, but this permitted by the
> > > > > spec. Whether we should detect such conditions and warn or even fail
> > > > > is discussion for another thread.
> > > > > 
> > > > > The locking is only needed to avoid concurrent accesses to the
> > > > > dirent
> > > > > structure returned by readdir(), otherwise we could return partially
> > > > > overwritten file names to the client. It must be done for each
> > > > > individual
> > > > > call to readdir(), but certainly not for multiple calls.
> > > > 
> > > > Yeah, that would resolve this issue more appropriately for 9p2000.L,
> > > > since
> > > > Treaddir specifies an offset, but for 9p2000.u the result of a
> > > > concurrent
> > > > read on a directory (9p2000.u) would still be undefined.
> > > 
> > > The bad client behavior you want to tackle has nothing to do with
> > > the locking itself. Since all the code in 9p.c runs serialized in
> > > the context of the QEMU main loop, concurrent readdir requests could
> > > easily be detected up-front with a simple flag in the fid structure.
> > 
> > Well, it's fine with me. I don't really see an issue here right now. But
> > that all the code was serialized is not fully true. Most of the 9p.c code
> > is still heavily dispatching between main thread and worker threads back
> > and forth. And for that reason the order of request processing might
> > change quite arbitrarily in between. Just keep that in mind.
> 
> Just to make things clear. The code in 9p.c is ALWAYS exclusively run by
> the main thread. Only the code called under v9fs_co_run_in_worker() is
> dispatched on worker threads. So, yes the order of individual backend
> operations may change, but the start of a new client request is necessarily
> serialized with the completion of pending ones, which is the only thing
> we care for actually.

I just looked at this. The 9p.c code is called by the main I/O thread only,
that's clear. The start of requests also comes in order, yes, but you seem to
assume that the main I/O thread would not grab the next client request from
the queue before completing the current/previous client request (that is, not
before transmitting the result to the client). If so, I am not so sure about
this claim:

For instance, v9fs_path_write_lock() is using a co-mutex, right? So an
unsuccessful lock would cause the main I/O thread to grab the next request
before completing the current/previous request.

And what happens on any run_in_worker({}) call? If there is another client
request in the queue, the main I/O thread would pull that request from the
queue before waiting for the worker thread to complete its task, wouldn't it?

Just looked at the code so far, haven't tested it yet ...

Best regards,
Christian Schoenebeck





* Re: [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many()
  2020-05-07 12:16             ` Christian Schoenebeck
@ 2020-05-07 15:59               ` Greg Kurz
  0 siblings, 0 replies; 30+ messages in thread
From: Greg Kurz @ 2020-05-07 15:59 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Thu, 07 May 2020 14:16:43 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Montag, 4. Mai 2020 11:18:34 CEST Greg Kurz wrote:
> > On Fri, 01 May 2020 16:04:41 +0200
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Donnerstag, 30. April 2020 15:30:49 CEST Greg Kurz wrote:
> > > > > > I agree that a client that issues concurrent readdir requests on the
> > > > > > same fid is probably asking for troubles, but this permitted by the
> > > > > > spec. Whether we should detect such conditions and warn or even fail
> > > > > > is discussion for another thread.
> > > > > > 
> > > > > > The locking is only needed to avoid concurrent accesses to the
> > > > > > dirent
> > > > > > structure returned by readdir(), otherwise we could return partially
> > > > > > overwritten file names to the client. It must be done for each
> > > > > > individual
> > > > > > call to readdir(), but certainly not for multiple calls.
> > > > > 
> > > > > Yeah, that would resolve this issue more appropriately for 9p2000.L,
> > > > > since
> > > > > Treaddir specifies an offset, but for 9p2000.u the result of a
> > > > > concurrent
> > > > > read on a directory (9p2000.u) would still be undefined.
> > > > 
> > > > The bad client behavior you want to tackle has nothing to do with
> > > > the locking itself. Since all the code in 9p.c runs serialized in
> > > > the context of the QEMU main loop, concurrent readdir requests could
> > > > easily be detected up-front with a simple flag in the fid structure.
> > > 
> > > Well, it's fine with me. I don't really see an issue here right now. But
> > > that all the code was serialized is not fully true. Most of the 9p.c code
> > > is still heavily dispatching between main thread and worker threads back
> > > and forth. And for that reason the order of request processing might
> > > change quite arbitrarily in between. Just keep that in mind.
> > 
> > Just to make things clear. The code in 9p.c is ALWAYS exclusively run by
> > the main thread. Only the code called under v9fs_co_run_in_worker() is
> > dispatched on worker threads. So, yes the order of individual backend
> > operations may change, but the start of a new client request is necessarily
> > serialized with the completion of pending ones, which is the only thing
> > we care for actually.
> 
> I just looked at this. 9p.c code is called by main I/O thread only, that's 
> clear. The start of requests come also in in order, yes, but it seems you 
> would think that main I/O thread would not grab the next client request from 
> queue before completing the current/previous client request (that is not 
> before transmitting result to client). If yes, I am not so sure about this 
> claim:
> 

Hrm... I don't think I've ever said anything like that :)

What I'm saying is that, thanks to the serialization of
request start and completion, v9fs_readdir() and v9fs_read()
can:
- flag the fid as being involved in a readdir request
- clear the flag when the request is complete or failed
- directly fail or print a warning if the fid already
  has the flag set (ie. a previous request hasn't
  completed yet)

But again, I'm not a big fan of adding code to handle
such an unlikely situation, which is not even explicitly
forbidden by the 9p spec.
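
For the record, the kind of check I have in mind is something like this
(sketch only; the readdir_in_flight field and the helper names are made up
for illustration, they don't exist in the current code):

/* top half, before dispatching the Treaddir work to a worker */
static int v9fs_mark_readdir_busy(V9fsFidState *fidp)
{
    if (fidp->readdir_in_flight) {
        /* a previous request on this fid has not completed yet */
        return -EBUSY;
    }
    fidp->readdir_in_flight = true;
    return 0;
}

/* top half again, on completion or failure of the request */
static void v9fs_clear_readdir_busy(V9fsFidState *fidp)
{
    fidp->readdir_in_flight = false;
}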

> For instance v9fs_path_write_lock() is using a co-mutex, right? So an 
> unsuccesful lock would cause main I/O thread to grab the next request before 
> completing the current/previous request.
> 

An unsuccessful lock would cause the main I/O thread to switch
to some other task.

> And what happens on any run_in_worker({}) call? If there is another client 
> request in the queue, main I/O thread would pull that request from the queue 
> before waiting for the worker thread to complete its task, wouldn't it?
> 

The main I/O thread won't wait for anything, so, yes, it will probably
start handling the new request until it yields.

> Just looked at the code so far, haven't tested it yet ...
> 
> Best regards,
> Christian Schoenebeck
> 
> 




* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-04-19 15:06 ` [PATCH v6 4/5] 9pfs: T_readdir latency optimization Christian Schoenebeck
@ 2020-06-03 17:16   ` Christian Schoenebeck
  2020-06-29 16:39     ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-06-03 17:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> Make top half really top half and bottom half really bottom half:
> 
> Each T_readdir request handling is hopping between threads (main
> I/O thread and background I/O driver threads) several times for
> every individual directory entry, which sums up to huge latencies
> for handling just a single T_readdir request.
> 
> Instead of doing that, collect now all required directory entries
> (including all potentially required stat buffers for each entry) in
> one rush on a background I/O thread from fs driver by calling the
> previously added function v9fs_co_readdir_many() instead of
> v9fs_co_readdir(), then assemble the entire resulting network
> response message for the readdir request on main I/O thread. The
> fs driver is still aborting the directory entry retrieval loop
> (on the background I/O thread inside of v9fs_co_readdir_many())
> as soon as it would exceed the client's requested maximum R_readdir
> response size. So this will not introduce a performance penalty on
> another end.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/9p.c | 122 +++++++++++++++++++++++----------------------------
>  1 file changed, 55 insertions(+), 67 deletions(-)

Ping. Anybody?

I would like to roll out this patch set soon and this is the only patch in 
this series missing a review yet.





* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-06-03 17:16   ` Christian Schoenebeck
@ 2020-06-29 16:39     ` Greg Kurz
  2020-06-30 15:16       ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-06-29 16:39 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Wed, 03 Jun 2020 19:16:08 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> > Make top half really top half and bottom half really bottom half:
> > 
> > Each T_readdir request handling is hopping between threads (main
> > I/O thread and background I/O driver threads) several times for
> > every individual directory entry, which sums up to huge latencies
> > for handling just a single T_readdir request.
> > 
> > Instead of doing that, collect now all required directory entries
> > (including all potentially required stat buffers for each entry) in
> > one rush on a background I/O thread from fs driver by calling the
> > previously added function v9fs_co_readdir_many() instead of
> > v9fs_co_readdir(), then assemble the entire resulting network
> > response message for the readdir request on main I/O thread. The
> > fs driver is still aborting the directory entry retrieval loop
> > (on the background I/O thread inside of v9fs_co_readdir_many())
> > as soon as it would exceed the client's requested maximum R_readdir
> > response size. So this will not introduce a performance penalty on
> > another end.
> > 
> > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > ---
> >  hw/9pfs/9p.c | 122 +++++++++++++++++++++++----------------------------
> >  1 file changed, 55 insertions(+), 67 deletions(-)
> 
> Ping. Anybody?
> 
> I would like to roll out this patch set soon and this is the only patch in 
> this series missing a review yet.
> 

Hi Christian,

Sorry for getting back to this only now :-\

So I still have some concerns about the locking of the directory stream
pointer a fid. It was initially introduced to avoid concurrent accesses
by multiple threads to the corresponding internal glibc object, as
recommended in the readdir(3) manual page. Now, this patch considerably
extends the critical section to also contain calls to telldir() and all
the _many_ readdir()... so I'm not sure exactly what's the purpose of
that mutex right now. Please provide more details.

Cheers,

--
Greg



* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-06-29 16:39     ` Greg Kurz
@ 2020-06-30 15:16       ` Christian Schoenebeck
  2020-06-30 16:39         ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-06-30 15:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Montag, 29. Juni 2020 18:39:02 CEST Greg Kurz wrote:
> On Wed, 03 Jun 2020 19:16:08 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> > > Make top half really top half and bottom half really bottom half:
> > > 
> > > Each T_readdir request handling is hopping between threads (main
> > > I/O thread and background I/O driver threads) several times for
> > > every individual directory entry, which sums up to huge latencies
> > > for handling just a single T_readdir request.
> > > 
> > > Instead of doing that, collect now all required directory entries
> > > (including all potentially required stat buffers for each entry) in
> > > one rush on a background I/O thread from fs driver by calling the
> > > previously added function v9fs_co_readdir_many() instead of
> > > v9fs_co_readdir(), then assemble the entire resulting network
> > > response message for the readdir request on main I/O thread. The
> > > fs driver is still aborting the directory entry retrieval loop
> > > (on the background I/O thread inside of v9fs_co_readdir_many())
> > > as soon as it would exceed the client's requested maximum R_readdir
> > > response size. So this will not introduce a performance penalty on
> > > another end.
> > > 
> > > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > ---
> > > 
> > >  hw/9pfs/9p.c | 122 +++++++++++++++++++++++----------------------------
> > >  1 file changed, 55 insertions(+), 67 deletions(-)
> > 
> > Ping. Anybody?
> > 
> > I would like to roll out this patch set soon and this is the only patch in
> > this series missing a review yet.
> 
> Hi Christian,

Hi Greg,

> Sorry for getting back to this only now :-\
> 
> So I still have some concerns about the locking of the directory stream
> pointer a fid. It was initially introduced to avoid concurrent accesses
> by multiple threads to the corresponding internal glibc object, as
> recommended in the readdir(3) manual page. Now, this patch considerably
> extends the critical section to also contain calls to telldir() and all
> the _many_ readdir()... so I'm not sure exactly what's the purpose of
> that mutex right now. Please provide more details.

The intention of this lock is to prevent concurrent r/w (i.e. telldir()/
readdir()) of the dir stream position by two or more active readdir requests
handled in parallel, provided that they use the same FID. Due to the latter
requirement for this to become relevant, we already agreed that this is not
something that would usually happen with a Linux 9p client, but there is
nothing from a protocol point of view that would prohibit a client from doing
this, so the resulting undefined behaviour should be prevented, which this
lock does.

What's your precise concern?

Best regards,
Christian Schoenebeck





* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-06-30 15:16       ` Christian Schoenebeck
@ 2020-06-30 16:39         ` Greg Kurz
  2020-06-30 18:00           ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-06-30 16:39 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Tue, 30 Jun 2020 17:16:40 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Montag, 29. Juni 2020 18:39:02 CEST Greg Kurz wrote:
> > On Wed, 03 Jun 2020 19:16:08 +0200
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> > > > Make top half really top half and bottom half really bottom half:
> > > > 
> > > > Each T_readdir request handling is hopping between threads (main
> > > > I/O thread and background I/O driver threads) several times for
> > > > every individual directory entry, which sums up to huge latencies
> > > > for handling just a single T_readdir request.
> > > > 
> > > > Instead of doing that, collect now all required directory entries
> > > > (including all potentially required stat buffers for each entry) in
> > > > one rush on a background I/O thread from fs driver by calling the
> > > > previously added function v9fs_co_readdir_many() instead of
> > > > v9fs_co_readdir(), then assemble the entire resulting network
> > > > response message for the readdir request on main I/O thread. The
> > > > fs driver is still aborting the directory entry retrieval loop
> > > > (on the background I/O thread inside of v9fs_co_readdir_many())
> > > > as soon as it would exceed the client's requested maximum R_readdir
> > > > response size. So this will not introduce a performance penalty on
> > > > another end.
> > > > 
> > > > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > ---
> > > > 
> > > >  hw/9pfs/9p.c | 122 +++++++++++++++++++++++----------------------------
> > > >  1 file changed, 55 insertions(+), 67 deletions(-)
> > > 
> > > Ping. Anybody?
> > > 
> > > I would like to roll out this patch set soon and this is the only patch in
> > > this series missing a review yet.
> > 
> > Hi Christian,
> 
> Hi Greg,
> 
> > Sorry for getting back to this only now :-\
> > 
> > So I still have some concerns about the locking of the directory stream
> > pointer a fid. It was initially introduced to avoid concurrent accesses
> > by multiple threads to the corresponding internal glibc object, as
> > recommended in the readdir(3) manual page. Now, this patch considerably
> > extends the critical section to also contain calls to telldir() and all
> > the _many_ readdir()... so I'm not sure exactly what's the purpose of
> > that mutex right now. Please provide more details.
> 
> The intention of this lock is to prevent concurrent r/w (i.e. telldir()/
> readdir()) of the dir stream position by two or more active readdir requests 
> handled in parallel, provided that they would use the same FID. Due to the 
> latter requirement for this to become relevant, we already agreed that this is 
> not something that would usually happen with a Linux 9p client, but there is 
> nothing from protocol point of view that would prohibit this to be done by a 
> client, so the resulting undefined behaviour should be prevented, which this 
> lock does.
> 

Makes sense. The dir stream position is no different from glibc's internal
dirent in that respect: it shouldn't be used by concurrent threads.

> What's your precise concern?
> 

My overall concern is still the same. The patches are big and I've
been away for too long, so I need some more discussion to reassemble
my thoughts and the puzzle :)

But now that I'm starting to understand why it's a good thing to
extend the critical section, I realize it should be extended
even more: we also have calls to seekdir() and rewinddir() in
v9fs_readdir() and friends that could _theoretically_ cause the
very same kind of undefined behavior you're mentioning.

I think the change is important enough that it deserves its
own patch. I expect that moving the locking to the top-level
handler might also simplify the existing code, and thus your
series as well.

Please let me come up with a patch first.

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg



* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-06-30 16:39         ` Greg Kurz
@ 2020-06-30 18:00           ` Christian Schoenebeck
  2020-07-01 10:09             ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-06-30 18:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Dienstag, 30. Juni 2020 18:39:57 CEST Greg Kurz wrote:
> On Tue, 30 Jun 2020 17:16:40 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Montag, 29. Juni 2020 18:39:02 CEST Greg Kurz wrote:
> > > On Wed, 03 Jun 2020 19:16:08 +0200
> > > 
> > > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > > On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> > > > > Make top half really top half and bottom half really bottom half:
> > > > > 
> > > > > Each T_readdir request handling is hopping between threads (main
> > > > > I/O thread and background I/O driver threads) several times for
> > > > > every individual directory entry, which sums up to huge latencies
> > > > > for handling just a single T_readdir request.
> > > > > 
> > > > > Instead of doing that, collect now all required directory entries
> > > > > (including all potentially required stat buffers for each entry) in
> > > > > one rush on a background I/O thread from fs driver by calling the
> > > > > previously added function v9fs_co_readdir_many() instead of
> > > > > v9fs_co_readdir(), then assemble the entire resulting network
> > > > > response message for the readdir request on main I/O thread. The
> > > > > fs driver is still aborting the directory entry retrieval loop
> > > > > (on the background I/O thread inside of v9fs_co_readdir_many())
> > > > > as soon as it would exceed the client's requested maximum R_readdir
> > > > > response size. So this will not introduce a performance penalty on
> > > > > another end.
> > > > > 
> > > > > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > > ---
> > > > > 
> > > > >  hw/9pfs/9p.c | 122
> > > > >  +++++++++++++++++++++++----------------------------
> > > > >  1 file changed, 55 insertions(+), 67 deletions(-)
> > > > 
> > > > Ping. Anybody?
> > > > 
> > > > I would like to roll out this patch set soon and this is the only
> > > > patch in
> > > > this series missing a review yet.
> > > 
> > > Hi Christian,
> > 
> > Hi Greg,
> > 
> > > Sorry for getting back to this only now :-\
> > > 
> > > So I still have some concerns about the locking of the directory stream
> > > pointer a fid. It was initially introduced to avoid concurrent accesses
> > > by multiple threads to the corresponding internal glibc object, as
> > > recommended in the readdir(3) manual page. Now, this patch considerably
> > > extends the critical section to also contain calls to telldir() and all
> > > the _many_ readdir()... so I'm not sure exactly what's the purpose of
> > > that mutex right now. Please provide more details.
> > 
> > The intention of this lock is to prevent concurrent r/w (i.e. telldir()/
> > readdir()) of the dir stream position by two or more active readdir
> > requests handled in parallel, provided that they would use the same FID.
> > Due to the latter requirement for this to become relevant, we already
> > agreed that this is not something that would usually happen with a Linux
> > 9p client, but there is nothing from protocol point of view that would
> > prohibit this to be done by a client, so the resulting undefined
> > behaviour should be prevented, which this lock does.
> 
> Makes sense. The dir stream position is no different from glibc's internal
> dirent in that respect: it shouldn't be used by concurrent threads.

Exactly, it is a conceptual issue per se in general, independent of whether 
glibc is used and independent of the fs driver.

> > What's your precise concern?
> 
> My overall concern is still the same. The patches are big and I've
> been away for too long, so I need some more discussion to reassemble
> my thoughts and the puzzle :)

Np, if you have questions, just come along.

> But now that I'm starting to understand why it's a good thing to
> extend the critical section, I realize it should be extended
> even more: we also have calls to seekdir() and rewindir() in
> v9fs_readdir() and friends that could _theoretically_ cause the
> very same kind of undefined behavior you're mentioning.

You are talking about the "old" code now. Yes, it is wrong, and I did not
bother to fix the "old" implementation, since this optimized readdir
implementation fixes this issue as well in one rush, but also note ...

> I think the change is important enough that it deserves its
> own patch. I expect that moving the locking to the top-level
> handler might also simplify the existing code, and thus your
> series as well.

... please reconsider whether you really want to open that box. The "old"
readdir implementation on the main thread is inherently wrong, and handling
the lock and the error paths there is extremely cumbersome and error prone.

Also: it does not make sense to move the locking in this series from the fs
driver back to the main I/O thread. Atomicity must be retained on the driver
side, not in the top half.

Best regards,
Christian Schoenebeck





* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-06-30 18:00           ` Christian Schoenebeck
@ 2020-07-01 10:09             ` Greg Kurz
  2020-07-01 11:47               ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-07-01 10:09 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Tue, 30 Jun 2020 20:00:08 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Dienstag, 30. Juni 2020 18:39:57 CEST Greg Kurz wrote:
> > On Tue, 30 Jun 2020 17:16:40 +0200
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Montag, 29. Juni 2020 18:39:02 CEST Greg Kurz wrote:
> > > > On Wed, 03 Jun 2020 19:16:08 +0200
> > > > 
> > > > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > > > On Sonntag, 19. April 2020 17:06:17 CEST Christian Schoenebeck wrote:
> > > > > > Make top half really top half and bottom half really bottom half:
> > > > > > 
> > > > > > Each T_readdir request handling is hopping between threads (main
> > > > > > I/O thread and background I/O driver threads) several times for
> > > > > > every individual directory entry, which sums up to huge latencies
> > > > > > for handling just a single T_readdir request.
> > > > > > 
> > > > > > Instead of doing that, collect now all required directory entries
> > > > > > (including all potentially required stat buffers for each entry) in
> > > > > > one rush on a background I/O thread from fs driver by calling the
> > > > > > previously added function v9fs_co_readdir_many() instead of
> > > > > > v9fs_co_readdir(), then assemble the entire resulting network
> > > > > > response message for the readdir request on main I/O thread. The
> > > > > > fs driver is still aborting the directory entry retrieval loop
> > > > > > (on the background I/O thread inside of v9fs_co_readdir_many())
> > > > > > as soon as it would exceed the client's requested maximum R_readdir
> > > > > > response size. So this will not introduce a performance penalty on
> > > > > > another end.
> > > > > > 
> > > > > > Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > > > ---
> > > > > > 
> > > > > >  hw/9pfs/9p.c | 122
> > > > > >  +++++++++++++++++++++++----------------------------
> > > > > >  1 file changed, 55 insertions(+), 67 deletions(-)
> > > > > 
> > > > > Ping. Anybody?
> > > > > 
> > > > > I would like to roll out this patch set soon and this is the only
> > > > > patch in
> > > > > this series missing a review yet.
> > > > 
> > > > Hi Christian,
> > > 
> > > Hi Greg,
> > > 
> > > > Sorry for getting back to this only now :-\
> > > > 
> > > > So I still have some concerns about the locking of the directory stream
> > > > pointer a fid. It was initially introduced to avoid concurrent accesses
> > > > by multiple threads to the corresponding internal glibc object, as
> > > > recommended in the readdir(3) manual page. Now, this patch considerably
> > > > extends the critical section to also contain calls to telldir() and all
> > > > the _many_ readdir()... so I'm not sure exactly what's the purpose of
> > > > that mutex right now. Please provide more details.
> > > 
> > > The intention of this lock is to prevent concurrent r/w (i.e. telldir()/
> > > readdir()) of the dir stream position by two or more active readdir
> > > requests handled in parallel, provided that they would use the same FID.
> > > Due to the latter requirement for this to become relevant, we already
> > > agreed that this is not something that would usually happen with a Linux
> > > 9p client, but there is nothing from protocol point of view that would
> > > prohibit this to be done by a client, so the resulting undefined
> > > behaviour should be prevented, which this lock does.
> > 
> > Makes sense. The dir stream position is no different from glibc's internal
> > dirent in that respect: it shouldn't be used by concurrent threads.
> 
> Exactly, it is a conceptual issue per se in general, independent of whether 
> glibc is used and independent of the fs driver.
> 
> > > What's your precise concern?
> > 
> > My overall concern is still the same. The patches are big and I've
> > been away for too long, so I need some more discussion to reassemble
> > my thoughts and the puzzle :)
> 
> Np, if you have questions, just come along.
> 
> > But now that I'm starting to understand why it's a good thing to
> > extend the critical section, I realize it should be extended
> > even more: we also have calls to seekdir() and rewindir() in
> > v9fs_readdir() and friends that could _theoretically_ cause the
> > very same kind of undefined behavior you're mentioning.
> 
> You are talking about the "old" code now. Yes, it is wrong and I did not 

No, I'm talking about code that isn't changed by this series:

    if (initial_offset == 0) {
        v9fs_co_rewinddir(pdu, fidp);
    } else {
        v9fs_co_seekdir(pdu, fidp, initial_offset);
    }
    count = v9fs_do_readdir(pdu, fidp, max_count);

Leaving these outside the critical section seems to negate
your arguments... please clarify.

> bother to fix the "old" implementation, since this optimized readdir 
> implementation fixed this issue as well in one rush, but also note ...
> 
> > I think the change is important enough that it deserves its
> > own patch. I expect that moving the locking to the top-level
> > handler might also simplify the existing code, and thus your
> > series as well.
> 
> ... please reconsider if you really want to open that box. The "old" readdir 
> implementation on main thread is inherently wrong, and handling the lock there 
> and error pathes is extremely cumbersome and error prone.
> 

There are indeed several issues with the current readdir:

1) potential inconsistency on concurrent Treaddir requests

2) excessive dispatching to worker threads

So we agreed that 1) should never happen with a regular Linux
client (we could even dump the lock actually) but we want to
make it clear in the code anyway that actions on the directory
stream are serialized.

2) basically means moving some logic from the frontend to the
backend, i.e. called from v9fs_co_run_in_worker(). This implies
that a very long request (i.e. one that would span many calls
to readdir()) cannot be interrupted by the client anymore, as
opposed to what we have now, BTW.

I tend to think that addressing both issues in one "rush" is
just as "error prone".

> Also: it does not make sense to move locking on this series from fs driver 
> back to main I/O thread. Atomicity must retain on driver side, not on top 
> half.
> 

Then the whole seekdir/rewinddir/telldir/readdir sequence should
be called with a single v9fs_co_run_in_worker() invocation, in
which case the locking could just be something like:

int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
                                      struct V9fsDirEnt **entries,
                                      int32_t maxsize, bool dostat)
{
    int err = 0;

    if (v9fs_request_cancelled(pdu)) {
        return -EINTR;
    }

    v9fs_readdir_lock(&fidp->fs.dir);

    v9fs_co_run_in_worker({
        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
    });

    v9fs_readdir_unlock(&fidp->fs.dir);
    return err;
}

This would technically leave the locking in the main I/O thread,
but it isn't conceptually different from locking at the beginning
of do_readdir_many() and unlocking before returning. My concern
is that I don't know how coroutine mutexes behave with multiple
worker threads...

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg



* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-01 10:09             ` Greg Kurz
@ 2020-07-01 11:47               ` Christian Schoenebeck
  2020-07-01 15:12                 ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-01 11:47 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Mittwoch, 1. Juli 2020 12:09:24 CEST Greg Kurz wrote:
> No I'm talking about code that isn't changed by this series:
> 
>     if (initial_offset == 0) {
>         v9fs_co_rewinddir(pdu, fidp);
>     } else {
>         v9fs_co_seekdir(pdu, fidp, initial_offset);
>     }
>     count = v9fs_do_readdir(pdu, fidp, max_count);
> 
> Leaving these outside the critical section seems to negate
> your arguments... please clarify.

Yeah, I admit I have neglected this issue, as concurrent readdir requests with
the same FID have always been a somewhat theoretical issue. But yes, you are
right, that should be addressed by:

1. entirely removing the rewinddir/seekdir code here

2. passing initial_offset to v9fs_do_readdir(), which in turn would
   pass it to readdir_many()

3. readdir_many() would handle rewinddir/seekdir exclusively in its critical
   section (see the sketch below).
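
Roughly like this (sketch only; the initial_offset parameter is the planned
addition, the collection loop is elided, and the rest follows the
readdir_many() code of this series):

static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
                           struct V9fsDirEnt **entries, off_t initial_offset,
                           int32_t maxsize, bool dostat)
{
    V9fsState *s = pdu->s;
    int err = 0;

    v9fs_readdir_lock(&fidp->fs.dir);

    /* rewind/seek now happens inside the same critical section ... */
    if (initial_offset == 0) {
        s->ops->rewinddir(&s->ctx, &fidp->fs);
    } else {
        s->ops->seekdir(&s->ctx, &fidp->fs, initial_offset);
    }

    /* ... as the existing telldir()/readdir() collection loop */

    v9fs_readdir_unlock(&fidp->fs.dir);
    return err;
}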

> There are indeed several issues with the current readdir:
> 
> 1) potential inconsistency on concurrent Treaddir requests
> 
> 2) excessive dispatching to worker threads
> 
> So we agreed that 1) should never happen with a regular linux
> client (we could even dump the lock actually) but we want to
> make it clear in the code anyway that actions on the directory
> stream are serialized.

My point is that you are trying to fix a merely theoretical issue in code
that's conceptually, inherently wrong and dead-end code in the first place.
Top half code should always be designed to be as thin as possible. The old
readdir code, though, is the complete opposite.

> 2) basically means moving some logic from the frontend to the
> backend, ie. called from v9fs_co_run_in_worker(). This implies
> that a very long request (ie. one that would span on many calls
> to readdir()) cannot be interrupted by the client anymore, as
> opposed to what we have now BTW.

3) The complexity of the old readdir code is so much higher compared to the
   new one that the probability of additional issues is also significantly
   higher.

> I tend to think that addressing both issues in one "rush" is
> as much "error prone".

No, it's not. The optimized readdir implementation is quantifiably,
significantly less complex than the old implementation (i.e. a significantly
smaller number of branches and error paths, and a deterministic, clear
separation of thread task assignments, which includes a much smaller number of
thread hops). Less complexity and increased determinism consequently mean a
reduced chance of misbehaviour. So that's not a subjective statement, but
rather a quantifiable, provable one.

You can't switch from the old (wrong) concept to the new concept without a
certain minimum amount of change, which, yes, is not small, but I don't see
any way to make the change set smaller without introducing new negative side
effects.

I am now using words that I have heard from your side before: please be
realistic. Currently, manpower on the 9p code is extremely limited, to put it
mildly. Yes, we could spend time on fixing this (theoretical) issue in the old
readdir code. But what would it buy? Due to the limited manpower we can only
move forward with compromises; in this case you are fighting for code that's
DOA. The only thing you achieve by still sticking to the old readdir code is
preventing us from moving forward at all. Remember: I originally sent this
patch almost 7 months ago.

> > Also: it does not make sense to move locking on this series from fs driver
> > back to main I/O thread. Atomicity must retain on driver side, not on top
> > half.
> 
> Then the whole seekdir/rewinddir/telldir/readdir sequence should
> be called with a single v9fs_co_run_in_worker() invocation, in
> which case the locking could just be something like:
> 
> int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
>                                       struct V9fsDirEnt **entries,
>                                       int32_t maxsize, bool dostat)
> {
>     int err = 0;
> 
>     if (v9fs_request_cancelled(pdu)) {
>         return -EINTR;
>     }
> 
>     v9fs_readdir_lock(&fidp->fs.dir);
> 
>     v9fs_co_run_in_worker({
>         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
>     });
> 
>     v9fs_readdir_unlock(&fidp->fs.dir);
>     return err;
> }

That's exactly what should be prevented. The lock must be on the driver thread
side, not on the main thread side. The goal is to reduce the time slice of
individual locks. If the lock is held on the main thread, the time slice of a
lock (even on huge directories with thousands of entries) is larger by
multiple factors than if the lock is held on the driver thread side. So that's
not just a few percent or so; it is huge.
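
To illustrate the difference, a rough sketch of what I mean (with
'read_dir_entries' being a made-up name for the plain readdir() loop):

/* runs on a driver (worker) thread, not on the main I/O thread */
static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
                           struct V9fsDirEnt **entries,
                           int32_t maxsize, bool dostat)
{
    int err;

    /* the lock is taken and released on the driver thread, so it is only
     * held while the entries are actually being read, not for the whole
     * round trip through the coroutine machinery */
    v9fs_readdir_lock(&fidp->fs.dir);
    err = read_dir_entries(pdu, fidp, entries, maxsize, dostat);
    v9fs_readdir_unlock(&fidp->fs.dir);

    return err;
}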

> This would technically leave the locking in the main I/O thread,
> but it isn't conceptually different from locking at the beginning
> of do_readdir_lock() and unlocking before returning. My concern
> is that I don't know how coroutine mutexes behave with multiple
> worker threads...

Well, your Mutex -> CoMutex change was intended to fix a deadlock with the old 
readdir implementation. That deadlock could not happen with the new readdir 
implementation, which suggests that it probably makes sense to revert this 
change (i.e. CoMutex -> Mutex) for this new implementation.
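
From memory, that revert would roughly boil down to something like this
(sketch only; the actual field and helper names might differ):

typedef struct V9fsDir {
    DIR *stream;
    QemuMutex readdir_mutex;     /* was: CoMutex */
} V9fsDir;

static inline void v9fs_readdir_lock(V9fsDir *dir)
{
    qemu_mutex_lock(&dir->readdir_mutex);    /* was: qemu_co_mutex_lock() */
}

static inline void v9fs_readdir_unlock(V9fsDir *dir)
{
    qemu_mutex_unlock(&dir->readdir_mutex);  /* was: qemu_co_mutex_unlock() */
}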

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-01 11:47               ` Christian Schoenebeck
@ 2020-07-01 15:12                 ` Greg Kurz
  2020-07-02 11:43                   ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-07-01 15:12 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Wed, 01 Jul 2020 13:47:12 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Mittwoch, 1. Juli 2020 12:09:24 CEST Greg Kurz wrote:
> > No I'm talking about code that isn't changed by this series:
> > 
> >     if (initial_offset == 0) {
> >         v9fs_co_rewinddir(pdu, fidp);
> >     } else {
> >         v9fs_co_seekdir(pdu, fidp, initial_offset);
> >     }
> >     count = v9fs_do_readdir(pdu, fidp, max_count);
> > 
> > Leaving these outside the critical section seems to negate
> > your arguments... please clarify.
> 
> Yeah, I admit I have neglected this issue, as concurrent readdir requests with 
> same FID always was some kind of theoretical issue. But yes, you are right, 

It's perfectly fine to miss things, that's what reviews are made for :)

> that should be addressed by 
> 
> 1. entirely removing the rewinddir/seekdir code here
> 
> 2. passing initial_offset to v9fs_do_readdir(), which in turn would
>    pass it to readdir_many()
> 
> 3. readdir_many() would handle rewinddir/seekdir exclusively in its crticial
>    section.
> 

Sounds good. v7 ?

> > There are indeed several issues with the current readdir:
> > 
> > 1) potential inconsistency on concurrent Treaddir requests
> > 
> > 2) excessive dispatching to worker threads
> > 
> > So we agreed that 1) should never happen with a regular linux
> > client (we could even dump the lock actually) but we want to
> > make it clear in the code anyway that actions on the directory
> > stream are serialized.
> 
> My point is you are trying to fix a merely thereotical issue on code that's 
> conceptually, inherently wrong and dead end code at first place. Top half code 

I'm not trying to fix anything. We already have locking in place to
deal with this theoretical issue and it interferes with your changes.
I don't care that much if a silly guest shoots itself in the foot
actually, so it'll be fine with me if you just dump the lock, as
long as it doesn't cause QEMU to hang or crash.

> should always be designed to be as thin as possible. The old readdir code 
> though is the complete opposite.
> 

It isn't just readdir; most requests span multiple v9fs_co_*() calls...

> > 2) basically means moving some logic from the frontend to the
> > backend, ie. called from v9fs_co_run_in_worker(). This implies
> > that a very long request (ie. one that would span on many calls
> > to readdir()) cannot be interrupted by the client anymore, as
> > opposed to what we have now BTW.
> 

... most likely to allow clients to interrupt a long request with a
Tflush. Have you considered this aspect in your changes ?

> 3) Complexity of old readdir code is so much bigger compared to the new one
>    such that probability of additional issues is also significantly higher.
> 
> > I tend to think that addressing both issues in one "rush" is
> > as much "error prone".
> 
> No it's not. The optimized readdir implementation is quantifyable, 
> significantly less complex than the old implementation (i.e. significantly 
> smaller amount of branches and error pathes, determenistic clear separation of 
> thread's task assignments which includes much smaller amount of thread hops). 
> Less complexity and increased determinism consequently means reduced chance of 
> misbehaviours. So that's not a subjective, but rather a quantifyable, proven 
> statement.
> 
> You can't switch from the old (wrong) concept to the new concept without some 
> minimum amount of changes, which yes are not small, but I don't see any way to 
> make the change set smaller without yet introducing new negative side effects.
> 
> I am now using words that I heard from your side before: please be realistic. 
> Currently man power on 9p code is extremely limited to put it mildly. Yes, we 
> could spend time on fixing this (theoretical) issue on the old readdir could. 
> But what would it buy? Due to the limited man power we can only move forward 
> with compromises; in this case you are fighting for code that's DOA. The only 
> thing that you achieve by still sticking to the old readdir code is you are 
> preventing that we move forward at all. Remember: I originally sent this patch 
> almost 7 months ago.
> 
> > > Also: it does not make sense to move locking on this series from fs driver
> > > back to main I/O thread. Atomicity must retain on driver side, not on top
> > > half.
> > 
> > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > be called with a single v9fs_co_run_in_worker() invocation, in
> > which case the locking could just be something like:
> > 
> > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> >                                       struct V9fsDirEnt **entries,
> >                                       int32_t maxsize, bool dostat)
> > {
> >     int err = 0;
> > 
> >     if (v9fs_request_cancelled(pdu)) {
> >         return -EINTR;
> >     }
> > 
> >     v9fs_readdir_lock(&fidp->fs.dir);
> > 
> >     v9fs_co_run_in_worker({
> >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> >     });
> > 
> >     v9fs_readdir_unlock(&fidp->fs.dir);
> >     return err;
> > }
> 
> That's exactly what should be prevented. The lock must be on driver thread 
> side, not on main thread side. The goal is to reduce the time slice of 
> individual locks. If the lock is on main thread, the time slice of a lock 
> (even on huge directories with thousands of entries) is multiple factors 
> larger than if the lock is on driver thread side. So that's not just few 
> percent or so, it is huge.
> 

Sorry I don't get it...  In a contention-less situation, which is the
case we really care for, qemu_co_mutex_lock() has no overhead.

> > This would technically leave the locking in the main I/O thread,
> > but it isn't conceptually different from locking at the beginning
> > of do_readdir_lock() and unlocking before returning. My concern
> > is that I don't know how coroutine mutexes behave with multiple
> > worker threads...
> 
> Well, your Mutex -> CoMutex change was intended to fix a deadlock with the old 
> readdir implementation. That deadlock could not happen with the new readdir 
> implementation, which suggests that it probably makes sense to revert this 
> change (i.e. CoMutex -> Mutex) for this new implementation.
> 

No we can't, because it is also used with 9p2000.u, which you said you
don't want to fix.

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-01 15:12                 ` Greg Kurz
@ 2020-07-02 11:43                   ` Christian Schoenebeck
  2020-07-02 15:35                     ` Greg Kurz
  0 siblings, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-02 11:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Mittwoch, 1. Juli 2020 17:12:40 CEST Greg Kurz wrote:
> On Wed, 01 Jul 2020 13:47:12 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > On Mittwoch, 1. Juli 2020 12:09:24 CEST Greg Kurz wrote:
> > > No I'm talking about code that isn't changed by this series:
> > >     if (initial_offset == 0) {
> > >         v9fs_co_rewinddir(pdu, fidp);
> > >     } else {
> > >         v9fs_co_seekdir(pdu, fidp, initial_offset);
> > >     }
> > >     count = v9fs_do_readdir(pdu, fidp, max_count);
> > > 
> > > Leaving these outside the critical section seems to negate
> > > your arguments... please clarify.
> > 
> > Yeah, I admit I have neglected this issue, as concurrent readdir requests
> > with same FID always was some kind of theoretical issue. But yes, you are
> > right,
> It's perfectly fine to miss things, that's what reviews are made for :)
> 
> > that should be addressed by
> > 
> > 1. entirely removing the rewinddir/seekdir code here
> > 
> > 2. passing initial_offset to v9fs_do_readdir(), which in turn would
> > 
> >    pass it to readdir_many()
> > 
> > 3. readdir_many() would handle rewinddir/seekdir exclusively in its
> > crticial> 
> >    section.
> 
> Sounds good. v7 ?

Sure, that's not the problem; I can of course repost this with that handled
appropriately.

But would you please finalize your picture of this patch set first? I would
really like to (try to) finally nail this thing with the next version.

> > > There are indeed several issues with the current readdir:
> > > 
> > > 1) potential inconsistency on concurrent Treaddir requests
> > > 
> > > 2) excessive dispatching to worker threads
> > > 
> > > So we agreed that 1) should never happen with a regular linux
> > > client (we could even dump the lock actually) but we want to
> > > make it clear in the code anyway that actions on the directory
> > > stream are serialized.
> > 
> > My point is you are trying to fix a merely thereotical issue on code
> > that's
> > conceptually, inherently wrong and dead end code at first place. Top half
> > code
> I'm not trying to fix anything. We already have locking in place to
> deal with this theoretical issue and it interferes with your changes.
> I don't care that much if a silly guest shoots itself in the foot
> actually, so it'll be fine with me if you just dump the lock, as
> long as it doesn't cause QEMU to hang or crash.

Ah ok, I understood you as if you wanted to fix more details of the old
readdir code, which would have been a clear blocker for this patch set to
proceed. Good then.

> > should always be designed to be as thin as possible. The old readdir code
> > though is the complete opposite.
> 
> It isn't readdir only, most requests span over multiple v9fs_co_*() calls...

Right, I know! And that's actually my root motivation to finally bring this
patch set forward, since I am very aware that this patch set is just a small
brick in the overall process of fixing similarly affected code portions. So
yes, it's my plan to fix them with upcoming patch sets, too.

Having said that, could we perhaps try to make future reviews a bit more
efficient in certain respects? For instance, it would help a lot if a patch
set were reviewed entirely, rather than being stopped at the very first issue
spotted and then suspended until ++version, as this creates large latencies
in the overall review process.

And likewise it would also help if reviews prioritized relevant behavioural
aspects (concept, design) first, before arguing about code style details like
variable names or other details invisible to users.

And finally: compromises. As manpower on 9p is very limited, it would make
sense to limit patch sets to a certain extent and agree to postpone certain
non-critical issues to subsequent patches (or patch sets), otherwise a patch
set will grow and grow and it will take ages to move forward.

> > > 2) basically means moving some logic from the frontend to the
> > > backend, ie. called from v9fs_co_run_in_worker(). This implies
> > > that a very long request (ie. one that would span on many calls
> > > to readdir()) cannot be interrupted by the client anymore, as
> > > opposed to what we have now BTW.
> 
> ... most likely to allow clients to interrupt a long request with a
> Tflush. Have you considered this aspect in your changes ?

In this particular patch set, no, as for readdir this should not be an issue
in practice for anybody. After this patch set is applied, even on huge
directories, the fs driver's duration for readdir would barely be measurable.
In fact the server's latency would still always be much higher.

But also for the upcoming, planned patch sets (i.e. other request types): that
would be an example where I would ask you to lower your expectations a bit,
and where we could agree to postpone such an aspect to a subsequent, separate
patch (or patch set).

> > 3) Complexity of old readdir code is so much bigger compared to the new
> > one
> > 
> >    such that probability of additional issues is also significantly
> >    higher.
> > > 
> > > I tend to think that addressing both issues in one "rush" is
> > > as much "error prone".
> > 
> > No it's not. The optimized readdir implementation is quantifyable,
> > significantly less complex than the old implementation (i.e. significantly
> > smaller amount of branches and error pathes, determenistic clear
> > separation of thread's task assignments which includes much smaller
> > amount of thread hops). Less complexity and increased determinism
> > consequently means reduced chance of misbehaviours. So that's not a
> > subjective, but rather a quantifyable, proven statement.
> > 
> > You can't switch from the old (wrong) concept to the new concept without
> > some minimum amount of changes, which yes are not small, but I don't see
> > any way to make the change set smaller without yet introducing new
> > negative side effects.
> > 
> > I am now using words that I heard from your side before: please be
> > realistic. Currently man power on 9p code is extremely limited to put it
> > mildly. Yes, we could spend time on fixing this (theoretical) issue on
> > the old readdir could. But what would it buy? Due to the limited man
> > power we can only move forward with compromises; in this case you are
> > fighting for code that's DOA. The only thing that you achieve by still
> > sticking to the old readdir code is you are preventing that we move
> > forward at all. Remember: I originally sent this patch almost 7 months
> > ago.
> > 
> > > > Also: it does not make sense to move locking on this series from fs
> > > > driver
> > > > back to main I/O thread. Atomicity must retain on driver side, not on
> > > > top
> > > > half.
> > > 
> > > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > > be called with a single v9fs_co_run_in_worker() invocation, in
> > > which case the locking could just be something like:
> > > 
> > > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > > 
> > >                                       struct V9fsDirEnt **entries,
> > >                                       int32_t maxsize, bool dostat)
> > > 
> > > {
> > > 
> > >     int err = 0;
> > >     
> > >     if (v9fs_request_cancelled(pdu)) {
> > >     
> > >         return -EINTR;
> > >     
> > >     }
> > >     
> > >     v9fs_readdir_lock(&fidp->fs.dir);
> > >     
> > >     v9fs_co_run_in_worker({
> > >     
> > >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> > >     
> > >     });
> > >     
> > >     v9fs_readdir_unlock(&fidp->fs.dir);
> > >     return err;
> > > 
> > > }
> > 
> > That's exactly what should be prevented. The lock must be on driver thread
> > side, not on main thread side. The goal is to reduce the time slice of
> > individual locks. If the lock is on main thread, the time slice of a lock
> > (even on huge directories with thousands of entries) is multiple factors
> > larger than if the lock is on driver thread side. So that's not just few
> > percent or so, it is huge.
> 
> Sorry I don't get it...  In a contention-less situation, which is the
> case we really care for, qemu_co_mutex_lock() has no overhead.

Yes, it only kicks in if there is concurrency.

> > > This would technically leave the locking in the main I/O thread,
> > > but it isn't conceptually different from locking at the beginning
> > > of do_readdir_lock() and unlocking before returning. My concern
> > > is that I don't know how coroutine mutexes behave with multiple
> > > worker threads...
> > 
> > Well, your Mutex -> CoMutex change was intended to fix a deadlock with the
> > old readdir implementation. That deadlock could not happen with the new
> > readdir implementation, which suggests that it probably makes sense to
> > revert this change (i.e. CoMutex -> Mutex) for this new implementation.
> 
> No we can't because it is also used with 9p2000.u, that you said you
> don't want to fix.

Yes and no, I did not say I don't want to fix readdir for 9p2000.u at all.
What I said was that I want to prioritize and concentrate on the most relevant
issues first. 9p2000.L is the most commonly used protocol variant, so I would
like to fix the most severe (e.g. performance) issues for 9p2000.L first
before spending time on 9p2000.u, which AFAICS is barely used in comparison,
which I personally don't use at all, and which I am hence not testing to the
same degree and cannot serve with the same quality as 9p2000.L right now.

Plus, if there really are users who care about 9p2000.u, I can gladly assist
them in addressing these issues for 9p2000.u as well, provided that they at
least help with testing the required 9p2000.u changes.

Back to the actual topic: so what do we do about the mutex then? CoMutex for 
9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it would just 
be a transitional measure.
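
Just to make sure we are talking about the same thing, a rough sketch of that
transitional measure (made-up helper and field names; assumes both lock types
would live in V9fsDir for the transition period):

static inline void v9fs_readdir_lock(V9fsState *s, V9fsDir *dir)
{
    if (s->proto_version == V9FS_PROTO_2000L) {
        /* new readdir_many path: plain mutex, taken on the worker thread */
        qemu_mutex_lock(&dir->readdir_mutex);
    } else {
        /* old 9p2000.u readdir path: keep the CoMutex for now */
        qemu_co_mutex_lock(&dir->readdir_comutex);
    }
}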

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-02 11:43                   ` Christian Schoenebeck
@ 2020-07-02 15:35                     ` Greg Kurz
  2020-07-02 17:23                       ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-07-02 15:35 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Thu, 02 Jul 2020 13:43:11 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Mittwoch, 1. Juli 2020 17:12:40 CEST Greg Kurz wrote:
> > On Wed, 01 Jul 2020 13:47:12 +0200
> > 
> > Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> > > On Mittwoch, 1. Juli 2020 12:09:24 CEST Greg Kurz wrote:
> > > > No I'm talking about code that isn't changed by this series:
> > > >     if (initial_offset == 0) {
> > > >         v9fs_co_rewinddir(pdu, fidp);
> > > >     } else {
> > > >         v9fs_co_seekdir(pdu, fidp, initial_offset);
> > > >     }
> > > >     count = v9fs_do_readdir(pdu, fidp, max_count);
> > > > 
> > > > Leaving these outside the critical section seems to negate
> > > > your arguments... please clarify.
> > > 
> > > Yeah, I admit I have neglected this issue, as concurrent readdir requests
> > > with same FID always was some kind of theoretical issue. But yes, you are
> > > right,
> > It's perfectly fine to miss things, that's what reviews are made for :)
> > 
> > > that should be addressed by
> > > 
> > > 1. entirely removing the rewinddir/seekdir code here
> > > 
> > > 2. passing initial_offset to v9fs_do_readdir(), which in turn would
> > > 
> > >    pass it to readdir_many()
> > > 
> > > 3. readdir_many() would handle rewinddir/seekdir exclusively in its
> > > crticial> 
> > >    section.
> > 
> > Sounds good. v7 ?
> 
> Sure, that's not the problem, I can repost this handled appropriately of 
> course.
> 
> But would you please finalize your picture of this patch set first? I would 
> really (try) to finally nail this thing with the next version.
> 

I'll do my best.

> > > > There are indeed several issues with the current readdir:
> > > > 
> > > > 1) potential inconsistency on concurrent Treaddir requests
> > > > 
> > > > 2) excessive dispatching to worker threads
> > > > 
> > > > So we agreed that 1) should never happen with a regular linux
> > > > client (we could even dump the lock actually) but we want to
> > > > make it clear in the code anyway that actions on the directory
> > > > stream are serialized.
> > > 
> > > My point is you are trying to fix a merely thereotical issue on code
> > > that's
> > > conceptually, inherently wrong and dead end code at first place. Top half
> > > code
> > I'm not trying to fix anything. We already have locking in place to
> > deal with this theoretical issue and it interferes with your changes.
> > I don't care that much if a silly guest shoots itself in the foot
> > actually, so it'll be fine with me if you just dump the lock, as
> > long as it doesn't cause QEMU to hang or crash.
> 
> Ah ok, I got you as if you wanted to fix more details on the old readdir code, 
> which would be a clear blocker for this patch set to proceed. Good then.
> 
> > > should always be designed to be as thin as possible. The old readdir code
> > > though is the complete opposite.
> > 
> > It isn't readdir only, most requests span over multiple v9fs_co_*() calls...
> 
> Right, I know! And that's actually my root motivation to finally bring this 
> patch set forward, since I am very aware that this patch set is just a small 
> brick in the overall procecss of fixing similarly affected code portions. So 
> yes, that's my plan to fix them with upcoming patch sets, too.
> 
> Having said that, could we probably try to make future reviews a bit more 
> efficient at certain aspects? For instance it would help a lot if the patch 
> set was reviewed entirely, and not stopped at the very first issue spotted and 
> then suspended to ++version, as this creates large latencies in the overall 
> review process?
> 

Review of 9pfs is a best effort thing... I usually stop reviewing when I'm fed
up or when all the time I could reasonably invest has elapsed. Breaking the
series down into smaller patches is the usual improvement for that.

> And likewise it would also help if review is prioritized on relevant behaviour 
> aspects (concept, design) first before arguing about code style details like 
> variable names or other details invisible to users.
> 

I don't remember questioning the overall concept behind these changes
because it looks reasonable enough (even if I would appreciate being able
to verify them with a working reproducer).

Even if it is invisible to the users, coding style or how a series is
broken down in patches is important for developers.

> And finally: compromises. As man power on 9p is very limited, it would make 
> sense to limit patch sets at a certain extent and agree to postpone certain 
> non-critical issues to subsequent patches (or sets of), otherwise a patch set 
> will grow and grow and it will take ages to get forward.
> 

FWIW this patchset was more than 10 patches and now it is only 5 :)

The good news is that you'll soon be able to merge things by yourself.
I'll help you when I can but I won't be the gating factor anymore.

Hurrah !

> > > > 2) basically means moving some logic from the frontend to the
> > > > backend, ie. called from v9fs_co_run_in_worker(). This implies
> > > > that a very long request (ie. one that would span on many calls
> > > > to readdir()) cannot be interrupted by the client anymore, as
> > > > opposed to what we have now BTW.
> > 
> > ... most likely to allow clients to interrupt a long request with a
> > Tflush. Have you considered this aspect in your changes ?
> 
> In this particular patch set, no, as for readdir this should not be an issue 
> in practice for anybody. After this patch set is applied, even on huge 
> directories, the fs driver's duration for readdir would barely be measurable. 
> In fact the server's latency would always be much higher, yet.
> 

Reproducer ? Numbers ? ;)

> But also for the upcoming, planned patch sets (i.e. other request types): That 
> would be an example where I would ask you to lower you expectations a bit and 
> that we could agree to postpone such an aspect to a subsequent, separate patch 
> (or set of).
> 
> > > 3) Complexity of old readdir code is so much bigger compared to the new
> > > one
> > > 
> > >    such that probability of additional issues is also significantly
> > >    higher.
> > > > 
> > > > I tend to think that addressing both issues in one "rush" is
> > > > as much "error prone".
> > > 
> > > No it's not. The optimized readdir implementation is quantifyable,
> > > significantly less complex than the old implementation (i.e. significantly
> > > smaller amount of branches and error pathes, determenistic clear
> > > separation of thread's task assignments which includes much smaller
> > > amount of thread hops). Less complexity and increased determinism
> > > consequently means reduced chance of misbehaviours. So that's not a
> > > subjective, but rather a quantifyable, proven statement.
> > > 
> > > You can't switch from the old (wrong) concept to the new concept without
> > > some minimum amount of changes, which yes are not small, but I don't see
> > > any way to make the change set smaller without yet introducing new
> > > negative side effects.
> > > 
> > > I am now using words that I heard from your side before: please be
> > > realistic. Currently man power on 9p code is extremely limited to put it
> > > mildly. Yes, we could spend time on fixing this (theoretical) issue on
> > > the old readdir could. But what would it buy? Due to the limited man
> > > power we can only move forward with compromises; in this case you are
> > > fighting for code that's DOA. The only thing that you achieve by still
> > > sticking to the old readdir code is you are preventing that we move
> > > forward at all. Remember: I originally sent this patch almost 7 months
> > > ago.
> > > 
> > > > > Also: it does not make sense to move locking on this series from fs
> > > > > driver
> > > > > back to main I/O thread. Atomicity must retain on driver side, not on
> > > > > top
> > > > > half.
> > > > 
> > > > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > > > be called with a single v9fs_co_run_in_worker() invocation, in
> > > > which case the locking could just be something like:
> > > > 
> > > > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp,
> > > > 
> > > >                                       struct V9fsDirEnt **entries,
> > > >                                       int32_t maxsize, bool dostat)
> > > > 
> > > > {
> > > > 
> > > >     int err = 0;
> > > >     
> > > >     if (v9fs_request_cancelled(pdu)) {
> > > >     
> > > >         return -EINTR;
> > > >     
> > > >     }
> > > >     
> > > >     v9fs_readdir_lock(&fidp->fs.dir);
> > > >     
> > > >     v9fs_co_run_in_worker({
> > > >     
> > > >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> > > >     
> > > >     });
> > > >     
> > > >     v9fs_readdir_unlock(&fidp->fs.dir);
> > > >     return err;
> > > > 
> > > > }
> > > 
> > > That's exactly what should be prevented. The lock must be on driver thread
> > > side, not on main thread side. The goal is to reduce the time slice of
> > > individual locks. If the lock is on main thread, the time slice of a lock
> > > (even on huge directories with thousands of entries) is multiple factors
> > > larger than if the lock is on driver thread side. So that's not just few
> > > percent or so, it is huge.
> > 
> > Sorry I don't get it...  In a contention-less situation, which is the
> > case we really care for, qemu_co_mutex_lock() has no overhead.
> 
> Yes, it only kicks in if there is concurrency.
> 

And we don't care to preserve performance for silly clients, do we ?

> > > > This would technically leave the locking in the main I/O thread,
> > > > but it isn't conceptually different from locking at the beginning
> > > > of do_readdir_lock() and unlocking before returning. My concern
> > > > is that I don't know how coroutine mutexes behave with multiple
> > > > worker threads...
> > > 
> > > Well, your Mutex -> CoMutex change was intended to fix a deadlock with the
> > > old readdir implementation. That deadlock could not happen with the new
> > > readdir implementation, which suggests that it probably makes sense to
> > > revert this change (i.e. CoMutex -> Mutex) for this new implementation.
> > 
> > No we can't because it is also used with 9p2000.u, that you said you
> > don't want to fix.
> 
> Yes and no, I did not say I don't want to fix readdir for 9p2000.u at all. 
> What I said was I want to prioritize and concentrate on the most relevant 
> issues first. 9p2000.L is the most commonly used protocol variant, so I would 
> like to fix the most severe (e.g. performance) issues for 9p2000.L first 
> before spending time on 9p2000.u which is AFAICS barely used in comparison, 
> which I personally don't use at all, and which I am hence not testing in the 
> same degree and cannot serve with the same quality as 9p2000.L right now.
> 
> Plus if there are really users caring for 9p2000.u, I can gladly assist them 
> to address these issues for 9p2000.u as well, provided that they help at least 
> with testing the required 9p2000.u changes.
> 

I would personally do the opposite... again ;)

Like you say we essentially care for 9p2000.L and we only do limited
support for 9p2000.u. If we have a change that we really want for
9p2000.L but it has an impact on 9p2000.u because of shared code,
it is fine to do the changes anyway, including changes to the 9p2000.u
specific code. Very basic testing of 9p2000.u (mount/ls or find/umount)
or maybe running pjd-fstest is enough IMHO. If we break someone's setup
and he complains, then we can ask him to test a fix before we merge it.

> Back to the actual topic: so what do we do about the mutex then? CoMutex for 
> 9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it would just 
> be a transitional measure.
> 

That would ruin my day...

More seriously, the recent fix for a deadlock condition that was present
for years proves that nobody seems to be using silly clients with QEMU.
So I think we should just dump the lock and add a one-time warning in
the top level handlers when we detect a duplicate readdir request on
the same fid. This should be a very simple patch (I can take care of
it right away).
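
Something along these lines (untested sketch; 'readdir_in_flight' would be a
new, made-up counter in V9fsFidState, and since all top-half code runs in the
main I/O thread no atomics should be needed for it):

static void readdir_begin(V9fsFidState *fidp)
{
    if (fidp->readdir_in_flight++) {
        warn_report_once("9p: concurrent readdir on the same fid; "
                         "directory results may be inconsistent");
    }
}

static void readdir_end(V9fsFidState *fidp)
{
    fidp->readdir_in_flight--;
}

These would be called at the beginning and end of the top level readdir
handlers.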

> Best regards,
> Christian Schoenebeck
> 
> 

Cheers,

--
Greg


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-02 15:35                     ` Greg Kurz
@ 2020-07-02 17:23                       ` Christian Schoenebeck
  2020-07-03  8:08                         ` Christian Schoenebeck
  2020-07-03 15:53                         ` Greg Kurz
  0 siblings, 2 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-02 17:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Donnerstag, 2. Juli 2020 17:35:00 CEST Greg Kurz wrote:
> > > It isn't readdir only, most requests span over multiple v9fs_co_*()
> > > calls...> 
> > Right, I know! And that's actually my root motivation to finally bring
> > this
> > patch set forward, since I am very aware that this patch set is just a
> > small brick in the overall procecss of fixing similarly affected code
> > portions. So yes, that's my plan to fix them with upcoming patch sets,
> > too.
> > 
> > Having said that, could we probably try to make future reviews a bit more
> > efficient at certain aspects? For instance it would help a lot if the
> > patch
> > set was reviewed entirely, and not stopped at the very first issue spotted
> > and then suspended to ++version, as this creates large latencies in the
> > overall review process?
> 
> Review of 9pfs is a best effort thing... I usually stop reviewing when I'm
> fed up or when all the time I could reasonably invest is elapsed. Breaking
> down the series in smaller patches is the usual improvement for that.

No need to be defensive; you do these things voluntarily, and I am glad that
you spend time on this project at all. I would appreciate it though if we
could reduce the overall turnaround time for a patch set a bit, and my
suggestion of trying to walk through an entire set before re-posting a new
version might indeed bring an improvement without anybody having to invest
more time, but rather less.

> > And likewise it would also help if review is prioritized on relevant
> > behaviour aspects (concept, design) first before arguing about code style
> > details like variable names or other details invisible to users.
> 
> I don't remember questioning the overall concept behind these changes
> because it looks reasonable enough (even if I would appreciate to be
> able to verify them with a working reproducer).

What exactly do you mean here by a working reproducer?

> Even if it is invisible to the users, coding style or how a series is
> broken down in patches is important for developers.

Dedication to detail beyond a certain fine-grained level is a luxury, and for
that reason it is reserved for projects which clearly can afford it. On
(sub)projects with low activity and high latency (like this one) it easily
leads to frustration, which is obviously counterproductive.

I'm not saying don't care about code style, details et al., but caring a bit
less about them in reviews would not hurt, so to speak. A bit of (invisible)
tolerance avoids friction and stagnation.

> > And finally: compromises. As man power on 9p is very limited, it would
> > make
> > sense to limit patch sets at a certain extent and agree to postpone
> > certain
> > non-critical issues to subsequent patches (or sets of), otherwise a patch
> > set will grow and grow and it will take ages to get forward.
> 
> FWIW this patchset was more than 10 patches and now it is only 5 :)

Hey, no misleading number crunching please. ;-) Three dropped patches were
never intended to be merged; they were pure benchmarks.

> The good news is that you'll soon be able to merge things by yourself.
> I'll help you when I can but I won't be the gating factor anymore.
> 
> Hurrah !

Yep, I'll do my best to carry on your work. I appreciate what you did and that
you still volunteer to help!

> > > > > 2) basically means moving some logic from the frontend to the
> > > > > backend, ie. called from v9fs_co_run_in_worker(). This implies
> > > > > that a very long request (ie. one that would span on many calls
> > > > > to readdir()) cannot be interrupted by the client anymore, as
> > > > > opposed to what we have now BTW.
> > > 
> > > ... most likely to allow clients to interrupt a long request with a
> > > Tflush. Have you considered this aspect in your changes ?
> > 
> > In this particular patch set, no, as for readdir this should not be an
> > issue in practice for anybody. After this patch set is applied, even on
> > huge directories, the fs driver's duration for readdir would barely be
> > measurable. In fact the server's latency would always be much higher,
> > yet.
> 
> Reproducer ? Numbers ? ;)

Well, you have already seen and run the benchmark in this series yourself. You
can easily hit several hundred ms of server latency, which is very hard to top
with any fs driver reading a directory. Ok, maybe with a tape drive you could,
but that's pretty much it. :) With a 'normal' system the fs driver would
rather block for something between <1 ms and 8 ms, i.e. the fs driver
completes before it would even be able to see a flush request.

But to also make it clear: it would not bite with the new design either. If
someone really saw a necessity to abort readdir requests, that would be
possible without huge changes.

> > > > > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > > > > be called with a single v9fs_co_run_in_worker() invocation, in
> > > > > which case the locking could just be something like:
> > > > > 
> > > > > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState
> > > > > *fidp,
> > > > > 
> > > > >                                       struct V9fsDirEnt **entries,
> > > > >                                       int32_t maxsize, bool dostat)
> > > > > 
> > > > > {
> > > > > 
> > > > >     int err = 0;
> > > > >     
> > > > >     if (v9fs_request_cancelled(pdu)) {
> > > > >     
> > > > >         return -EINTR;
> > > > >     
> > > > >     }
> > > > >     
> > > > >     v9fs_readdir_lock(&fidp->fs.dir);
> > > > >     
> > > > >     v9fs_co_run_in_worker({
> > > > >     
> > > > >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> > > > >     
> > > > >     });
> > > > >     
> > > > >     v9fs_readdir_unlock(&fidp->fs.dir);
> > > > >     return err;
> > > > > 
> > > > > }
> > > > 
> > > > That's exactly what should be prevented. The lock must be on driver
> > > > thread
> > > > side, not on main thread side. The goal is to reduce the time slice of
> > > > individual locks. If the lock is on main thread, the time slice of a
> > > > lock
> > > > (even on huge directories with thousands of entries) is multiple
> > > > factors
> > > > larger than if the lock is on driver thread side. So that's not just
> > > > few
> > > > percent or so, it is huge.
> > > 
> > > Sorry I don't get it...  In a contention-less situation, which is the
> > > case we really care for, qemu_co_mutex_lock() has no overhead.
> > 
> > Yes, it only kicks in if there is concurrency.
> 
> And we don't care to preserve performance for silly clients, do we ?

We don't necessarily need to preserve performance for them, that's right. But 
it is already handled appropriately for them, so destroying it (i.e. slowing 
them down) only makes sense if there is a good reason for that.

> > > > > This would technically leave the locking in the main I/O thread,
> > > > > but it isn't conceptually different from locking at the beginning
> > > > > of do_readdir_lock() and unlocking before returning. My concern
> > > > > is that I don't know how coroutine mutexes behave with multiple
> > > > > worker threads...
> > > > 
> > > > Well, your Mutex -> CoMutex change was intended to fix a deadlock with
> > > > the
> > > > old readdir implementation. That deadlock could not happen with the
> > > > new
> > > > readdir implementation, which suggests that it probably makes sense to
> > > > revert this change (i.e. CoMutex -> Mutex) for this new
> > > > implementation.
> > > 
> > > No we can't because it is also used with 9p2000.u, that you said you
> > > don't want to fix.
> > 
> > Yes and no, I did not say I don't want to fix readdir for 9p2000.u at all.
> > What I said was I want to prioritize and concentrate on the most relevant
> > issues first. 9p2000.L is the most commonly used protocol variant, so I
> > would like to fix the most severe (e.g. performance) issues for 9p2000.L
> > first before spending time on 9p2000.u which is AFAICS barely used in
> > comparison, which I personally don't use at all, and which I am hence not
> > testing in the same degree and cannot serve with the same quality as
> > 9p2000.L right now.
> > 
> > Plus if there are really users caring for 9p2000.u, I can gladly assist
> > them to address these issues for 9p2000.u as well, provided that they
> > help at least with testing the required 9p2000.u changes.
> 
> I would personally do the opposite... again ;)
> 
> Like you say we essentially care for 9p2000.L and we only do limited
> support for 9p2000.u. If we have a change that we really want for
> 9p2000.L but it has an impact on 9p2000.u because of shared code,
> it is fine to do the changes anyway, including changes to the 9p2000.u
> specific code. Very basic testing of 9p2000.u (mount/ls or find/umount)
> or maybe running pjd-fstest is enough IMHO. If we break someone's setup
> and he complains, then we can ask him to test a fix before we merge it.

Well, so you want to handle the relevant 9p2000.u readdir optimizations
yourself, and you would send them as a follow-up patch set (after this set is
through), including test cases?

> > Back to the actual topic: so what do we do about the mutex then? CoMutex
> > for 9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it
> > would just be a transitional measure.
> 
> That would ruin my day...
> 
> More seriously, the recent fix for a deadlock condition that was present
> for years proves that nobody seems to be using silly clients with QEMU.
> So I think we should just dump the lock and add a one-time warning in
> the top level handlers when we detect a duplicate readdir request on
> the same fid. This should be a very simple patch (I can take care of
> it right away).

Looks like we have a consensus! Then I'll wait for your patch removing the
lock, and for your feedback on whether or not you see anything else in this
patch set.

> > Best regards,
> > Christian Schoenebeck
> 
> Cheers,
> 
> --
> Greg

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-02 17:23                       ` Christian Schoenebeck
@ 2020-07-03  8:08                         ` Christian Schoenebeck
  2020-07-03 16:08                           ` Greg Kurz
  2020-07-03 15:53                         ` Greg Kurz
  1 sibling, 1 reply; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-03  8:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Donnerstag, 2. Juli 2020 19:23:35 CEST Christian Schoenebeck wrote:
> > > Back to the actual topic: so what do we do about the mutex then? CoMutex
> > > for 9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it
> > > would just be a transitional measure.
> > 
> > That would ruin my day...
> > 
> > More seriously, the recent fix for a deadlock condition that was present
> > for years proves that nobody seems to be using silly clients with QEMU.
> > So I think we should just dump the lock and add a one-time warning in
> > the top level handlers when we detect a duplicate readdir request on
> > the same fid. This should be a very simple patch (I can take care of
> > it right away).
> 
> Looks like we have a consensus! Then I wait for your patch removing the
> lock, and for your feedback whether or not you see anything else in this
> patch set?

Please wait before you work on this patch. I really do think your patch should
be based on / applied after this optimization patch, for one reason: if (and
even though it's a big if) somebody comes along with a silly client, as you
named it, then your patch can simply be reverted, which would not be possible
the other way around.

So I would really suggest that I change this patch here to go the ugly way
with 2 mutex types for readdir (9p2000.u vs. 9p2000.L), and your patch would
then get rid of that mess by removing the lock entirely, okay?

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-02 17:23                       ` Christian Schoenebeck
  2020-07-03  8:08                         ` Christian Schoenebeck
@ 2020-07-03 15:53                         ` Greg Kurz
  2020-07-03 18:12                           ` Christian Schoenebeck
  1 sibling, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-07-03 15:53 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Thu, 02 Jul 2020 19:23:35 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 2. Juli 2020 17:35:00 CEST Greg Kurz wrote:
> > > > It isn't readdir only, most requests span over multiple v9fs_co_*()
> > > > calls...> 
> > > Right, I know! And that's actually my root motivation to finally bring
> > > this
> > > patch set forward, since I am very aware that this patch set is just a
> > > small brick in the overall procecss of fixing similarly affected code
> > > portions. So yes, that's my plan to fix them with upcoming patch sets,
> > > too.
> > > 
> > > Having said that, could we probably try to make future reviews a bit more
> > > efficient at certain aspects? For instance it would help a lot if the
> > > patch
> > > set was reviewed entirely, and not stopped at the very first issue spotted
> > > and then suspended to ++version, as this creates large latencies in the
> > > overall review process?
> > 
> > Review of 9pfs is a best effort thing... I usually stop reviewing when I'm
> > fed up or when all the time I could reasonably invest is elapsed. Breaking
> > down the series in smaller patches is the usual improvement for that.
> 
> No need to defend, you do these things voluntarily. I am glad that you spend 
> time on this project at all. I would appreciate though if we could reduce the 
> overall time for a patch set a bit, and my suggestion trying to walk through 
> an entire set before re-posting a new version might indeed bring an 
> improvement without anybody having to invest more time, but rather less.
> 

I'll try to adjust but I think you should also try to split patches (like
you did eventually with the addition of patch 3).

> > > And likewise it would also help if review is prioritized on relevant
> > > behaviour aspects (concept, design) first before arguing about code style
> > > details like variable names or other details invisible to users.
> > 
> > I don't remember questioning the overall concept behind these changes
> > because it looks reasonable enough (even if I would appreciate to be
> > able to verify them with a working reproducer).
> 
> What exactly do you mean here with working reproducer?
> 

Some test program to be run in a real guest that gets improved performance
with this patch set, for example.

> > Even if it is invisible to the users, coding style or how a series is
> > broken down in patches is important for developers.
> 
> Dedication for detail beyond a certain fine graded level is luxury, and for 
> that reason it is designated for projects which clearly can afford it. On 
> (sub)projects with low activity and high latency (like this one) it easily 
> leads to frustration, which is obviously contra productive.
> 

The only way to avoid frustration when things don't go to your pace
is to take control, as you decided to do :)

> I don't say don't care about code style, details et al., but caring a bit less 
> on reviews would not hurt, so to say. A bit of (invisible) tolerance avoids 
> friction and stagnancy.
> 

No, this friction and stagnation is essentially the result of not sharing
the same agenda, and of different tastes also.

> > > And finally: compromises. As man power on 9p is very limited, it would
> > > make
> > > sense to limit patch sets at a certain extent and agree to postpone
> > > certain
> > > non-critical issues to subsequent patches (or sets of), otherwise a patch
> > > set will grow and grow and it will take ages to get forward.
> > 
> > FWIW this patchset was more than 10 patches and now it is only 5 :)
> 
> Hey, no misleading number crunching please. ;-) Three dropped patches were 
> never intended to be merged, they were pure benchmarks.
> 

Heh :)

> > The good news is that you'll soon be able to merge things by yourself.
> > I'll help you when I can but I won't be the gating factor anymore.
> > 
> > Hurrah !
> 
> Yep, I'll do my best to pursue your work, I appreciate what you did and that 
> you still volunteer to help!
> 

And if I'm too picky on reviews, feel free to ignore my remarks ;)

> > > > > > 2) basically means moving some logic from the frontend to the
> > > > > > backend, ie. called from v9fs_co_run_in_worker(). This implies
> > > > > > that a very long request (ie. one that would span on many calls
> > > > > > to readdir()) cannot be interrupted by the client anymore, as
> > > > > > opposed to what we have now BTW.
> > > > 
> > > > ... most likely to allow clients to interrupt a long request with a
> > > > Tflush. Have you considered this aspect in your changes ?
> > > 
> > > In this particular patch set, no, as for readdir this should not be an
> > > issue in practice for anybody. After this patch set is applied, even on
> > > huge directories, the fs driver's duration for readdir would barely be
> > > measurable. In fact the server's latency would always be much higher,
> > > yet.
> > 
> > Reproducer ? Numbers ? ;)
> 
> Well, you have already seen and run the benchmark in this series yourself. You 
> can easily hit several hundred ms server latency, which is very hard to top 
> with any fs driver reading a directory. Ok, maybe with a tape drive you could, 
> but that's pretty much it. :) With a 'normal' system the fs driver would 

Or some network file system on an extremely slow connection...

> rather block for something between <1ms .. 8ms, i.e. fs driver completes 
> before it would be able to actually see a flush request.
> 
> But to make it also clear: it would not bite with the new design either. If 
> someone really would see a necessity to abort readdir requests, that would be 
> possible without huge changes.
> 

Tflush requests are handled in the top-half exclusively by design, i.e.
they rely on the worker thread handling the targeted request to yield
control back to the main I/O thread. Since this doesn't happen anymore
with your patches, I'm not sure how you would be able to abort a
_many readdir_ request once it's been fired into a worker thread.

A possible solution could be to break down a Treaddir into multiple
_many readdirs_ jobs, and have a knob or some logic to control the
latency ratio.
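
Something like this maybe (very rough sketch; it assumes, just for
illustration, that v9fs_co_readdir_many() returns the byte size of the
entries it collected and 0 at end of directory, that repeated calls append
to *entries, and the chunk size is an arbitrary knob):

#define READDIR_CHUNK_SIZE (16 * 1024)   /* arbitrary knob */

static int coroutine_fn readdir_in_chunks(V9fsPDU *pdu, V9fsFidState *fidp,
                                          struct V9fsDirEnt **entries,
                                          int32_t max_count, bool dostat)
{
    int32_t done = 0;
    int ret = 0;

    while (done < max_count) {
        int32_t chunk = MIN(READDIR_CHUNK_SIZE, max_count - done);

        /* each chunk is a separate worker job, so control goes back to the
         * main I/O thread in between; since readdir_many checks
         * v9fs_request_cancelled() on entry, a pending Tflush would cancel
         * the request at the next chunk boundary */
        ret = v9fs_co_readdir_many(pdu, fidp, entries, chunk, dostat);
        if (ret <= 0) {
            break;
        }
        done += ret;
    }
    return ret < 0 ? ret : done;
}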

> > > > > > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > > > > > be called with a single v9fs_co_run_in_worker() invocation, in
> > > > > > which case the locking could just be something like:
> > > > > > 
> > > > > > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState
> > > > > > *fidp,
> > > > > > 
> > > > > >                                       struct V9fsDirEnt **entries,
> > > > > >                                       int32_t maxsize, bool dostat)
> > > > > > 
> > > > > > {
> > > > > > 
> > > > > >     int err = 0;
> > > > > >     
> > > > > >     if (v9fs_request_cancelled(pdu)) {
> > > > > >     
> > > > > >         return -EINTR;
> > > > > >     
> > > > > >     }
> > > > > >     
> > > > > >     v9fs_readdir_lock(&fidp->fs.dir);
> > > > > >     
> > > > > >     v9fs_co_run_in_worker({
> > > > > >     
> > > > > >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> > > > > >     
> > > > > >     });
> > > > > >     
> > > > > >     v9fs_readdir_unlock(&fidp->fs.dir);
> > > > > >     return err;
> > > > > > 
> > > > > > }
> > > > > 
> > > > > That's exactly what should be prevented. The lock must be on driver
> > > > > thread
> > > > > side, not on main thread side. The goal is to reduce the time slice of
> > > > > individual locks. If the lock is on main thread, the time slice of a
> > > > > lock
> > > > > (even on huge directories with thousands of entries) is multiple
> > > > > factors
> > > > > larger than if the lock is on driver thread side. So that's not just
> > > > > few
> > > > > percent or so, it is huge.
> > > > 
> > > > Sorry I don't get it...  In a contention-less situation, which is the
> > > > case we really care for, qemu_co_mutex_lock() has no overhead.
> > > 
> > > Yes, it only kicks in if there is concurrency.
> > 
> > And we don't care to preserve performance for silly clients, do we ?
> 
> We don't necessarily need to preserve performance for them, that's right. But 
> it is already handled appropriately for them, so destroying it (i.e. slowing 
> them down) only makes sense if there is a good reason for that.
> 

Ending up with a mix of two different kinds of mutexes for 9p2000.L and .u is
a good enough reason for me.

> > > > > > This would technically leave the locking in the main I/O thread,
> > > > > > but it isn't conceptually different from locking at the beginning
> > > > > > of do_readdir_lock() and unlocking before returning. My concern
> > > > > > is that I don't know how coroutine mutexes behave with multiple
> > > > > > worker threads...
> > > > > 
> > > > > Well, your Mutex -> CoMutex change was intended to fix a deadlock with
> > > > > the
> > > > > old readdir implementation. That deadlock could not happen with the
> > > > > new
> > > > > readdir implementation, which suggests that it probably makes sense to
> > > > > revert this change (i.e. CoMutex -> Mutex) for this new
> > > > > implementation.
> > > > 
> > > > No we can't because it is also used with 9p2000.u, that you said you
> > > > don't want to fix.
> > > 
> > > Yes and no, I did not say I don't want to fix readdir for 9p2000.u at all.
> > > What I said was I want to prioritize and concentrate on the most relevant
> > > issues first. 9p2000.L is the most commonly used protocol variant, so I
> > > would like to fix the most severe (e.g. performance) issues for 9p2000.L
> > > first before spending time on 9p2000.u which is AFAICS barely used in
> > > comparison, which I personally don't use at all, and which I am hence not
> > > testing in the same degree and cannot serve with the same quality as
> > > 9p2000.L right now.
> > > 
> > > Plus if there are really users caring for 9p2000.u, I can gladly assist
> > > them to address these issues for 9p2000.u as well, provided that they
> > > help at least with testing the required 9p2000.u changes.
> > 
> > I would personally do the opposite... again ;)
> > 
> > Like you say we essentially care for 9p2000.L and we only do limited
> > support for 9p2000.u. If we have a change that we really want for
> > 9p2000.L but it has an impact on 9p2000.u because of shared code,
> > it is fine to do the changes anyway, including changes to the 9p2000.u
> > specific code. Very basic testing of 9p2000.u (mount/ls or find/umount)
> > or maybe running pjd-fstest is enough IMHO. If we break someone's setup
> > and he complains, then we can ask him to test a fix before we merge it.
> 
> Well, so you want to handle the relevant 9p2000.u readdir optimizations by 
> yourself, and you would send it as follow-up patch set (after this set is 
> through), including test cases?
> 

Ah, that wasn't my point. I was just saying that any valuable change for
9p2000.L prevails, and if you have to change some common code (e.g.
locking) that could impact the 9p2000.u experience, you can do it
anyway, even if you only do smoke testing with 9p2000.u.

> > > Back to the actual topic: so what do we do about the mutex then? CoMutex
> > > for 9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it
> > > would just be a transitional measure.
> > 
> > That would ruin my day...
> > 
> > More seriously, the recent fix for a deadlock condition that was present
> > for years proves that nobody seems to be using silly clients with QEMU.
> > So I think we should just dump the lock and add a one-time warning in
> > the top level handlers when we detect a duplicate readdir request on
> > the same fid. This should be a very simple patch (I can take care of
> > it right away).
> 
> Looks like we have a consensus! Then I wait for your patch removing the lock, 
> and for your feedback whether or not you see anything else in this patch set?
> 

I was ready to do that but now I'm reading your other mail... I'll
continue the discussion there.

> > > Best regards,
> > > Christian Schoenebeck
> > 
> > Cheers,
> > 
> > --
> > Greg
> 
> Best regards,
> Christian Schoenebeck
> 
> 



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-03  8:08                         ` Christian Schoenebeck
@ 2020-07-03 16:08                           ` Greg Kurz
  2020-07-03 18:27                             ` Christian Schoenebeck
  0 siblings, 1 reply; 30+ messages in thread
From: Greg Kurz @ 2020-07-03 16:08 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel

On Fri, 03 Jul 2020 10:08:09 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> On Donnerstag, 2. Juli 2020 19:23:35 CEST Christian Schoenebeck wrote:
> > > > Back to the actual topic: so what do we do about the mutex then? CoMutex
> > > > for 9p2000.u and Mutex for 9p2000.L? I know you find that ugly, but it
> > > > would just be a transitional measure.
> > > 
> > > That would ruin my day...
> > > 
> > > More seriously, the recent fix for a deadlock condition that was present
> > > for years proves that nobody seems to be using silly clients with QEMU.
> > > So I think we should just dump the lock and add a one-time warning in
> > > the top level handlers when we detect a duplicate readdir request on
> > > the same fid. This should be a very simple patch (I can take care of
> > > it right away).
> > 
> > Looks like we have a consensus! Then I'll wait for your patch removing the
> > lock, and for your feedback on whether you see anything else in this
> > patch set?
> 
> Please wait before you work on this patch. I really do think your patch should 
> be based on/after this optimization patch for one reason: if (and even though 
> it's a big if) somebody comes along with a silly client as you named it, then 
> your patch can simply be reverted, which would not be possible the other
> way around.
> 
> So I would really suggest I change this patch here to go the ugly way with 2 
> mutex types for readdir 9p2000.L vs 9p2000.u, and your patch would get rid of 
> that mess by removing the lock entirely, okay?
> 

If someone ever comes along with a silly client, she will get warnings
explaining that she might get silly results. If it turns out that we
really need to support that for valid reasons, it will be okay to focus
on the appropriate fix when the time comes, not now. So I don't see any
real benefit to postponing the removal of the lock until after your series,
but I do at least see one benefit: it will reduce the level of noise.
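
Just to illustrate what I have in mind -- a minimal sketch only, not the
actual patch; the per-fid 'readdir_in_flight' counter is an assumed new
field and the message wording is arbitrary:

    /*
     * Sketch: called from the top level Treaddir handler before the request
     * is dispatched.  Requests for a fid are dispatched from the main I/O
     * thread, so a plain counter is enough here (it gets decremented again
     * once the reply has been sent).
     */
    static void v9fs_warn_concurrent_readdir(V9fsFidState *fidp)
    {
        if (fidp->readdir_in_flight++) {
            warn_report_once("9pfs: concurrent readdir requests on the same "
                             "fid, the client may get inconsistent results");
        }
    }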

> Best regards,
> Christian Schoenebeck
> 
> 



^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-03 15:53                         ` Greg Kurz
@ 2020-07-03 18:12                           ` Christian Schoenebeck
  0 siblings, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-03 18:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Friday, 3 July 2020 17:53:15 CEST Greg Kurz wrote:
> > > I don't remember questioning the overall concept behind these changes
> > > because it looks reasonable enough (even if I would appreciate being
> > > able to verify them with a working reproducer).
> > 
> > What exactly do you mean here by a working reproducer?
> 
> Some test program to be run in a real guest that gets improved performance
> with this patch set for example.

Mhm, for now the benchmark patches already provided should be proof enough 
that this optimization will bring significant improvements in the end.

As mentioned as a side note: to actually feel the difference with a real
Linux client, a kernel patch is also required for the guest. But I definitely
won't start dealing with LKML before the ground is laid on the QEMU side
(i.e. this patch set is merged on master).

> > I'm not saying don't care about code style, details, et al., but caring a
> > bit less in reviews would not hurt, so to speak. A bit of (invisible)
> > tolerance avoids friction and stagnancy.
> 
> No, this friction and stagnancy is essentially the result of not sharing
> the same agenda, and of different tastes too.

That summary is a bit too simple, and you know that. But okay, let's leave it
at that.

> > rather block for something between <1ms .. 8ms, i.e. fs driver completes
> > before it would be able to actually see a flush request.
> > 
> > But to make it also clear: it would not bite with the new design either.
> > If
> > someone really would see a necessity to abort readdir requests, that would
> > be possible without huge changes.
> 
> Tflush requests are handled in the top-half exclusively by design, i.e.
> they rely on the worker thread handling the targeted request to yield
> control back to the main I/O thread. Since this doesn't happen anymore
> with your patches, I'm not sure how you would be able to abort a
> _many readdir_ request once it's been fired into a worker thread.
> 
> A possible solution could be to break down a Treaddir into multiple
> _many readdirs_ jobs, and have a knob or some logic to control the
> latency ratio.

Too complicated; there is no need to break it down into subtasks or anything
like that. It can be handled with a simple abort condition in readdir_many()'s
loop. But that's not an issue for now, so we can discuss the details later.
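
Just to illustrate the idea -- a simplified sketch, not the actual
implementation; everything around the cancellation check is placeholder code:

    /* somewhere inside readdir_many()'s central loop */
    while (size < maxsize) {
        if (v9fs_request_cancelled(pdu)) {
            /* a Tflush arrived for this request -> stop collecting entries */
            err = -EINTR;
            break;
        }
        /* ... read the next dirent, optionally stat it, append it ... */
    }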

> > > > > > > Then the whole seekdir/rewinddir/telldir/readdir sequence should
> > > > > > > be called with a single v9fs_co_run_in_worker() invocation, in
> > > > > > > which case the locking could just be something like:
> > > > > > > 
> > > > > > > int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu,
> > > > > > >                                       V9fsFidState *fidp,
> > > > > > >                                       struct V9fsDirEnt **entries,
> > > > > > >                                       int32_t maxsize, bool dostat)
> > > > > > > {
> > > > > > >     int err = 0;
> > > > > > > 
> > > > > > >     if (v9fs_request_cancelled(pdu)) {
> > > > > > >         return -EINTR;
> > > > > > >     }
> > > > > > > 
> > > > > > >     v9fs_readdir_lock(&fidp->fs.dir);
> > > > > > > 
> > > > > > >     v9fs_co_run_in_worker({
> > > > > > >         err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
> > > > > > >     });
> > > > > > > 
> > > > > > >     v9fs_readdir_unlock(&fidp->fs.dir);
> > > > > > >     return err;
> > > > > > > }
> > > > > > 
> > > > > > That's exactly what should be prevented. The lock must be on driver
> > > > > > thread side, not on main thread side. The goal is to reduce the
> > > > > > time slice of individual locks. If the lock is on the main thread,
> > > > > > the time slice of a lock (even on huge directories with thousands
> > > > > > of entries) is multiple factors larger than if the lock is on
> > > > > > driver thread side. So that's not just a few percent or so, it is
> > > > > > huge.
> > > > > 
> > > > > Sorry I don't get it...  In a contention-less situation, which is
> > > > > the case we really care for, qemu_co_mutex_lock() has no overhead.
> > > > 
> > > > Yes, it only kicks in if there is concurrency.
> > > 
> > > And we don't care to preserve performance for silly clients, do we ?
> > 
> > We don't necessarily need to preserve performance for them, that's right.
> > But it is already handled appropriately for them, so destroying it (i.e.
> > slowing them down) only makes sense if there is a good reason for that.
> 
> Ending up with a mix of two different kinds of mutexes for 9p2000.L and .u
> is a good enough reason for me.

Correct behaviour always takes precedence over code aesthetics, but we already
agreed to remove the lock entirely anyway, so let's move on.
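
Just for the record, the driver-thread-side placement I was arguing for
would have looked roughly like the sketch below. It is illustrative only --
it assumes a plain QemuMutex instead of the CoMutex (a CoMutex is meant to
be taken from coroutine context, not from inside a worker thread), the field
name is made up -- and it is moot now that the lock is going away entirely:

    v9fs_co_run_in_worker({
        /* critical section spans only the actual readdir work */
        qemu_mutex_lock(&fidp->fs.dir.readdir_mutex_plain);
        err = do_readdir_many(pdu, fidp, entries, maxsize, dostat);
        qemu_mutex_unlock(&fidp->fs.dir.readdir_mutex_plain);
    });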

> > > > > > > This would technically leave the locking in the main I/O thread,
> > > > > > > but it isn't conceptually different from locking at the beginning
> > > > > > > of do_readdir_lock() and unlocking before returning. My concern
> > > > > > > is that I don't know how coroutine mutexes behave with multiple
> > > > > > > worker threads...
> > > > > > 
> > > > > > Well, your Mutex -> CoMutex change was intended to fix a deadlock
> > > > > > with the old readdir implementation. That deadlock could not happen
> > > > > > with the new readdir implementation, which suggests that it
> > > > > > probably makes sense to revert this change (i.e. CoMutex -> Mutex)
> > > > > > for this new implementation.
> > > > > 
> > > > > No, we can't, because it is also used with 9p2000.u, which you said
> > > > > you don't want to fix.
> > > > 
> > > > Yes and no, I did not say I don't want to fix readdir for 9p2000.u at
> > > > all. What I said was I want to prioritize and concentrate on the most
> > > > relevant issues first. 9p2000.L is the most commonly used protocol
> > > > variant, so I would like to fix the most severe (e.g. performance)
> > > > issues for 9p2000.L first before spending time on 9p2000.u which is
> > > > AFAICS barely used in comparison, which I personally don't use at all,
> > > > and which I am hence not testing in the same degree and cannot serve
> > > > with the same quality as 9p2000.L right now.
> > > > 
> > > > Plus if there are really users caring for 9p2000.u, I can gladly assist
> > > > them to address these issues for 9p2000.u as well, provided that they
> > > > help at least with testing the required 9p2000.u changes.
> > > 
> > > I would personally do the opposite... again ;)
> > > 
> > > Like you say we essentially care for 9p2000.L and we only do limited
> > > support for 9p2000.u. If we have a change that we really want for
> > > 9p2000.L but it has an impact on 9p2000.u because of shared code,
> > > it is fine to do the changes anyway, including changes to the 9p2000.u
> > > specific code. Very basic testing of 9p2000.u (mount/ls or find/umount)
> > > or maybe running pjd-fstest is enough IMHO. If we break someone's setup
> > > and he complains, then we can ask him to test a fix before we merge it.
> > 
> > Well, so you want to handle the relevant 9p2000.u readdir optimizations by
> > yourself, and you would send it as follow-up patch set (after this set is
> > through), including test cases?
> 
> Ah it wasn't my point. I was just saying that any valuable change for
> 9p2000.L prevails and if you have to change some common code (eg.
> locking) that could impact the 9p2000.u experience, you can do it
> anyway, even if you only do smoke testing with 9p2000.u.

Ah I see, so you would. But you don't. ;-)

And yes, I could. But I won't either, for the following reasons:

1. I would have to write readdir test cases for 9p2000.u as well, because the 
   9p2000.L tests are incompatible. -> My time

2. I have to change the 9p2000.u server code -> My time

3. This patch set grows substantially, in both lines and number of patches
   -> My time and probably +10 more versions of this patch series.

4. I would have to do at least some basic testing of 9p2000.u behaviour
   -> My time

All these things would put me +x months farther away from reaching my actual
goal: making 9p usable by Linux clients. So no, I won't waste time on
9p2000.u optimizations before I see somebody actually caring about 9p2000.u
efficiency.

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v6 4/5] 9pfs: T_readdir latency optimization
  2020-07-03 16:08                           ` Greg Kurz
@ 2020-07-03 18:27                             ` Christian Schoenebeck
  0 siblings, 0 replies; 30+ messages in thread
From: Christian Schoenebeck @ 2020-07-03 18:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: Greg Kurz

On Friday, 3 July 2020 18:08:21 CEST Greg Kurz wrote:
> On Fri, 03 Jul 2020 10:08:09 +0200
> 
> Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:
> On Thursday, 2 July 2020 19:23:35 CEST Christian Schoenebeck wrote:
> > > > > Back to the actual topic: so what do we do about the mutex then?
> > > > > CoMutex for 9p2000.u and Mutex for 9p2000.L? I know you find that
> > > > > ugly, but it would just be a transitional measure.
> > > > 
> > > > That would ruin my day...
> > > > 
> > > > More seriously, the recent fix for a deadlock condition that was
> > > > present for years proves that nobody seems to be using silly clients
> > > > with QEMU.
> > > > So I think we should just dump the lock and add a one-time warning in
> > > > the top level handlers when we detect a duplicate readdir request on
> > > > the same fid. This should be a very simple patch (I can take care of
> > > > it right away).
> > > 
> > > Looks like we have a consensus! Then I'll wait for your patch removing
> > > the lock, and for your feedback on whether you see anything else in this
> > > patch set?
> > 
> > Please wait before you work on this patch. I really do think your patch
> > should be based on/after this optimization patch for one reason: if (and
> > even though it's a big if) somebody comes along with a silly client as
> > you named it, then your patch can simply be reverted, which would not be
> > possible the other way around.
> > 
> > So I would really suggest I change this patch here to go the ugly way with
> > 2 mutex types for readdir 9p2000.L vs 9p2000.u, and your patch would get
> > rid of that mess by removing the lock entirely, okay?
> 
> If someone ever comes along with a silly client, she will get warnings
> explaining that she might get silly results. If it turns out that we
> really need to support that for valid reasons, it will be okay to focus
> on the appropriate fix when the time comes, not now. So I don't see any
> real benefit to postponing the removal of the lock until after your series,
> but I do at least see one benefit: it will reduce the level of noise.

Reasons:

1. Less work for me, as I would have more conflicts to resolve manually if
   your patch came first.

2. The dual mutex change is just about 3 more lines of code -> trivial (and my
   problem)

3. If somebody complains, I just have to revert your patch.

4. For you, it means neither more time nor more effort, as you haven't
   started to write the patch yet.

5. The actual outcome from your side is the same: you get what you want; the
   lock will be gone entirely after all anyway.

and most notably:

6. I don't see any advantage in your patch coming first.

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2020-07-03 18:28 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-19 15:10 [PATCH v6 0/5] 9pfs: readdir optimization Christian Schoenebeck
2020-04-19 15:00 ` [PATCH v6 1/5] tests/virtio-9p: added split readdir tests Christian Schoenebeck
2020-04-19 15:00 ` [PATCH v6 2/5] 9pfs: make v9fs_readdir_response_size() public Christian Schoenebeck
2020-04-19 15:02 ` [PATCH v6 3/5] 9pfs: add new function v9fs_co_readdir_many() Christian Schoenebeck
2020-04-30 11:42   ` Greg Kurz
2020-04-30 12:50     ` Christian Schoenebeck
2020-04-30 13:30       ` Greg Kurz
2020-05-01 14:04         ` Christian Schoenebeck
2020-05-04  9:18           ` Greg Kurz
2020-05-04 10:08             ` Christian Schoenebeck
2020-05-07 12:16             ` Christian Schoenebeck
2020-05-07 15:59               ` Greg Kurz
2020-04-19 15:06 ` [PATCH v6 4/5] 9pfs: T_readdir latency optimization Christian Schoenebeck
2020-06-03 17:16   ` Christian Schoenebeck
2020-06-29 16:39     ` Greg Kurz
2020-06-30 15:16       ` Christian Schoenebeck
2020-06-30 16:39         ` Greg Kurz
2020-06-30 18:00           ` Christian Schoenebeck
2020-07-01 10:09             ` Greg Kurz
2020-07-01 11:47               ` Christian Schoenebeck
2020-07-01 15:12                 ` Greg Kurz
2020-07-02 11:43                   ` Christian Schoenebeck
2020-07-02 15:35                     ` Greg Kurz
2020-07-02 17:23                       ` Christian Schoenebeck
2020-07-03  8:08                         ` Christian Schoenebeck
2020-07-03 16:08                           ` Greg Kurz
2020-07-03 18:27                             ` Christian Schoenebeck
2020-07-03 15:53                         ` Greg Kurz
2020-07-03 18:12                           ` Christian Schoenebeck
2020-04-19 15:07 ` [PATCH v6 5/5] 9pfs: clarify latency of v9fs_co_run_in_worker() Christian Schoenebeck

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.