All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/2] vhost-user-blk: fix bug on device disconnection during initialization
@ 2021-03-24  9:38 Denis Plotnikov
  2021-03-24  9:38 ` [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization Denis Plotnikov
  2021-03-24  9:38 ` [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect " Denis Plotnikov
  0 siblings, 2 replies; 5+ messages in thread
From: Denis Plotnikov @ 2021-03-24  9:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, qemu-block, mst, raphael.norwitz, yc-core, mreitz

v2:
  * split the initial patch into two (Raphael)
  * rename init to realized (Raphael)
  * remove unrelated comment (Raphael)

When the vhost-user-blk device lose the connection to the daemon during
the initialization phase it kills qemu because of the assert in the code.
The series fixes the bug.

0001 is preparation for the fix
0002 fixes the bug, patch description has the full motivation for the series

Denis Plotnikov (2):
  vhost-user-blk: use different event handlers on initialization
  vhost-user-blk: perform immediate cleanup if disconnect on
    initialization

 hw/block/vhost-user-blk.c | 79 ++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 31 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization
  2021-03-24  9:38 [PATCH v2 0/2] vhost-user-blk: fix bug on device disconnection during initialization Denis Plotnikov
@ 2021-03-24  9:38 ` Denis Plotnikov
  2021-03-24 18:25   ` Raphael Norwitz
  2021-03-24  9:38 ` [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect " Denis Plotnikov
  1 sibling, 1 reply; 5+ messages in thread
From: Denis Plotnikov @ 2021-03-24  9:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, qemu-block, mst, raphael.norwitz, yc-core, mreitz

It is useful to use different connect/disconnect event handlers
on device initialization and operation as seen from the further
commit fixing a bug on device initialization.

The patch refactor the code to make use of them: we don't rely any
more on the VM state for choosing how to cleanup the device, instead
we explicitly use the proper event handler dependping on whether
the device has been initialized.

Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru>
---
 hw/block/vhost-user-blk.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index b870a50e6b20..1af95ec6aae7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -362,7 +362,18 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
     vhost_dev_cleanup(&s->dev);
 }
 
-static void vhost_user_blk_event(void *opaque, QEMUChrEvent event);
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
+                                 bool realized);
+
+static void vhost_user_blk_event_realize(void *opaque, QEMUChrEvent event)
+{
+    vhost_user_blk_event(opaque, event, false);
+}
+
+static void vhost_user_blk_event_oper(void *opaque, QEMUChrEvent event)
+{
+    vhost_user_blk_event(opaque, event, true);
+}
 
 static void vhost_user_blk_chr_closed_bh(void *opaque)
 {
@@ -371,11 +382,12 @@ static void vhost_user_blk_chr_closed_bh(void *opaque)
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
 
     vhost_user_blk_disconnect(dev);
-    qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
-            NULL, opaque, NULL, true);
+    qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL,
+            vhost_user_blk_event_oper, NULL, opaque, NULL, true);
 }
 
-static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
+                                 bool realized)
 {
     DeviceState *dev = opaque;
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -406,7 +418,7 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
          * TODO: maybe it is a good idea to make the same fix
          * for other vhost-user devices.
          */
-        if (runstate_is_running()) {
+        if (realized) {
             AioContext *ctx = qemu_get_current_aio_context();
 
             qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL,
@@ -473,8 +485,9 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
     s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
     s->connected = false;
 
-    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, vhost_user_blk_event,
-                             NULL, (void *)dev, NULL, true);
+    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL,
+                             vhost_user_blk_event_realize, NULL, (void *)dev,
+                             NULL, true);
 
 reconnect:
     if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) {
@@ -494,6 +507,10 @@ reconnect:
         goto reconnect;
     }
 
+    /* we're fully initialized, now we can operate, so change the handler */
+    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL,
+                             vhost_user_blk_event_oper, NULL, (void *)dev,
+                             NULL, true);
     return;
 
 virtio_err:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect on initialization
  2021-03-24  9:38 [PATCH v2 0/2] vhost-user-blk: fix bug on device disconnection during initialization Denis Plotnikov
  2021-03-24  9:38 ` [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization Denis Plotnikov
@ 2021-03-24  9:38 ` Denis Plotnikov
  2021-03-24 18:44   ` Raphael Norwitz
  1 sibling, 1 reply; 5+ messages in thread
From: Denis Plotnikov @ 2021-03-24  9:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, qemu-block, mst, raphael.norwitz, yc-core, mreitz

Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect")
introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts
because of connection problems with vhost-blk daemon.

However, it introdues a new problem. Now, any communication errors
during execution of vhost_dev_init() called by vhost_user_blk_device_realize()
lead to qemu abort on assert in vhost_dev_get_config().

This happens because vhost_user_blk_disconnect() is postponed but
it should have dropped s->connected flag by the time
vhost_user_blk_device_realize() performs a new connection opening.
On the connection opening, vhost_dev initialization in
vhost_user_blk_connect() relies on s->connection flag and
if it's not dropped, it skips vhost_dev initialization and returns
with success. Then, vhost_user_blk_device_realize()'s execution flow
goes to vhost_dev_get_config() where it's aborted on the assert.

The connection/disconnection processing should happen
differently on initialization and operation of vhost-user-blk.
On initialization (in vhost_user_blk_device_realize()) we fully
control the initialization process. At that point, nobody can use the
device since it isn't initialized and we don't need to postpone any
cleanups, so we can do cleanup right away when there is communication
problems with the vhost-blk daemon.
On operation the disconnect may happen when the device is in use, so
the device users may want to use vhost_dev's data to do rollback before
vhost_dev is re-initialized (e.g. in vhost_dev_set_log()), so we
postpone the cleanup.

The patch splits those two cases, and performs the cleanup immediately on
initialization, and postpones cleanup when the device is initialized and
in use.

Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru>
---
 hw/block/vhost-user-blk.c | 48 +++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 1af95ec6aae7..4e215f71f152 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -402,38 +402,38 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
         break;
     case CHR_EVENT_CLOSED:
         /*
-         * A close event may happen during a read/write, but vhost
-         * code assumes the vhost_dev remains setup, so delay the
-         * stop & clear. There are two possible paths to hit this
-         * disconnect event:
-         * 1. When VM is in the RUN_STATE_PRELAUNCH state. The
-         * vhost_user_blk_device_realize() is a caller.
-         * 2. In tha main loop phase after VM start.
-         *
-         * For p2 the disconnect event will be delayed. We can't
-         * do the same for p1, because we are not running the loop
-         * at this moment. So just skip this step and perform
-         * disconnect in the caller function.
-         *
-         * TODO: maybe it is a good idea to make the same fix
-         * for other vhost-user devices.
+         * Closing the connection should happen differently on device
+         * initialization and operation stages.
+         * On initalization, we want to re-start vhost_dev initialization
+         * from the very beginning right away when the connection is closed,
+         * so we clean up vhost_dev on each connection closing.
+         * On operation, we want to postpone vhost_dev cleanup to let the
+         * other code perform its own cleanup sequence using vhost_dev data
+         * (e.g. vhost_dev_set_log).
          */
         if (realized) {
+            /*
+             * A close event may happen during a read/write, but vhost
+             * code assumes the vhost_dev remains setup, so delay the
+             * stop & clear.
+             */
             AioContext *ctx = qemu_get_current_aio_context();
 
             qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL,
                     NULL, NULL, false);
             aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
-        }
 
-        /*
-         * Move vhost device to the stopped state. The vhost-user device
-         * will be clean up and disconnected in BH. This can be useful in
-         * the vhost migration code. If disconnect was caught there is an
-         * option for the general vhost code to get the dev state without
-         * knowing its type (in this case vhost-user).
-         */
-        s->dev.started = false;
+            /*
+             * Move vhost device to the stopped state. The vhost-user device
+             * will be clean up and disconnected in BH. This can be useful in
+             * the vhost migration code. If disconnect was caught there is an
+             * option for the general vhost code to get the dev state without
+             * knowing its type (in this case vhost-user).
+             */
+            s->dev.started = false;
+        } else {
+            vhost_user_blk_disconnect(dev);
+        }
         break;
     case CHR_EVENT_BREAK:
     case CHR_EVENT_MUX_IN:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization
  2021-03-24  9:38 ` [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization Denis Plotnikov
@ 2021-03-24 18:25   ` Raphael Norwitz
  0 siblings, 0 replies; 5+ messages in thread
From: Raphael Norwitz @ 2021-03-24 18:25 UTC (permalink / raw)
  To: Denis Plotnikov
  Cc: kwolf, qemu-block, mst, qemu-devel, Raphael Norwitz, yc-core, mreitz

Couple commit message NITs but otherwise I'm happy with this.

Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>

On Wed, Mar 24, 2021 at 12:38:28PM +0300, Denis Plotnikov wrote:
> It is useful to use different connect/disconnect event handlers
> on device initialization and operation as seen from the further
> commit fixing a bug on device initialization.
> 
> The patch refactor the code to make use of them: we don't rely any

s/The/This and s/refactor/refactors

> more on the VM state for choosing how to cleanup the device, instead
> we explicitly use the proper event handler dependping on whether

s/dependping/depending

> the device has been initialized.
> 
> Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru>
> ---
>  hw/block/vhost-user-blk.c | 31 ++++++++++++++++++++++++-------
>  1 file changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index b870a50e6b20..1af95ec6aae7 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -362,7 +362,18 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
>      vhost_dev_cleanup(&s->dev);
>  }
>  
> -static void vhost_user_blk_event(void *opaque, QEMUChrEvent event);
> +static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
> +                                 bool realized);
> +
> +static void vhost_user_blk_event_realize(void *opaque, QEMUChrEvent event)
> +{
> +    vhost_user_blk_event(opaque, event, false);
> +}
> +
> +static void vhost_user_blk_event_oper(void *opaque, QEMUChrEvent event)
> +{
> +    vhost_user_blk_event(opaque, event, true);
> +}
>  
>  static void vhost_user_blk_chr_closed_bh(void *opaque)
>  {
> @@ -371,11 +382,12 @@ static void vhost_user_blk_chr_closed_bh(void *opaque)
>      VHostUserBlk *s = VHOST_USER_BLK(vdev);
>  
>      vhost_user_blk_disconnect(dev);
> -    qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event,
> -            NULL, opaque, NULL, true);
> +    qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL,
> +            vhost_user_blk_event_oper, NULL, opaque, NULL, true);
>  }
>  
> -static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
> +static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
> +                                 bool realized)
>  {
>      DeviceState *dev = opaque;
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -406,7 +418,7 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
>           * TODO: maybe it is a good idea to make the same fix
>           * for other vhost-user devices.
>           */
> -        if (runstate_is_running()) {
> +        if (realized) {
>              AioContext *ctx = qemu_get_current_aio_context();
>  
>              qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL,
> @@ -473,8 +485,9 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>      s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
>      s->connected = false;
>  
> -    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, vhost_user_blk_event,
> -                             NULL, (void *)dev, NULL, true);
> +    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL,
> +                             vhost_user_blk_event_realize, NULL, (void *)dev,
> +                             NULL, true);
>  
>  reconnect:
>      if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) {
> @@ -494,6 +507,10 @@ reconnect:
>          goto reconnect;
>      }
>  
> +    /* we're fully initialized, now we can operate, so change the handler */
> +    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL,
> +                             vhost_user_blk_event_oper, NULL, (void *)dev,
> +                             NULL, true);
>      return;
>  
>  virtio_err:
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect on initialization
  2021-03-24  9:38 ` [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect " Denis Plotnikov
@ 2021-03-24 18:44   ` Raphael Norwitz
  0 siblings, 0 replies; 5+ messages in thread
From: Raphael Norwitz @ 2021-03-24 18:44 UTC (permalink / raw)
  To: Denis Plotnikov
  Cc: kwolf, qemu-block, mst, qemu-devel, Raphael Norwitz, yc-core, mreitz

Looks good, just clean up the commit message to reflect the way you've
now split the patches.

Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>

On Wed, Mar 24, 2021 at 12:38:29PM +0300, Denis Plotnikov wrote:
> Commit 4bcad76f4c39 ("vhost-user-blk: delay vhost_user_blk_disconnect")
> introduced postponing vhost_dev cleanup aiming to eliminate qemu aborts
> because of connection problems with vhost-blk daemon.
> 
> However, it introdues a new problem. Now, any communication errors
> during execution of vhost_dev_init() called by vhost_user_blk_device_realize()
> lead to qemu abort on assert in vhost_dev_get_config().
> 
> This happens because vhost_user_blk_disconnect() is postponed but
> it should have dropped s->connected flag by the time
> vhost_user_blk_device_realize() performs a new connection opening.
> On the connection opening, vhost_dev initialization in
> vhost_user_blk_connect() relies on s->connection flag and
> if it's not dropped, it skips vhost_dev initialization and returns
> with success. Then, vhost_user_blk_device_realize()'s execution flow
> goes to vhost_dev_get_config() where it's aborted on the assert.
> 

The next sentence is a little misleading as splitting initialization and
operational logic has already been done in a prior patch. Please reword
to clarify that we've already made the split.

> The connection/disconnection processing should happen
> differently on initialization and operation of vhost-user-blk.
> On initialization (in vhost_user_blk_device_realize()) we fully
> control the initialization process. At that point, nobody can use the
> device since it isn't initialized and we don't need to postpone any
> cleanups, so we can do cleanup right away when there is communication
> problems with the vhost-blk daemon.
> On operation the disconnect may happen when the device is in use, so
> the device users may want to use vhost_dev's data to do rollback before
> vhost_dev is re-initialized (e.g. in vhost_dev_set_log()), so we
> postpone the cleanup.
>

ditto - this patch no longer splits the two cases.

> The patch splits those two cases, and performs the cleanup immediately on
> initialization, and postpones cleanup when the device is initialized and
> in use.
> 
> Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru>
> ---
>  hw/block/vhost-user-blk.c | 48 +++++++++++++++++++--------------------
>  1 file changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 1af95ec6aae7..4e215f71f152 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -402,38 +402,38 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event,
>          break;
>      case CHR_EVENT_CLOSED:
>          /*
> -         * A close event may happen during a read/write, but vhost
> -         * code assumes the vhost_dev remains setup, so delay the
> -         * stop & clear. There are two possible paths to hit this
> -         * disconnect event:
> -         * 1. When VM is in the RUN_STATE_PRELAUNCH state. The
> -         * vhost_user_blk_device_realize() is a caller.
> -         * 2. In tha main loop phase after VM start.
> -         *
> -         * For p2 the disconnect event will be delayed. We can't
> -         * do the same for p1, because we are not running the loop
> -         * at this moment. So just skip this step and perform
> -         * disconnect in the caller function.
> -         *
> -         * TODO: maybe it is a good idea to make the same fix
> -         * for other vhost-user devices.
> +         * Closing the connection should happen differently on device
> +         * initialization and operation stages.
> +         * On initalization, we want to re-start vhost_dev initialization
> +         * from the very beginning right away when the connection is closed,
> +         * so we clean up vhost_dev on each connection closing.
> +         * On operation, we want to postpone vhost_dev cleanup to let the
> +         * other code perform its own cleanup sequence using vhost_dev data
> +         * (e.g. vhost_dev_set_log).
>           */
>          if (realized) {
> +            /*
> +             * A close event may happen during a read/write, but vhost
> +             * code assumes the vhost_dev remains setup, so delay the
> +             * stop & clear.
> +             */
>              AioContext *ctx = qemu_get_current_aio_context();
>  
>              qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, NULL,
>                      NULL, NULL, false);
>              aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
> -        }
>  
> -        /*
> -         * Move vhost device to the stopped state. The vhost-user device
> -         * will be clean up and disconnected in BH. This can be useful in
> -         * the vhost migration code. If disconnect was caught there is an
> -         * option for the general vhost code to get the dev state without
> -         * knowing its type (in this case vhost-user).
> -         */
> -        s->dev.started = false;
> +            /*
> +             * Move vhost device to the stopped state. The vhost-user device
> +             * will be clean up and disconnected in BH. This can be useful in
> +             * the vhost migration code. If disconnect was caught there is an
> +             * option for the general vhost code to get the dev state without
> +             * knowing its type (in this case vhost-user).
> +             */
> +            s->dev.started = false;
> +        } else {
> +            vhost_user_blk_disconnect(dev);
> +        }
>          break;
>      case CHR_EVENT_BREAK:
>      case CHR_EVENT_MUX_IN:
> -- 
> 2.25.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-03-24 19:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-24  9:38 [PATCH v2 0/2] vhost-user-blk: fix bug on device disconnection during initialization Denis Plotnikov
2021-03-24  9:38 ` [PATCH v2 1/2] vhost-user-blk: use different event handlers on initialization Denis Plotnikov
2021-03-24 18:25   ` Raphael Norwitz
2021-03-24  9:38 ` [PATCH v2 2/2] vhost-user-blk: perform immediate cleanup if disconnect " Denis Plotnikov
2021-03-24 18:44   ` Raphael Norwitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.