[Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort

All of lore.kernel.org
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
@ 2012-08-18 21:49 Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 1/3] iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb Paolo Bonzini
                   ` (3 more replies)
  0 siblings, 4 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-18 21:49 UTC (permalink / raw)
  To: qemu-devel, s.priebe; +Cc: ronniesahlberg

Hi Stefan,

this is my version of your patch.  I think the flow of the code is a
bit simpler (or at least matches other implementations of cancellation).
Can you test it on your test case?

Thanks!

Paolo

Paolo Bonzini (3):
  iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb
  iscsi: simplify iscsi_schedule_bh
  iscsi: fix races between task completion and abort

 block/iscsi.c | 114 +++++++++++++++++++++++++++-------------------------------
 1 file modificato, 52 inserzioni(+), 62 rimozioni(-)

-- 
1.7.11.2

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 1/3] iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb
  2012-08-18 21:49 [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort Paolo Bonzini
@ 2012-08-18 21:49 ` Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 2/3] iscsi: simplify iscsi_schedule_bh Paolo Bonzini
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-18 21:49 UTC (permalink / raw)
  To: qemu-devel, s.priebe; +Cc: ronniesahlberg

Put these functions at the beginning, to avoid forward references
in the next patches.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/iscsi.c | 56 ++++++++++++++++++++++++++++----------------------------
 1 file modificato, 28 inserzioni(+), 28 rimozioni(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 993a86d..7cfd752 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -73,6 +73,34 @@ struct IscsiTask {
 };
 
 static void
+iscsi_readv_writev_bh_cb(void *p)
+{
+    IscsiAIOCB *acb = p;
+
+    qemu_bh_delete(acb->bh);
+
+    if (acb->canceled == 0) {
+        acb->common.cb(acb->common.opaque, acb->status);
+    }
+
+    qemu_aio_release(acb);
+}
+
+static int
+iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
+{
+    acb->bh = qemu_bh_new(cb, acb);
+    if (!acb->bh) {
+        error_report("oom: could not create iscsi bh");
+        return -EIO;
+    }
+
+    qemu_bh_schedule(acb->bh);
+    return 0;
+}
+
+
+static void
 iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
                     void *private_data)
 {
@@ -159,34 +187,6 @@ iscsi_process_write(void *arg)
 }
 
 
-static int
-iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
-{
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        error_report("oom: could not create iscsi bh");
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-    return 0;
-}
-
-static void
-iscsi_readv_writev_bh_cb(void *p)
-{
-    IscsiAIOCB *acb = p;
-
-    qemu_bh_delete(acb->bh);
-
-    if (acb->canceled == 0) {
-        acb->common.cb(acb->common.opaque, acb->status);
-    }
-
-    qemu_aio_release(acb);
-}
-
-
 static void
 iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
                      void *command_data, void *opaque)
-- 
1.7.11.2

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 2/3] iscsi: simplify iscsi_schedule_bh
  2012-08-18 21:49 [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 1/3] iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb Paolo Bonzini
@ 2012-08-18 21:49 ` Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 3/3] iscsi: fix races between task completion and abort Paolo Bonzini
  2012-08-19  7:55 ` [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / " Stefan Priebe
  3 siblings, 0 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-18 21:49 UTC (permalink / raw)
  To: qemu-devel, s.priebe; +Cc: ronniesahlberg

It is always used with the same callback, remove the argument.  And
its return value is never used, assume allocation succeeds.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/iscsi.c | 24 +++++++++---------------
 1 file modificato, 9 inserzioni(+), 15 rimozioni(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 7cfd752..fd5ac3b 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -73,7 +73,7 @@ struct IscsiTask {
 };
 
 static void
-iscsi_readv_writev_bh_cb(void *p)
+iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
 
@@ -86,17 +86,11 @@ iscsi_readv_writev_bh_cb(void *p)
     qemu_aio_release(acb);
 }
 
-static int
-iscsi_schedule_bh(QEMUBHFunc *cb, IscsiAIOCB *acb)
+static void
+iscsi_schedule_bh(IscsiAIOCB *acb)
 {
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        error_report("oom: could not create iscsi bh");
-        return -EIO;
-    }
-
+    acb->bh = qemu_bh_new(iscsi_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
-    return 0;
 }
 
 
@@ -211,7 +205,7 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
     scsi_free_scsi_task(acb->task);
     acb->task = NULL;
 }
@@ -312,7 +306,7 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
     scsi_free_scsi_task(acb->task);
     acb->task = NULL;
 }
@@ -428,7 +422,7 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
     scsi_free_scsi_task(acb->task);
     acb->task = NULL;
 }
@@ -482,7 +476,7 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
         acb->status = -EIO;
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
     scsi_free_scsi_task(acb->task);
     acb->task = NULL;
 }
@@ -559,7 +553,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
         memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
     }
 
-    iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+    iscsi_schedule_bh(acb);
     scsi_free_scsi_task(acb->task);
     acb->task = NULL;
 }
-- 
1.7.11.2

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [Qemu-devel] [PATCH 3/3] iscsi: fix races between task completion and abort
  2012-08-18 21:49 [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 1/3] iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb Paolo Bonzini
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 2/3] iscsi: simplify iscsi_schedule_bh Paolo Bonzini
@ 2012-08-18 21:49 ` Paolo Bonzini
  2012-08-19  7:55 ` [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / " Stefan Priebe
  3 siblings, 0 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-18 21:49 UTC (permalink / raw)
  To: qemu-devel, s.priebe; +Cc: ronniesahlberg

This patch fixes two main issues with block/iscsi.c:

1) iscsi_task_mgmt_abort_task_async calls iscsi_scsi_task_cancel which
was also directly called in iscsi_aio_cancel

2) a race between task completion and task abortion could happen cause
the scsi_free_scsi_task were done before iscsi_schedule_bh has finished.
To fix this, all the freeing of IscsiTasks and releasing of the AIOCBs
is centralized in iscsi_bh_cb, independent of whether the SCSI command
has completed or was cancelled.

3) iscsi_aio_cancel was not synchronously waiting for the end of the
command.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/iscsi.c | 54 +++++++++++++++++++++++++-----------------------------
 1 file modificato, 25 inserzioni(+), 29 rimozioni(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index fd5ac3b..70d6908 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -83,12 +83,20 @@ iscsi_bh_cb(void *p)
         acb->common.cb(acb->common.opaque, acb->status);
     }
 
+    if (acb->task != NULL) {
+        scsi_free_scsi_task(acb->task);
+        acb->task = NULL;
+    }
+
     qemu_aio_release(acb);
 }
 
 static void
 iscsi_schedule_bh(IscsiAIOCB *acb)
 {
+    if (acb->bh) {
+        return;
+    }
     acb->bh = qemu_bh_new(iscsi_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
 }
@@ -98,6 +106,10 @@ static void
 iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data,
                     void *private_data)
 {
+    IscsiAIOCB *acb = private_data;
+
+    acb->status = -ECANCELED;
+    iscsi_schedule_bh(acb);
 }
 
 static void
@@ -106,15 +118,19 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb)
     IscsiAIOCB *acb = (IscsiAIOCB *)blockacb;
     IscsiLun *iscsilun = acb->iscsilun;
 
-    acb->common.cb(acb->common.opaque, -ECANCELED);
+    if (acb->status != -EINPROGRESS) {
+        return;
+    }
+
     acb->canceled = 1;
 
     /* send a task mgmt call to the target to cancel the task on the target */
     iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task,
-                                     iscsi_abort_task_cb, NULL);
+                                     iscsi_abort_task_cb, acb);
 
-    /* then also cancel the task locally in libiscsi */
-    iscsi_scsi_task_cancel(iscsilun->iscsi, acb->task);
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
 }
 
 static AIOPool iscsi_aio_pool = {
@@ -192,9 +208,6 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
     g_free(acb->buf);
 
     if (acb->canceled != 0) {
-        qemu_aio_release(acb);
-        scsi_free_scsi_task(acb->task);
-        acb->task = NULL;
         return;
     }
 
@@ -206,8 +219,6 @@ iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status,
     }
 
     iscsi_schedule_bh(acb);
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
 }
 
 static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun)
@@ -236,6 +247,7 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
     acb->qiov     = qiov;
 
     acb->canceled   = 0;
+    acb->status     = -EINPROGRESS;
 
     /* XXX we should pass the iovec to write16 to avoid the extra copy */
     /* this will allow us to get rid of 'buf' completely */
@@ -293,9 +305,6 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
     trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled);
 
     if (acb->canceled != 0) {
-        qemu_aio_release(acb);
-        scsi_free_scsi_task(acb->task);
-        acb->task = NULL;
         return;
     }
 
@@ -307,8 +316,6 @@ iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status,
     }
 
     iscsi_schedule_bh(acb);
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
 }
 
 static BlockDriverAIOCB *
@@ -334,6 +341,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num,
     acb->qiov     = qiov;
 
     acb->canceled    = 0;
+    acb->status      = -EINPROGRESS;
     acb->read_size   = qemu_read_size;
     acb->buf         = NULL;
 
@@ -409,9 +417,6 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
     IscsiAIOCB *acb = opaque;
 
     if (acb->canceled != 0) {
-        qemu_aio_release(acb);
-        scsi_free_scsi_task(acb->task);
-        acb->task = NULL;
         return;
     }
 
@@ -423,8 +428,6 @@ iscsi_synccache10_cb(struct iscsi_context *iscsi, int status,
     }
 
     iscsi_schedule_bh(acb);
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
 }
 
 static BlockDriverAIOCB *
@@ -439,6 +442,7 @@ iscsi_aio_flush(BlockDriverState *bs,
 
     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->status     = -EINPROGRESS;
 
     acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
                                          0, 0, 0, 0,
@@ -463,9 +467,6 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
     IscsiAIOCB *acb = opaque;
 
     if (acb->canceled != 0) {
-        qemu_aio_release(acb);
-        scsi_free_scsi_task(acb->task);
-        acb->task = NULL;
         return;
     }
 
@@ -477,8 +478,6 @@ iscsi_unmap_cb(struct iscsi_context *iscsi, int status,
     }
 
     iscsi_schedule_bh(acb);
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
 }
 
 static BlockDriverAIOCB *
@@ -495,6 +494,7 @@ iscsi_aio_discard(BlockDriverState *bs,
 
     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->status     = -EINPROGRESS;
 
     list[0].lba = sector_qemu2lun(sector_num, iscsilun);
     list[0].num = nb_sectors * BDRV_SECTOR_SIZE / iscsilun->block_size;
@@ -523,9 +523,6 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
     IscsiAIOCB *acb = opaque;
 
     if (acb->canceled != 0) {
-        qemu_aio_release(acb);
-        scsi_free_scsi_task(acb->task);
-        acb->task = NULL;
         return;
     }
 
@@ -554,8 +551,6 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
     }
 
     iscsi_schedule_bh(acb);
-    scsi_free_scsi_task(acb->task);
-    acb->task = NULL;
 }
 
 static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
@@ -573,6 +568,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
 
     acb->iscsilun = iscsilun;
     acb->canceled    = 0;
+    acb->status      = -EINPROGRESS;
     acb->buf         = NULL;
     acb->ioh         = buf;
 
-- 
1.7.11.2

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-18 21:49 [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort Paolo Bonzini
                   ` (2 preceding siblings ...)
  2012-08-18 21:49 ` [Qemu-devel] [PATCH 3/3] iscsi: fix races between task completion and abort Paolo Bonzini
@ 2012-08-19  7:55 ` Stefan Priebe
  2012-08-19 13:11   ` Paolo Bonzini
  3 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe @ 2012-08-19  7:55 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, ronniesahlberg

Hi Paolo,

Am 18.08.2012 23:49, schrieb Paolo Bonzini:
> Hi Stefan,
>
> this is my version of your patch.  I think the flow of the code is a
> bit simpler (or at least matches other implementations of cancellation).
> Can you test it on your test case?
I'm really sorry but your patch doesn't work at all. I'm not even able 
to start the VM. KVM process hangs and never detaches itself.

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-19  7:55 ` [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / " Stefan Priebe
@ 2012-08-19 13:11   ` Paolo Bonzini
  2012-08-19 19:22     ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-19 13:11 UTC (permalink / raw)
  To: Stefan Priebe; +Cc: qemu-devel, ronniesahlberg

Il 19/08/2012 09:55, Stefan Priebe ha scritto:
> Hi Paolo,
> 
> Am 18.08.2012 23:49, schrieb Paolo Bonzini:
>> Hi Stefan,
>>
>> this is my version of your patch.  I think the flow of the code is a
>> bit simpler (or at least matches other implementations of cancellation).
>> Can you test it on your test case?
> I'm really sorry but your patch doesn't work at all. I'm not even able
> to start the VM. KVM process hangs and never detaches itself.

No problem, my fault---I'm just back and I haven't really started again
all my stuff, so the patch was not tested.

This should fix it, though.

Paolo


diff --git a/block/iscsi.c b/block/iscsi.c
index 74ada64..0b96165 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -247,6 +247,7 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t
sector_num,
     acb->qiov     = qiov;

     acb->canceled   = 0;
+    acb->bh         = NULL;
     acb->status     = -EINPROGRESS;

     /* XXX we should pass the iovec to write16 to avoid the extra copy */
@@ -341,6 +342,7 @@ iscsi_aio_readv(BlockDriverState *bs, int64_t
sector_num,
     acb->qiov     = qiov;

     acb->canceled    = 0;
+    acb->bh          = NULL;
     acb->status      = -EINPROGRESS;
     acb->read_size   = qemu_read_size;
     acb->buf         = NULL;
@@ -442,6 +444,7 @@ iscsi_aio_flush(BlockDriverState *bs,

     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->bh         = NULL;
     acb->status     = -EINPROGRESS;

     acb->task = iscsi_synchronizecache10_task(iscsi, iscsilun->lun,
@@ -494,6 +497,7 @@ iscsi_aio_discard(BlockDriverState *bs,

     acb->iscsilun = iscsilun;
     acb->canceled   = 0;
+    acb->bh         = NULL;
     acb->status     = -EINPROGRESS;

     list[0].lba = sector_qemu2lun(sector_num, iscsilun);
@@ -568,6 +572,7 @@ static BlockDriverAIOCB
*iscsi_aio_ioctl(BlockDriverState *bs,

     acb->iscsilun = iscsilun;
     acb->canceled    = 0;
+    acb->bh          = NULL;
     acb->status      = -EINPROGRESS;
     acb->buf         = NULL;
     acb->ioh         = buf;

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-19 13:11   ` Paolo Bonzini
@ 2012-08-19 19:22     ` Stefan Priebe - Profihost AG
  2012-08-20  7:22       ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-08-19 19:22 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, ronniesahlberg

Am 19.08.2012 15:11, schrieb Paolo Bonzini:
> No problem, my fault---I'm just back and I haven't really started again
> all my stuff, so the patch was not tested.
>
> This should fix it, though.

Booting works fine now. But the VM starts to hang after trying to unmap 
large regions. No segfault or so just not reacting anymore.

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-19 19:22     ` Stefan Priebe - Profihost AG
@ 2012-08-20  7:22       ` Paolo Bonzini
  2012-08-20  7:34         ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-20  7:22 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel, ronniesahlberg

Il 19/08/2012 21:22, Stefan Priebe - Profihost AG ha scritto:
> 
>> No problem, my fault---I'm just back and I haven't really started again
>> all my stuff, so the patch was not tested.
>>
>> This should fix it, though.
> 
> Booting works fine now. But the VM starts to hang after trying to unmap
> large regions. No segfault or so just not reacting anymore.

This is expected; unfortunately cancellation right now is a synchronous
operation in the block layer.  SCSI is the first big user of
cancellation, and it would indeed benefit from asynchronous cancellation.

Without these three patches, you risk corruption in case the following
happens:

    qemu                 target
  -----------------------------------
    send unmap -------->
    cancel unmap ------>
    send write -------->
       <---------------- complete write
                         <unmap just written sector>
       <---------------- complete unmap
       <---------------- cancellation done (unmap complete)

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-20  7:22       ` Paolo Bonzini
@ 2012-08-20  7:34         ` Stefan Priebe - Profihost AG
  2012-08-20  8:08           ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-08-20  7:34 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, ronniesahlberg

Am 20.08.2012 09:22, schrieb Paolo Bonzini:
> Il 19/08/2012 21:22, Stefan Priebe - Profihost AG ha scritto:
>>
>>> No problem, my fault---I'm just back and I haven't really started again
>>> all my stuff, so the patch was not tested.
>>>
>>> This should fix it, though.
>>
>> Booting works fine now. But the VM starts to hang after trying to unmap
>> large regions. No segfault or so just not reacting anymore.
>
> This is expected; unfortunately cancellation right now is a synchronous
> operation in the block layer.  SCSI is the first big user of
> cancellation, and it would indeed benefit from asynchronous cancellation.
>
> Without these three patches, you risk corruption in case the following
> happens:
>
>      qemu                 target
>    -----------------------------------
>      send unmap -------->
>      cancel unmap ------>
>      send write -------->
>         <---------------- complete write
>                           <unmap just written sector>
>         <---------------- complete unmap
>         <---------------- cancellation done (unmap complete)

mhm OK that makes sense. But i cannot even login via SSH and i also see 
no cancellation message in kernel log.

So what is the right way to test these patches? With my version i can 
still work via SSH.

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-20  7:34         ` Stefan Priebe - Profihost AG
@ 2012-08-20  8:08           ` Paolo Bonzini
  2012-08-20  8:12             ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-20  8:08 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel, ronniesahlberg

Il 20/08/2012 09:34, Stefan Priebe - Profihost AG ha scritto:
>>> Booting works fine now. But the VM starts to hang after trying to unmap
>>> large regions. No segfault or so just not reacting anymore.
>>
>> This is expected; unfortunately cancellation right now is a synchronous
>> operation in the block layer.  SCSI is the first big user of
>> cancellation, and it would indeed benefit from asynchronous cancellation.
>>
>> Without these three patches, you risk corruption in case the following
>> happens:
>>
>>      qemu                 target
>>    -----------------------------------
>>      send unmap -------->
>>      cancel unmap ------>
>>      send write -------->
>>         <---------------- complete write
>>                           <unmap just written sector>
>>         <---------------- complete unmap
>>         <---------------- cancellation done (unmap complete)
> 
> mhm OK that makes sense. But i cannot even login via SSH

That's because the "big QEMU lock" is held by the thread that called
qemu_aio_cancel.

> and i also see
> no cancellation message in kernel log.

And that's because the UNMAP actually ultimately succeeds.  You'll
probably see soft lockup messages though.

The solution here is to bump the timeout of the UNMAP command (either in
the kernel or in libiscsi, I didn't really understand who's at fault).

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-20  8:08           ` Paolo Bonzini
@ 2012-08-20  8:12             ` Stefan Priebe - Profihost AG
  2012-08-20 22:36               ` ronnie sahlberg
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-08-20  8:12 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, ronniesahlberg

Hi Ronnie,

Am 20.08.2012 10:08, schrieb Paolo Bonzini:
> That's because the "big QEMU lock" is held by the thread that called
> qemu_aio_cancel.
>
>> and i also see
>> no cancellation message in kernel log.
>
> And that's because the UNMAP actually ultimately succeeds.  You'll
> probably see soft lockup messages though.
>
> The solution here is to bump the timeout of the UNMAP command (either in
> the kernel or in libiscsi, I didn't really understand who's at fault).

What's your suggestion / idea about that?

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-20  8:12             ` Stefan Priebe - Profihost AG
@ 2012-08-20 22:36               ` ronnie sahlberg
  2012-08-21  7:22                 ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: ronnie sahlberg @ 2012-08-20 22:36 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: Paolo Bonzini, qemu-devel

On Mon, Aug 20, 2012 at 6:12 PM, Stefan Priebe - Profihost AG
<s.priebe@profihost.ag> wrote:
> Hi Ronnie,
>
> Am 20.08.2012 10:08, schrieb Paolo Bonzini:
>
>> That's because the "big QEMU lock" is held by the thread that called
>> qemu_aio_cancel.
>>
>>> and i also see
>>> no cancellation message in kernel log.
>>
>>
>> And that's because the UNMAP actually ultimately succeeds.  You'll
>> probably see soft lockup messages though.
>>
>> The solution here is to bump the timeout of the UNMAP command (either in
>> the kernel or in libiscsi, I didn't really understand who's at fault).
>
>
> What's your suggestion / idea about that?
>

There are no timeouts in libiscsi itself.
But you can probably tweak it through the guest kernel :


$ cat /sys/bus/scsi/devices/0\:0\:0\:0/timeout
30

should be the default scsi timeout for this device, and

$ cat /sys/bus/scsi/devices/0\:0\:0\:0/queue_depth
31

would be how many concurrent i/o that the guest will drive to the device.


When performing the UNMAPS, maybe setting timeout to something really
big, and at the same time also dropping queue-depth to something
really small so that the guest kernel will not be able to send so many
UNMAPs concurrently.


ronnie s

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-20 22:36               ` ronnie sahlberg
@ 2012-08-21  7:22                 ` Stefan Priebe - Profihost AG
  2012-08-21  7:30                   ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-08-21  7:22 UTC (permalink / raw)
  To: ronnie sahlberg; +Cc: Paolo Bonzini, qemu-devel

Am 21.08.2012 00:36, schrieb ronnie sahlberg:
> On Mon, Aug 20, 2012 at 6:12 PM, Stefan Priebe - Profihost AG
> <s.priebe@profihost.ag> wrote:
>> Hi Ronnie,
>>
>> Am 20.08.2012 10:08, schrieb Paolo Bonzini:
>>
>>> That's because the "big QEMU lock" is held by the thread that called
>>> qemu_aio_cancel.
>>>
>>>> and i also see
>>>> no cancellation message in kernel log.
>>>
>>>
>>> And that's because the UNMAP actually ultimately succeeds.  You'll
>>> probably see soft lockup messages though.
>>>
>>> The solution here is to bump the timeout of the UNMAP command (either in
>>> the kernel or in libiscsi, I didn't really understand who's at fault).
>>
>>
>> What's your suggestion / idea about that?
>>
>
> There are no timeouts in libiscsi itself.
> But you can probably tweak it through the guest kernel :
>
>
> $ cat /sys/bus/scsi/devices/0\:0\:0\:0/timeout
> 30
>
> should be the default scsi timeout for this device, and
>
> $ cat /sys/bus/scsi/devices/0\:0\:0\:0/queue_depth
> 31
>
> would be how many concurrent i/o that the guest will drive to the device.
>
>
> When performing the UNMAPS, maybe setting timeout to something really
> big, and at the same time also dropping queue-depth to something
> really small so that the guest kernel will not be able to send so many
> UNMAPs concurrently.

How is this relevant to the fact, that i can't even work with SSH any 
longer with paolo's patch while UNMAPs are running? With my patchset you 
can still work on the machine.

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort
  2012-08-21  7:22                 ` Stefan Priebe - Profihost AG
@ 2012-08-21  7:30                   ` Paolo Bonzini
  2012-11-06  8:41                     ` [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-08-21  7:30 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel, ronnie sahlberg

> > When performing the UNMAPS, maybe setting timeout to something
> > really big, and at the same time also dropping queue-depth to something
> > really small so that the guest kernel will not be able to send so
> > many UNMAPs concurrently.
> 
> How is this relevant to the fact, that i can't even work with SSH any
> longer with paolo's patch while UNMAPs are running? With my patchset
> you can still work on the machine.

It is relevant because if you tweak the timeouts you will not hit the
cancel at all.

I would like to get a trace of the commands that are sent, so that we
can attack the problem in the guest kernel.  For example, if it's sending
UNMAPs that span 100GB or so, we could enforce a limit in the guest kernel
on the maximum number of discarded blocks per UNMAP.  But if the problem
is the _number_ of UNMAPs rather than the size, it would need a different
heuristic.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-08-21  7:30                   ` Paolo Bonzini
@ 2012-11-06  8:41                     ` Stefan Priebe - Profihost AG
  2012-11-06 22:42                       ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-06  8:41 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Hello list,

i wantes to use scsi unmap with rbd. rbd documention says you need to 
set discard_granularity=512 for the device. I'm using qemu 1.2.

If i set this and send an UNMAP command i get this kernel output:

Sense Key : Aborted Command [current]
sd 2:0:0:1: [sdb]
Add. Sense: I/O process terminated
sd 2:0:0:1: [sdb] CDB:
Write same(16): 93 08 00 00 00 00 00 00 00 00 00 7f ff ff 00 00
end_request: I/O error, dev sdb, sector 0
sd 2:0:0:1: [sdb]
Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
sd 2:0:0:1: [sdb]
Sense Key : Aborted Command [current]
sd 2:0:0:1: [sdb]
Add. Sense: I/O process terminated
sd 2:0:0:1: [sdb] CDB:
Write same(16): 93 08 00 00 00 00 01 ff ff fc 00 7f ff ff 00 00
end_request: I/O error, dev sdb, sector 33554428
sd 2:0:0:1: [sdb]
Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
sd 2:0:0:1: [sdb]
Sense Key : Aborted Command [current]
sd 2:0:0:1: [sdb]
Add. Sense: I/O process terminated
sd 2:0:0:1: [sdb] CDB:
Write same(16): 93 08 00 00 00 00 00 ff ff fe 00 7f ff ff 00 00
end_request: I/O error, dev sdb, sector 16777214

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-06  8:41                     ` [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands Stefan Priebe - Profihost AG
@ 2012-11-06 22:42                       ` Paolo Bonzini
  2012-11-07 18:57                         ` Stefan Priebe
  2012-11-18 22:00                         ` Stefan Priebe
  0 siblings, 2 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-06 22:42 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel


> i wantes to use scsi unmap with rbd. rbd documention says you need to
> set discard_granularity=512 for the device. I'm using qemu 1.2.
> 
> If i set this and send an UNMAP command i get this kernel output:

The discard request is failing.  Please check why with a breakpoint on
rbd_aio_discard_wrapper or scsi_handle_rw_error for example.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-06 22:42                       ` Paolo Bonzini
@ 2012-11-07 18:57                         ` Stefan Priebe
  2012-11-18 22:00                         ` Stefan Priebe
  1 sibling, 0 replies; 42+ messages in thread
From: Stefan Priebe @ 2012-11-07 18:57 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Am 06.11.2012 23:42, schrieb Paolo Bonzini:
>
>> i wantes to use scsi unmap with rbd. rbd documention says you need to
>> set discard_granularity=512 for the device. I'm using qemu 1.2.
>>
>> If i set this and send an UNMAP command i get this kernel output:
>
> The discard request is failing.  Please check why with a breakpoint on
> rbd_aio_discard_wrapper or scsi_handle_rw_error for example.

I've no idea about setting breakpoints and analyse what happens... no C 
coder just perl. I can for sure modify code and compile... but i don't 
know how todo that.

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-06 22:42                       ` Paolo Bonzini
  2012-11-07 18:57                         ` Stefan Priebe
@ 2012-11-18 22:00                         ` Stefan Priebe
  2012-11-19  8:10                           ` Paolo Bonzini
  1 sibling, 1 reply; 42+ messages in thread
From: Stefan Priebe @ 2012-11-18 22:00 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Hi Paolo,

Am 06.11.2012 23:42, schrieb Paolo Bonzini:
>
>> i wantes to use scsi unmap with rbd. rbd documention says you need to
>> set discard_granularity=512 for the device. I'm using qemu 1.2.
>>
>> If i set this and send an UNMAP command i get this kernel output:
>
> The discard request is failing.  Please check why with a breakpoint on
> rbd_aio_discard_wrapper or scsi_handle_rw_error for example.
>
> Paolo

I'm sorry the discard requests aren't failing. Qemu / Block driver 
starts to cancel a bunch of requests.

[   39.577194] sd 2:0:0:1: [sdb]
[   39.577194] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[   39.577194] sd 2:0:0:1: [sdb]
[   39.577194] Sense Key : Aborted Command [current]
[   39.577194] sd 2:0:0:1: [sdb]
[   39.577194] Add. Sense: I/O process terminated
[   39.577194] sd 2:0:0:1: [sdb] CDB:
[   39.577194] Write same(16): 93 08 00 00 00 00 04 7f ff f7 00 62 00 09 
00 00
[   39.577194] end_request: I/O error, dev sdb, sector 75497463

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-18 22:00                         ` Stefan Priebe
@ 2012-11-19  8:10                           ` Paolo Bonzini
  2012-11-19  9:36                             ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19  8:10 UTC (permalink / raw)
  To: Stefan Priebe; +Cc: qemu-devel

Il 18/11/2012 23:00, Stefan Priebe ha scritto:
> Hi Paolo,
> 
> Am 06.11.2012 23:42, schrieb Paolo Bonzini:
>>
>>> i wantes to use scsi unmap with rbd. rbd documention says you need to
>>> set discard_granularity=512 for the device. I'm using qemu 1.2.
>>>
>>> If i set this and send an UNMAP command i get this kernel output:
>>
>> The discard request is failing.  Please check why with a breakpoint on
>> rbd_aio_discard_wrapper or scsi_handle_rw_error for example.
>>
>> Paolo
> 
> I'm sorry the discard requests aren't failing. Qemu / Block driver
> starts to cancel a bunch of requests.

That is being done in the kernel (the guest, I think) because the UNMAPs
are taking too long.

Paolo

> [   39.577194] sd 2:0:0:1: [sdb]
> [   39.577194] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
> [   39.577194] sd 2:0:0:1: [sdb]
> [   39.577194] Sense Key : Aborted Command [current]
> [   39.577194] sd 2:0:0:1: [sdb]
> [   39.577194] Add. Sense: I/O process terminated
> [   39.577194] sd 2:0:0:1: [sdb] CDB:
> [   39.577194] Write same(16): 93 08 00 00 00 00 04 7f ff f7 00 62 00 09
> 00 00
> [   39.577194] end_request: I/O error, dev sdb, sector 75497463

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19  8:10                           ` Paolo Bonzini
@ 2012-11-19  9:36                             ` Stefan Priebe - Profihost AG
  2012-11-19  9:54                               ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19  9:36 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Hi Paolo,

Am 19.11.2012 09:10, schrieb Paolo Bonzini:
>> I'm sorry the discard requests aren't failing. Qemu / Block driver
>> starts to cancel a bunch of requests.
>
> That is being done in the kernel (the guest, I think) because the UNMAPs
> are taking too long.

That makes sense. RBD handles discards as buffered I/O. When i do an 
mkfs.xfs on a 30GB device i see around 900 pending discard requests. rbd 
finishes them all before reporting success.

If it is correct that a 3.6.7 kernel sends as many discard requests i 
only the a solution in using unbuffered I/O for discards.

Do you know what is the correct way?

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19  9:36                             ` Stefan Priebe - Profihost AG
@ 2012-11-19  9:54                               ` Paolo Bonzini
  2012-11-19  9:59                                 ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19  9:54 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel

Il 19/11/2012 10:36, Stefan Priebe - Profihost AG ha scritto:
> Hi Paolo,
> 
> Am 19.11.2012 09:10, schrieb Paolo Bonzini:
>>> I'm sorry the discard requests aren't failing. Qemu / Block driver
>>> starts to cancel a bunch of requests.
>>
>> That is being done in the kernel (the guest, I think) because the UNMAPs
>> are taking too long.
> 
> That makes sense. RBD handles discards as buffered I/O. When i do an
> mkfs.xfs on a 30GB device i see around 900 pending discard requests. rbd
> finishes them all before reporting success.
> 
> If it is correct that a 3.6.7 kernel sends as many discard requests i
> only the a solution in using unbuffered I/O for discards.
> 
> Do you know what is the correct way?

I think the correct fix is to serialize them in the kernel.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19  9:54                               ` Paolo Bonzini
@ 2012-11-19  9:59                                 ` Stefan Priebe - Profihost AG
  2012-11-19 10:06                                   ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19  9:59 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Am 19.11.2012 10:54, schrieb Paolo Bonzini:
> Il 19/11/2012 10:36, Stefan Priebe - Profihost AG ha scritto:
>> Hi Paolo,
>>
>> Am 19.11.2012 09:10, schrieb Paolo Bonzini:
>>>> I'm sorry the discard requests aren't failing. Qemu / Block driver
>>>> starts to cancel a bunch of requests.
>>>
>>> That is being done in the kernel (the guest, I think) because the UNMAPs
>>> are taking too long.
>>
>> That makes sense. RBD handles discards as buffered I/O. When i do an
>> mkfs.xfs on a 30GB device i see around 900 pending discard requests. rbd
>> finishes them all before reporting success.
>>
>> If it is correct that a 3.6.7 kernel sends as many discard requests i
>> only the a solution in using unbuffered I/O for discards.
>>
>> Do you know what is the correct way?
>
> I think the correct fix is to serialize them in the kernel.

So you mean this is not a bug in rbd or qemu this is a general bug in 
the linux kernel since they implemented discard?

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19  9:59                                 ` Stefan Priebe - Profihost AG
@ 2012-11-19 10:06                                   ` Paolo Bonzini
  2012-11-19 10:13                                     ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 10:06 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel

Il 19/11/2012 10:59, Stefan Priebe - Profihost AG ha scritto:
>>>
>>>
>>> Do you know what is the correct way?
>>
>> I think the correct fix is to serialize them in the kernel.
> 
> So you mean this is not a bug in rbd or qemu this is a general bug in
> the linux kernel since they implemented discard?

Yes.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:06                                   ` Paolo Bonzini
@ 2012-11-19 10:13                                     ` Stefan Priebe - Profihost AG
  2012-11-19 10:23                                       ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 10:13 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Am 19.11.2012 11:06, schrieb Paolo Bonzini:
> Il 19/11/2012 10:59, Stefan Priebe - Profihost AG ha scritto:
>>>>
>>>>
>>>> Do you know what is the correct way?
>>>
>>> I think the correct fix is to serialize them in the kernel.
>>
>> So you mean this is not a bug in rbd or qemu this is a general bug in
>> the linux kernel since they implemented discard?
>
> Yes.

As you're known in the linux dev community ;-) Might you open a thread / 
discussion in the linux kernel mailinglist CC'ing me regarding this 
problem? I think when i open one - no one will listen ;-)

Greets
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:13                                     ` Stefan Priebe - Profihost AG
@ 2012-11-19 10:23                                       ` Paolo Bonzini
  2012-11-19 10:30                                         ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 10:23 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel

Il 19/11/2012 11:13, Stefan Priebe - Profihost AG ha scritto:
>>>>
>>>
>>> So you mean this is not a bug in rbd or qemu this is a general bug in
>>> the linux kernel since they implemented discard?
>>
>> Yes.
> 
> As you're known in the linux dev community ;-) Might you open a thread /
> discussion in the linux kernel mailinglist CC'ing me regarding this
> problem? I think when i open one - no one will listen ;-)

Yeah, I'll try making and sending a patch, that's usually a good way to
convince those guys to listen...

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:23                                       ` Paolo Bonzini
@ 2012-11-19 10:30                                         ` Stefan Priebe - Profihost AG
  2012-11-19 10:36                                           ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 10:30 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Am 19.11.2012 11:23, schrieb Paolo Bonzini:
> Il 19/11/2012 11:13, Stefan Priebe - Profihost AG ha scritto:
>>>>>
>>>>
>>>> So you mean this is not a bug in rbd or qemu this is a general bug in
>>>> the linux kernel since they implemented discard?
>>>
>>> Yes.
>>
>> As you're known in the linux dev community ;-) Might you open a thread /
>> discussion in the linux kernel mailinglist CC'ing me regarding this
>> problem? I think when i open one - no one will listen ;-)
>
> Yeah, I'll try making and sending a patch, that's usually a good way to
> convince those guys to listen...
Thanks.

But do you have any idea why it works with an iscsi / libiscsi backend? 
In that case the kernel does not cancel the I/O.

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:30                                         ` Stefan Priebe - Profihost AG
@ 2012-11-19 10:36                                           ` Paolo Bonzini
  2012-11-19 10:57                                             ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 10:36 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel

Il 19/11/2012 11:30, Stefan Priebe - Profihost AG ha scritto:
> 
> 
> But do you have any idea why it works with an iscsi / libiscsi backend?
> In that case the kernel does not cancel the I/O.

It tries to, but the command ultimately succeeds and the success
response "undoes" the cancel.

http://thread.gmane.org/gmane.comp.emulators.qemu/166232/focus=166316

See also the whole thread (and I think there as a part of it offlist, too).

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:36                                           ` Paolo Bonzini
@ 2012-11-19 10:57                                             ` Stefan Priebe - Profihost AG
  2012-11-19 11:16                                               ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 10:57 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Yeah thats my old thread regarding iscsi und unmap but this works fine 
now since you patched qemu.

Stefan

Am 19.11.2012 11:36, schrieb Paolo Bonzini:
> Il 19/11/2012 11:30, Stefan Priebe - Profihost AG ha scritto:
>>
>>
>> But do you have any idea why it works with an iscsi / libiscsi backend?
>> In that case the kernel does not cancel the I/O.
>
> It tries to, but the command ultimately succeeds and the success
> response "undoes" the cancel.
>
> http://thread.gmane.org/gmane.comp.emulators.qemu/166232/focus=166316
>
> See also the whole thread (and I think there as a part of it offlist, too).
>
> Paolo
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 10:57                                             ` Stefan Priebe - Profihost AG
@ 2012-11-19 11:16                                               ` Paolo Bonzini
  2012-11-19 11:49                                                 ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 11:16 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: qemu-devel

Il 19/11/2012 11:57, Stefan Priebe - Profihost AG ha scritto:
> Yeah thats my old thread regarding iscsi und unmap but this works fine
> now since you patched qemu.

It still causes hangs no?  Though it works apart from that.

Paolo

> Stefan
> 
> Am 19.11.2012 11:36, schrieb Paolo Bonzini:
>> Il 19/11/2012 11:30, Stefan Priebe - Profihost AG ha scritto:
>>>
>>>
>>> But do you have any idea why it works with an iscsi / libiscsi backend?
>>> In that case the kernel does not cancel the I/O.
>>
>> It tries to, but the command ultimately succeeds and the success
>> response "undoes" the cancel.
>>
>> http://thread.gmane.org/gmane.comp.emulators.qemu/166232/focus=166316
>>
>> See also the whole thread (and I think there as a part of it offlist,
>> too).
>>
>> Paolo
>>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 11:16                                               ` Paolo Bonzini
@ 2012-11-19 11:49                                                 ` Stefan Priebe - Profihost AG
  2012-11-19 12:24                                                   ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 11:49 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel

Am 19.11.2012 12:16, schrieb Paolo Bonzini:
> Il 19/11/2012 11:57, Stefan Priebe - Profihost AG ha scritto:
>> Yeah thats my old thread regarding iscsi und unmap but this works fine
>> now since you patched qemu.
>
> It still causes hangs no?  Though it works apart from that.

iscsi/libiscsi and discards works fine since your latest patches:
1bd075f29ea6d11853475c7c42734595720c3ac6
cfb3f5064af2d2e29c976e292c9472dfe9d61e31
27cbd828c617944c0f9603763fdf4fa87e7ad923
b20909195745c34a819aed14ae996b60ab0f591f

But in rbd it starts to cancel the discard requests. Both using the same 
guest and host kernel with the same QEMU version.

Greets,
Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 11:49                                                 ` Stefan Priebe - Profihost AG
@ 2012-11-19 12:24                                                   ` Paolo Bonzini
  2012-11-19 13:01                                                     ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 12:24 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 12:49, Stefan Priebe - Profihost AG ha scritto:
>>
>> It still causes hangs no?  Though it works apart from that.
> 
> iscsi/libiscsi and discards works fine since your latest patches:
> 1bd075f29ea6d11853475c7c42734595720c3ac6
> cfb3f5064af2d2e29c976e292c9472dfe9d61e31
> 27cbd828c617944c0f9603763fdf4fa87e7ad923
> b20909195745c34a819aed14ae996b60ab0f591f

Does the console still hang though?

> But in rbd it starts to cancel the discard requests. Both using the same
> guest and host kernel with the same QEMU version.

rbd's cancellation is broken like libiscsi's was.  So the cancellation
succeeds apparently, but it is subject to the same race introduced in
commit 64e69e8 (iscsi: Fix NULL dereferences / races between task
completion and abort, 2012-08-15) for libiscsi.

It risks corruption in case the following happens:

        guest                  qemu                 rbd server
=========================================================================
        send write 1 -------->
                               send write 1 ------>
        cancel write 1 ------>
                               cancel write 1 ---->
           <------------------ cancelled
        send write 2 -------->
                               send write 2 ------>
                                   <--------------- completed write 2
           <------------------ completed write 2
                                   <--------------- completed write 1

Here, the guest would see write 2 superseding write 1, when in fact the
outcome could have been the opposite.  The right behavior is to return
only after the target says whether the cancellation was done or not.
For libiscsi, it was implemented by the commits you mention.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 12:24                                                   ` Paolo Bonzini
@ 2012-11-19 13:01                                                     ` Stefan Priebe - Profihost AG
  2012-11-19 13:06                                                       ` Paolo Bonzini
  0 siblings, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 13:01 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

Am 19.11.2012 13:24, schrieb Paolo Bonzini:
> Il 19/11/2012 12:49, Stefan Priebe - Profihost AG ha scritto:
>>>
>>> It still causes hangs no?  Though it works apart from that.
>>
>> iscsi/libiscsi and discards works fine since your latest patches:
>> 1bd075f29ea6d11853475c7c42734595720c3ac6
>> cfb3f5064af2d2e29c976e292c9472dfe9d61e31
>> 27cbd828c617944c0f9603763fdf4fa87e7ad923
>> b20909195745c34a819aed14ae996b60ab0f591f
>
> Does the console still hang though?

No everything is absolutely fine.

>> But in rbd it starts to cancel the discard requests. Both using the same
>> guest and host kernel with the same QEMU version.
>
> rbd's cancellation is broken like libiscsi's was.  So the cancellation
> succeeds apparently, but it is subject to the same race introduced in
> commit 64e69e8 (iscsi: Fix NULL dereferences / races between task
> completion and abort, 2012-08-15) for libiscsi.
>
> It risks corruption in case the following happens:
>
>          guest                  qemu                 rbd server
> =========================================================================
>          send write 1 -------->
>                                 send write 1 ------>
>          cancel write 1 ------>
>                                 cancel write 1 ---->
>             <------------------ cancelled
>          send write 2 -------->
>                                 send write 2 ------>
>                                     <--------------- completed write 2
>             <------------------ completed write 2
>                                     <--------------- completed write 1
>
> Here, the guest would see write 2 superseding write 1, when in fact the
> outcome could have been the opposite.  The right behavior is to return
> only after the target says whether the cancellation was done or not.
> For libiscsi, it was implemented by the commits you mention.

So the whole bunch of changes is needed for rbd? Or just 64e69e8? Or all 
mentioned except 64e69e8 as 64e69e8 introduced the problem?

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 13:01                                                     ` Stefan Priebe - Profihost AG
@ 2012-11-19 13:06                                                       ` Paolo Bonzini
  2012-11-19 14:16                                                         ` Stefan Priebe - Profihost AG
  2012-11-19 14:28                                                         ` Stefan Priebe - Profihost AG
  0 siblings, 2 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 13:06 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 14:01, Stefan Priebe - Profihost AG ha scritto:
>> The right behavior is to return
>> only after the target says whether the cancellation was done or not.
>> For libiscsi, it was implemented by the commits you mention.
> 
> So the whole bunch of changes is needed for rbd?

Something like the first three:

1bd075f29ea6d11853475c7c42734595720c3ac6
cfb3f5064af2d2e29c976e292c9472dfe9d61e31
27cbd828c617944c0f9603763fdf4fa87e7ad923

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 13:06                                                       ` Paolo Bonzini
@ 2012-11-19 14:16                                                         ` Stefan Priebe - Profihost AG
  2012-11-19 14:32                                                           ` Paolo Bonzini
  2012-11-19 14:28                                                         ` Stefan Priebe - Profihost AG
  1 sibling, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 14:16 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

Hi Paolo,
Hi Josh,

i started to port the iscsi fixes to rbd. The one think i'm missing is. 
How to tell / call rbd to cancel I/O on the server side?

I grepped librbd source code for abort / cancel but didn't find anything.

Qemu must be able to tell rbd to cancel a specific I/O request.

Greets,
Stefan
Am 19.11.2012 14:06, schrieb Paolo Bonzini:
> Il 19/11/2012 14:01, Stefan Priebe - Profihost AG ha scritto:
>>> The right behavior is to return
>>> only after the target says whether the cancellation was done or not.
>>> For libiscsi, it was implemented by the commits you mention.
>>
>> So the whole bunch of changes is needed for rbd?
>
> Something like the first three:
>
> 1bd075f29ea6d11853475c7c42734595720c3ac6
> cfb3f5064af2d2e29c976e292c9472dfe9d61e31
> 27cbd828c617944c0f9603763fdf4fa87e7ad923
>
> Paolo
>

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 13:06                                                       ` Paolo Bonzini
  2012-11-19 14:16                                                         ` Stefan Priebe - Profihost AG
@ 2012-11-19 14:28                                                         ` Stefan Priebe - Profihost AG
  2012-11-19 14:41                                                           ` Paolo Bonzini
  1 sibling, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 14:28 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

[-- Attachment #1: Type: text/plain, Size: 717 bytes --]

Hi Paolo,

this is my current work status on porting these fixes to rbd. Right now 
the discards get still canceled by the client kernel.

Might you have a look what i have forgotten?

Thanks!

Stefan
Am 19.11.2012 14:06, schrieb Paolo Bonzini:
> Il 19/11/2012 14:01, Stefan Priebe - Profihost AG ha scritto:
>>> The right behavior is to return
>>> only after the target says whether the cancellation was done or not.
>>> For libiscsi, it was implemented by the commits you mention.
>>
>> So the whole bunch of changes is needed for rbd?
>
> Something like the first three:
>
> 1bd075f29ea6d11853475c7c42734595720c3ac6
> cfb3f5064af2d2e29c976e292c9472dfe9d61e31
> 27cbd828c617944c0f9603763fdf4fa87e7ad923
>
> Paolo
>

[-- Attachment #2: 0001-do-not-check-for-cancellation-in-qemu_rbd_complete_a.patch --]
[-- Type: text/x-patch, Size: 1041 bytes --]

>From 486fdb8b18310ff32ca64fbb2e0217c37319cff4 Mon Sep 17 00:00:00 2001
From: Stefan Priebe <s.priebe@profihost.ag>
Date: Mon, 19 Nov 2012 14:31:40 +0100
Subject: [PATCH 1/2] do not check for cancellation in qemu_rbd_complete_aio


Signed-off-by: Stefan Priebe <s.priebe@profhost.ag>
---
 block/rbd.c |    8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 5a0f79f..583bcc3 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -376,12 +376,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     RBDAIOCB *acb = rcb->acb;
     int64_t r;
 
-    if (acb->cancelled) {
-        qemu_vfree(acb->bounce);
-        qemu_aio_release(acb);
-        goto done;
-    }
-
     r = rcb->ret;
 
     if (acb->cmd == RBD_AIO_WRITE ||
@@ -409,7 +403,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     /* Note that acb->bh can be NULL in case where the aio was cancelled */
     acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
-done:
+
     g_free(rcb);
 }
 
-- 
1.7.10.4


[-- Attachment #3: 0002-rbd-fix-races-between-io-completition-and-abort.patch --]
[-- Type: text/x-patch, Size: 2758 bytes --]

>From e9eac2c7ed7b98ff102ab7da4573f081ebca32fa Mon Sep 17 00:00:00 2001
From: Stefan Priebe <s.priebe@profihost.ag>
Date: Mon, 19 Nov 2012 15:01:16 +0100
Subject: [PATCH 2/2] rbd: fix races between io completition and abort


Signed-off-by: Stefan Priebe <s.priebe@profhost.ag>
---
 block/rbd.c |   40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/block/rbd.c b/block/rbd.c
index 583bcc3..ae1d03b 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -77,6 +77,7 @@ typedef struct RBDAIOCB {
     int error;
     struct BDRVRBDState *s;
     int cancelled;
+    int status;
 } RBDAIOCB;
 
 typedef struct RADOSCB {
@@ -376,6 +377,10 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     RBDAIOCB *acb = rcb->acb;
     int64_t r;
 
+    if (acb->bh) {
+        return;
+    }
+
     r = rcb->ret;
 
     if (acb->cmd == RBD_AIO_WRITE ||
@@ -560,6 +565,20 @@ static void qemu_rbd_close(BlockDriverState *bs)
     rados_shutdown(s->cluster);
 }
 
+static void qemu_rbd_aio_abort(void *private_data)
+{
+    RBDAIOCB *acb = (RBDAIOCB *) private_data;
+
+    acb->status = -ECANCELED;
+
+    if (acb->bh) {
+        return;
+    }
+
+    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
+    qemu_bh_schedule(acb->bh);
+}
+
 /*
  * Cancel aio. Since we don't reference acb in a non qemu threads,
  * it is safe to access it here.
@@ -567,7 +586,22 @@ static void qemu_rbd_close(BlockDriverState *bs)
 static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     RBDAIOCB *acb = (RBDAIOCB *) blockacb;
+
+    if (acb->status != -EINPROGRESS) {
+        return;
+    }
+
     acb->cancelled = 1;
+
+    // TODO / FIXME: send an abort command to rbd
+    // Normally we should call abort librbd and
+    // librbd gets qemu_rbd_aio_abort as a callback function
+    // i wasn't able to find an abort function in librbd at all
+    qemu_rbd_aio_abort(acb);
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
 }
 
 static AIOPool rbd_aio_pool = {
@@ -636,10 +670,13 @@ static void rbd_aio_bh_cb(void *opaque)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
     qemu_bh_delete(acb->bh);
     acb->bh = NULL;
 
+    if (acb->cancelled == 0) {
+        acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    }
+
     qemu_aio_release(acb);
 }
 
@@ -685,6 +722,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb->s = s;
     acb->cancelled = 0;
     acb->bh = NULL;
+    acb->status = -EINPROGRESS;
 
     if (cmd == RBD_AIO_WRITE) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 14:16                                                         ` Stefan Priebe - Profihost AG
@ 2012-11-19 14:32                                                           ` Paolo Bonzini
  0 siblings, 0 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 14:32 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 15:16, Stefan Priebe - Profihost AG ha scritto:
> Hi Paolo,
> Hi Josh,
> 
> i started to port the iscsi fixes to rbd. The one think i'm missing is.
> How to tell / call rbd to cancel I/O on the server side?
> 
> I grepped librbd source code for abort / cancel but didn't find anything.
> 
> Qemu must be able to tell rbd to cancel a specific I/O request.

Just don't.  QEMU will wait for rbd to finish the operation, instead of
actually having it cancelled.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 14:28                                                         ` Stefan Priebe - Profihost AG
@ 2012-11-19 14:41                                                           ` Paolo Bonzini
  2012-11-19 14:48                                                             ` Stefan Priebe - Profihost AG
  2012-11-19 15:04                                                             ` Stefan Priebe - Profihost AG
  0 siblings, 2 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 14:41 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 15:28, Stefan Priebe - Profihost AG ha scritto:
>  typedef struct RADOSCB {
> @@ -376,6 +377,10 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
>      RBDAIOCB *acb = rcb->acb;
>      int64_t r;
>  
> +    if (acb->bh) {
> +        return;
> +    }
> +
>      r = rcb->ret;
>  
>      if (acb->cmd == RBD_AIO_WRITE ||
> @@ -560,6 +565,20 @@ static void qemu_rbd_close(BlockDriverState *bs)
>      rados_shutdown(s->cluster);
>  }
>  
> +static void qemu_rbd_aio_abort(void *private_data)
> +{
> +    RBDAIOCB *acb = (RBDAIOCB *) private_data;
> +
> +    acb->status = -ECANCELED;
> +
> +    if (acb->bh) {
> +        return;
> +    }
> +
> +    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
> +    qemu_bh_schedule(acb->bh);
> +}

I think this is all unneeded.  Just store rcb->ret into
rcb->acb->status, and your version of qemu_rbd_aio_cancel should just work.

Also, I think the acb->cancelled field is not necessary anymore after
these changes.

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 14:41                                                           ` Paolo Bonzini
@ 2012-11-19 14:48                                                             ` Stefan Priebe - Profihost AG
  2012-11-19 15:03                                                               ` Paolo Bonzini
  2012-11-19 15:04                                                             ` Stefan Priebe - Profihost AG
  1 sibling, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 14:48 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

Am 19.11.2012 15:41, schrieb Paolo Bonzini:
> Il 19/11/2012 15:28, Stefan Priebe - Profihost AG ha scritto:
>>   typedef struct RADOSCB {
>> @@ -376,6 +377,10 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
>>       RBDAIOCB *acb = rcb->acb;
>>       int64_t r;
>>
>> +    if (acb->bh) {
>> +        return;
>> +    }
>> +
>>       r = rcb->ret;
>>
>>       if (acb->cmd == RBD_AIO_WRITE ||
>> @@ -560,6 +565,20 @@ static void qemu_rbd_close(BlockDriverState *bs)
>>       rados_shutdown(s->cluster);
>>   }
>>
>> +static void qemu_rbd_aio_abort(void *private_data)
>> +{
>> +    RBDAIOCB *acb = (RBDAIOCB *) private_data;
>> +
>> +    acb->status = -ECANCELED;
>> +
>> +    if (acb->bh) {
>> +        return;
>> +    }
>> +
>> +    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
>> +    qemu_bh_schedule(acb->bh);
>> +}
>
> I think this is all unneeded.  Just store rcb->ret into
> rcb->acb->status, and your version of qemu_rbd_aio_cancel should just work.
>
> Also, I think the acb->cancelled field is not necessary anymore after
> these changes.

The iscsi driver still relies on canceled that's why i left it here in too.

So you mean in qemu_rbd_complete_aio i should remove the check for 
cancelled and then just overwrite acb->status to that it changes from 
-EINPROGRESS to something else?

And qemu_rbd_aio_cancel should just wait for status != -EINPROGRESS?

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 14:48                                                             ` Stefan Priebe - Profihost AG
@ 2012-11-19 15:03                                                               ` Paolo Bonzini
  0 siblings, 0 replies; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 15:03 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 15:48, Stefan Priebe - Profihost AG ha scritto:
> So you mean in qemu_rbd_complete_aio i should remove the check for
> cancelled and then just overwrite acb->status to that it changes from
> -EINPROGRESS to something else?
> 
> And qemu_rbd_aio_cancel should just wait for status != -EINPROGRESS?

Yes.

paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 14:41                                                           ` Paolo Bonzini
  2012-11-19 14:48                                                             ` Stefan Priebe - Profihost AG
@ 2012-11-19 15:04                                                             ` Stefan Priebe - Profihost AG
  2012-11-19 15:22                                                               ` Paolo Bonzini
  1 sibling, 1 reply; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 15:04 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

[-- Attachment #1: Type: text/plain, Size: 1462 bytes --]

Hi Paolo,

new patch attached. Desciption is still wrong.


 > I think this is all unneeded.  Just store rcb->ret into
 > rcb->acb->status, and your version of qemu_rbd_aio_cancel should just
 > work.
 >
 > Also, I think the acb->cancelled field is not necessary anymore after
 > these changes.


1.) It removes cancelled
2.) It adds status variable
3.) aio cancel now just waits for io completetion

This should fix the write race you mentioned. But it still does not help 
with discard the kernel starts to cancel as the reply takes too long. See:

[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Sense Key : Aborted Command [current]
[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Add. Sense: I/O process terminated
[   49.183366] sd 2:0:0:1: [sdb] CDB:
[   49.183366] Write same(16): 93 08 00 00 00 00 03 ff ff f8 00 7f ff ff 
00 00
[   49.183366] end_request: I/O error, dev sdb, sector 67108856
[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Sense Key : Aborted Command [current]
[   49.183366] sd 2:0:0:1: [sdb]
[   49.183366] Add. Sense: I/O process terminated
[   49.183366] sd 2:0:0:1: [sdb] CDB:
[   49.183366] Write same(16): 93 08 00 00 00 00 04 7f ff f7 00 62 00 09 
00 00
[   49.183366] end_request: I/O error, dev sdb, sector 75497463

Greets,
Stefan


[-- Attachment #2: 0001-fix-cancel-rbd-race.patch --]
[-- Type: text/x-patch, Size: 2284 bytes --]

>From d65f2c2ba8c81842992953dd772355898e702968 Mon Sep 17 00:00:00 2001
From: Stefan Priebe <s.priebe@profhost.ag>
Date: Mon, 19 Nov 2012 15:54:05 +0100
Subject: [PATCH] fix cancel rbd race


Signed-off-by: Stefan Priebe <s.priebe@profhost.ag>
---
 block/rbd.c |   19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 5a0f79f..7b3bcbb 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -76,7 +76,7 @@ typedef struct RBDAIOCB {
     int64_t sector_num;
     int error;
     struct BDRVRBDState *s;
-    int cancelled;
+    int status;
 } RBDAIOCB;
 
 typedef struct RADOSCB {
@@ -376,9 +376,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
     RBDAIOCB *acb = rcb->acb;
     int64_t r;
 
-    if (acb->cancelled) {
-        qemu_vfree(acb->bounce);
-        qemu_aio_release(acb);
+    if (acb->bh) {
         goto done;
     }
 
@@ -406,9 +404,12 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
             acb->ret = r;
         }
     }
+    acb->status = acb->ret;
+
     /* Note that acb->bh can be NULL in case where the aio was cancelled */
     acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
     qemu_bh_schedule(acb->bh);
+
 done:
     g_free(rcb);
 }
@@ -573,7 +574,10 @@ static void qemu_rbd_close(BlockDriverState *bs)
 static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     RBDAIOCB *acb = (RBDAIOCB *) blockacb;
-    acb->cancelled = 1;
+
+    while (acb->status == -EINPROGRESS) {
+        qemu_aio_wait();
+    }
 }
 
 static AIOPool rbd_aio_pool = {
@@ -642,10 +646,11 @@ static void rbd_aio_bh_cb(void *opaque)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
     qemu_bh_delete(acb->bh);
     acb->bh = NULL;
 
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+
     qemu_aio_release(acb);
 }
 
@@ -689,8 +694,8 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb->ret = 0;
     acb->error = 0;
     acb->s = s;
-    acb->cancelled = 0;
     acb->bh = NULL;
+    acb->status = -EINPROGRESS;
 
     if (cmd == RBD_AIO_WRITE) {
         qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 15:04                                                             ` Stefan Priebe - Profihost AG
@ 2012-11-19 15:22                                                               ` Paolo Bonzini
  2012-11-19 15:58                                                                 ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 42+ messages in thread
From: Paolo Bonzini @ 2012-11-19 15:22 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG; +Cc: josh.durgin, qemu-devel, sage

Il 19/11/2012 16:04, Stefan Priebe - Profihost AG ha scritto:
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Sense Key : Aborted Command [current]
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Add. Sense: I/O process terminated
> [   49.183366] sd 2:0:0:1: [sdb] CDB:
> [   49.183366] Write same(16): 93 08 00 00 00 00 03 ff ff f8 00 7f ff ff
> 00 00
> [   49.183366] end_request: I/O error, dev sdb, sector 67108856
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Sense Key : Aborted Command [current]
> [   49.183366] sd 2:0:0:1: [sdb]
> [   49.183366] Add. Sense: I/O process terminated
> [   49.183366] sd 2:0:0:1: [sdb] CDB:
> [   49.183366] Write same(16): 93 08 00 00 00 00 04 7f ff f7 00 62 00 09
> 00 00
> [   49.183366] end_request: I/O error, dev sdb, sector 75497463

This is not a cancel.  It happens when the block layer reports an error.
 You can try adding a printf to rbd_aio_bh_cb, like "if (acb->ret < 0)
printf("error... %d\n", acb->ret);".

Paolo

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands
  2012-11-19 15:22                                                               ` Paolo Bonzini
@ 2012-11-19 15:58                                                                 ` Stefan Priebe - Profihost AG
  0 siblings, 0 replies; 42+ messages in thread
From: Stefan Priebe - Profihost AG @ 2012-11-19 15:58 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: josh.durgin, qemu-devel, sage

Am 19.11.2012 16:22, schrieb Paolo Bonzini:
> Il 19/11/2012 16:04, Stefan Priebe - Profihost AG ha scritto:
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Sense Key : Aborted Command [current]
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Add. Sense: I/O process terminated
>> [   49.183366] sd 2:0:0:1: [sdb] CDB:
>> [   49.183366] Write same(16): 93 08 00 00 00 00 03 ff ff f8 00 7f ff ff
>> 00 00
>> [   49.183366] end_request: I/O error, dev sdb, sector 67108856
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Sense Key : Aborted Command [current]
>> [   49.183366] sd 2:0:0:1: [sdb]
>> [   49.183366] Add. Sense: I/O process terminated
>> [   49.183366] sd 2:0:0:1: [sdb] CDB:
>> [   49.183366] Write same(16): 93 08 00 00 00 00 04 7f ff f7 00 62 00 09
>> 00 00
>> [   49.183366] end_request: I/O error, dev sdb, sector 75497463
>
> This is not a cancel.  It happens when the block layer reports an error.
>   You can try adding a printf to rbd_aio_bh_cb, like "if (acb->ret < 0)
> printf("error... %d\n", acb->ret);".

Yes this one get's interesting values back:
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -512 acb->error: 0
rbd_aio_bh_cb got error back. acb->ret: -1006628352 acb->error: 0

Stefan

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2012-11-19 15:58 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-18 21:49 [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / races between task completion and abort Paolo Bonzini
2012-08-18 21:49 ` [Qemu-devel] [PATCH 1/3] iscsi: move iscsi_schedule_bh and iscsi_readv_writev_bh_cb Paolo Bonzini
2012-08-18 21:49 ` [Qemu-devel] [PATCH 2/3] iscsi: simplify iscsi_schedule_bh Paolo Bonzini
2012-08-18 21:49 ` [Qemu-devel] [PATCH 3/3] iscsi: fix races between task completion and abort Paolo Bonzini
2012-08-19  7:55 ` [Qemu-devel] [PATCH RFT 0/3] iscsi: fix NULL dereferences / " Stefan Priebe
2012-08-19 13:11   ` Paolo Bonzini
2012-08-19 19:22     ` Stefan Priebe - Profihost AG
2012-08-20  7:22       ` Paolo Bonzini
2012-08-20  7:34         ` Stefan Priebe - Profihost AG
2012-08-20  8:08           ` Paolo Bonzini
2012-08-20  8:12             ` Stefan Priebe - Profihost AG
2012-08-20 22:36               ` ronnie sahlberg
2012-08-21  7:22                 ` Stefan Priebe - Profihost AG
2012-08-21  7:30                   ` Paolo Bonzini
2012-11-06  8:41                     ` [Qemu-devel] scsi-hd with discard_granularity and unmap results in Aborted Commands Stefan Priebe - Profihost AG
2012-11-06 22:42                       ` Paolo Bonzini
2012-11-07 18:57                         ` Stefan Priebe
2012-11-18 22:00                         ` Stefan Priebe
2012-11-19  8:10                           ` Paolo Bonzini
2012-11-19  9:36                             ` Stefan Priebe - Profihost AG
2012-11-19  9:54                               ` Paolo Bonzini
2012-11-19  9:59                                 ` Stefan Priebe - Profihost AG
2012-11-19 10:06                                   ` Paolo Bonzini
2012-11-19 10:13                                     ` Stefan Priebe - Profihost AG
2012-11-19 10:23                                       ` Paolo Bonzini
2012-11-19 10:30                                         ` Stefan Priebe - Profihost AG
2012-11-19 10:36                                           ` Paolo Bonzini
2012-11-19 10:57                                             ` Stefan Priebe - Profihost AG
2012-11-19 11:16                                               ` Paolo Bonzini
2012-11-19 11:49                                                 ` Stefan Priebe - Profihost AG
2012-11-19 12:24                                                   ` Paolo Bonzini
2012-11-19 13:01                                                     ` Stefan Priebe - Profihost AG
2012-11-19 13:06                                                       ` Paolo Bonzini
2012-11-19 14:16                                                         ` Stefan Priebe - Profihost AG
2012-11-19 14:32                                                           ` Paolo Bonzini
2012-11-19 14:28                                                         ` Stefan Priebe - Profihost AG
2012-11-19 14:41                                                           ` Paolo Bonzini
2012-11-19 14:48                                                             ` Stefan Priebe - Profihost AG
2012-11-19 15:03                                                               ` Paolo Bonzini
2012-11-19 15:04                                                             ` Stefan Priebe - Profihost AG
2012-11-19 15:22                                                               ` Paolo Bonzini
2012-11-19 15:58                                                                 ` Stefan Priebe - Profihost AG

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.