All of lore.kernel.org
 help / color / mirror / Atom feed
* [RESEND PATCH] hw/dma: fix crash caused by race condition
       [not found] <CGME20220427205118uscas1p25031437c0cdd4363c104be13033f366a@uscas1p2.samsung.com>
@ 2022-04-27 20:51 ` Tong Zhang
  2022-05-30 14:34   ` Philippe Mathieu-Daudé via
  2022-05-30 16:19   ` David Hildenbrand
  0 siblings, 2 replies; 15+ messages in thread
From: Tong Zhang @ 2022-04-27 20:51 UTC (permalink / raw)
  To: Paolo Bonzini, Peter Xu, David Hildenbrand,
	Philippe Mathieu-Daudé,
	qemu-devel
  Cc: ztong0001, Francisco Londono, Tong Zhang

assert(dbs->acb) is meant to check the return value of io_func per
documented in commit 6bee44ea34 ("dma: the passed io_func does not
return NULL"). However, there is a chance that after calling
aio_context_release(dbs->ctx); the dma_blk_cb function is called before
the assertion and dbs->acb is set to NULL again at line 121. Thus when
we run assert at line 181 it will fail.

  softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.

Reported-by: Francisco Londono <f.londono@samsung.com>
Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
---
 softmmu/dma-helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
index 7820fec54c..cb81017928 100644
--- a/softmmu/dma-helpers.c
+++ b/softmmu/dma-helpers.c
@@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
     aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
-    aio_context_release(dbs->ctx);
     assert(dbs->acb);
+    aio_context_release(dbs->ctx);
 }
 
 static void dma_aio_cancel(BlockAIOCB *acb)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-04-27 20:51 ` [RESEND PATCH] hw/dma: fix crash caused by race condition Tong Zhang
@ 2022-05-30 14:34   ` Philippe Mathieu-Daudé via
  2022-05-30 16:19   ` David Hildenbrand
  1 sibling, 0 replies; 15+ messages in thread
From: Philippe Mathieu-Daudé via @ 2022-05-30 14:34 UTC (permalink / raw)
  To: Tong Zhang, Paolo Bonzini, Peter Xu, David Hildenbrand, qemu-devel
  Cc: ztong0001, Francisco Londono, Stefan Hajnoczi, Alexander Bulekov,
	Emanuele Giuseppe Esposito

+Emanuele / Alexander / Stefan

On 27/4/22 22:51, Tong Zhang wrote:
> assert(dbs->acb) is meant to check the return value of io_func per
> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> return NULL"). However, there is a chance that after calling
> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> we run assert at line 181 it will fail.
> 
>    softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> 
> Reported-by: Francisco Londono <f.londono@samsung.com>
> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> ---
>   softmmu/dma-helpers.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> index 7820fec54c..cb81017928 100644
> --- a/softmmu/dma-helpers.c
> +++ b/softmmu/dma-helpers.c
> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>       aio_context_acquire(dbs->ctx);
>       dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>                               dma_blk_cb, dbs, dbs->io_func_opaque);
> -    aio_context_release(dbs->ctx);
>       assert(dbs->acb);
> +    aio_context_release(dbs->ctx);
>   }
>   
>   static void dma_aio_cancel(BlockAIOCB *acb)



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-04-27 20:51 ` [RESEND PATCH] hw/dma: fix crash caused by race condition Tong Zhang
  2022-05-30 14:34   ` Philippe Mathieu-Daudé via
@ 2022-05-30 16:19   ` David Hildenbrand
  2022-06-01  0:20     ` Tong Zhang
  1 sibling, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2022-05-30 16:19 UTC (permalink / raw)
  To: Tong Zhang, Paolo Bonzini, Peter Xu, Philippe Mathieu-Daudé,
	qemu-devel
  Cc: ztong0001, Francisco Londono

On 27.04.22 22:51, Tong Zhang wrote:
> assert(dbs->acb) is meant to check the return value of io_func per
> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> return NULL"). However, there is a chance that after calling
> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> we run assert at line 181 it will fail.
> 
>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> 
> Reported-by: Francisco Londono <f.londono@samsung.com>
> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> ---
>  softmmu/dma-helpers.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> index 7820fec54c..cb81017928 100644
> --- a/softmmu/dma-helpers.c
> +++ b/softmmu/dma-helpers.c
> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>      aio_context_acquire(dbs->ctx);
>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>                              dma_blk_cb, dbs, dbs->io_func_opaque);
> -    aio_context_release(dbs->ctx);
>      assert(dbs->acb);
> +    aio_context_release(dbs->ctx);
>  }
>  
>  static void dma_aio_cancel(BlockAIOCB *acb)

I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
run after you reshuffled the code?

After all, acquire/release is only around the dbs->io_func() call, so I
don't immediately see how it interacts with re-entrance?

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-05-30 16:19   ` David Hildenbrand
@ 2022-06-01  0:20     ` Tong Zhang
  2022-06-01  8:00       ` David Hildenbrand
  0 siblings, 1 reply; 15+ messages in thread
From: Tong Zhang @ 2022-06-01  0:20 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Tong Zhang, Paolo Bonzini, Peter Xu, Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

Hi David,

On Mon, May 30, 2022 at 9:19 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 27.04.22 22:51, Tong Zhang wrote:
> > assert(dbs->acb) is meant to check the return value of io_func per
> > documented in commit 6bee44ea34 ("dma: the passed io_func does not
> > return NULL"). However, there is a chance that after calling
> > aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> > the assertion and dbs->acb is set to NULL again at line 121. Thus when
> > we run assert at line 181 it will fail.
> >
> >   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> >
> > Reported-by: Francisco Londono <f.londono@samsung.com>
> > Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> > ---
> >  softmmu/dma-helpers.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> > index 7820fec54c..cb81017928 100644
> > --- a/softmmu/dma-helpers.c
> > +++ b/softmmu/dma-helpers.c
> > @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
> >      aio_context_acquire(dbs->ctx);
> >      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
> >                              dma_blk_cb, dbs, dbs->io_func_opaque);
> > -    aio_context_release(dbs->ctx);
> >      assert(dbs->acb);
> > +    aio_context_release(dbs->ctx);
> >  }
> >
> >  static void dma_aio_cancel(BlockAIOCB *acb)
>
> I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
> run after you reshuffled the code?
>

IMO if the assert is to test whether io_func returns a non-NULL value
shouldn't it be immediately after calling io_func.
Also... as suggested by commit 6bee44ea346aed24e12d525daf10542d695508db
  >     dma: the passed io_func does not return NULL

Thanks,
- Tong

> After all, acquire/release is only around the dbs->io_func() call, so I
> don't immediately see how it interacts with re-entrance?
>
> --
> Thanks,
>
> David / dhildenb
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-01  0:20     ` Tong Zhang
@ 2022-06-01  8:00       ` David Hildenbrand
  2022-06-01 13:24         ` Stefan Hajnoczi
  0 siblings, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2022-06-01  8:00 UTC (permalink / raw)
  To: Tong Zhang
  Cc: Tong Zhang, Paolo Bonzini, Peter Xu, Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

On 01.06.22 02:20, Tong Zhang wrote:
> Hi David,
> 
> On Mon, May 30, 2022 at 9:19 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 27.04.22 22:51, Tong Zhang wrote:
>>> assert(dbs->acb) is meant to check the return value of io_func per
>>> documented in commit 6bee44ea34 ("dma: the passed io_func does not
>>> return NULL"). However, there is a chance that after calling
>>> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
>>> the assertion and dbs->acb is set to NULL again at line 121. Thus when
>>> we run assert at line 181 it will fail.
>>>
>>>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
>>>
>>> Reported-by: Francisco Londono <f.londono@samsung.com>
>>> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
>>> ---
>>>  softmmu/dma-helpers.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
>>> index 7820fec54c..cb81017928 100644
>>> --- a/softmmu/dma-helpers.c
>>> +++ b/softmmu/dma-helpers.c
>>> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>>>      aio_context_acquire(dbs->ctx);
>>>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>>>                              dma_blk_cb, dbs, dbs->io_func_opaque);
>>> -    aio_context_release(dbs->ctx);
>>>      assert(dbs->acb);
>>> +    aio_context_release(dbs->ctx);
>>>  }
>>>
>>>  static void dma_aio_cancel(BlockAIOCB *acb)
>>
>> I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
>> run after you reshuffled the code?
>>
> 
> IMO if the assert is to test whether io_func returns a non-NULL value
> shouldn't it be immediately after calling io_func.
> Also... as suggested by commit 6bee44ea346aed24e12d525daf10542d695508db
>   >     dma: the passed io_func does not return NULL

Yes, but I just don't see how it would fix the assertion you document in
the patch description. The locking change to fix the assertion doesn't
make any sense to me, and most probably I am missing something important :)

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-01  8:00       ` David Hildenbrand
@ 2022-06-01 13:24         ` Stefan Hajnoczi
  2022-06-01 13:29           ` David Hildenbrand
  0 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2022-06-01 13:24 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Tong Zhang, Tong Zhang, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

[-- Attachment #1: Type: text/plain, Size: 2568 bytes --]

On Wed, Jun 01, 2022 at 10:00:50AM +0200, David Hildenbrand wrote:
> On 01.06.22 02:20, Tong Zhang wrote:
> > Hi David,
> > 
> > On Mon, May 30, 2022 at 9:19 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 27.04.22 22:51, Tong Zhang wrote:
> >>> assert(dbs->acb) is meant to check the return value of io_func per
> >>> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> >>> return NULL"). However, there is a chance that after calling
> >>> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> >>> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> >>> we run assert at line 181 it will fail.
> >>>
> >>>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> >>>
> >>> Reported-by: Francisco Londono <f.londono@samsung.com>
> >>> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> >>> ---
> >>>  softmmu/dma-helpers.c | 2 +-
> >>>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>>
> >>> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> >>> index 7820fec54c..cb81017928 100644
> >>> --- a/softmmu/dma-helpers.c
> >>> +++ b/softmmu/dma-helpers.c
> >>> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
> >>>      aio_context_acquire(dbs->ctx);
> >>>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
> >>>                              dma_blk_cb, dbs, dbs->io_func_opaque);
> >>> -    aio_context_release(dbs->ctx);
> >>>      assert(dbs->acb);
> >>> +    aio_context_release(dbs->ctx);
> >>>  }
> >>>
> >>>  static void dma_aio_cancel(BlockAIOCB *acb)
> >>
> >> I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
> >> run after you reshuffled the code?
> >>
> > 
> > IMO if the assert is to test whether io_func returns a non-NULL value
> > shouldn't it be immediately after calling io_func.
> > Also... as suggested by commit 6bee44ea346aed24e12d525daf10542d695508db
> >   >     dma: the passed io_func does not return NULL
> 
> Yes, but I just don't see how it would fix the assertion you document in
> the patch description. The locking change to fix the assertion doesn't
> make any sense to me, and most probably I am missing something important :)

The other thread will invoke dma_blk_cb(), which modifies dbs->acb, when
it can take the lock. Therefore dbs->acb may contain a value different
from our io_func()'s return value by the time we perform the assertion
check (that's the race).

This patch makes sense to me. Can you rephrase your concern?

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-01 13:24         ` Stefan Hajnoczi
@ 2022-06-01 13:29           ` David Hildenbrand
  2022-06-01 13:55             ` Stefan Hajnoczi
  0 siblings, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2022-06-01 13:29 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Tong Zhang, Tong Zhang, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

On 01.06.22 15:24, Stefan Hajnoczi wrote:
> On Wed, Jun 01, 2022 at 10:00:50AM +0200, David Hildenbrand wrote:
>> On 01.06.22 02:20, Tong Zhang wrote:
>>> Hi David,
>>>
>>> On Mon, May 30, 2022 at 9:19 AM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 27.04.22 22:51, Tong Zhang wrote:
>>>>> assert(dbs->acb) is meant to check the return value of io_func per
>>>>> documented in commit 6bee44ea34 ("dma: the passed io_func does not
>>>>> return NULL"). However, there is a chance that after calling
>>>>> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
>>>>> the assertion and dbs->acb is set to NULL again at line 121. Thus when
>>>>> we run assert at line 181 it will fail.
>>>>>
>>>>>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
>>>>>
>>>>> Reported-by: Francisco Londono <f.londono@samsung.com>
>>>>> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
>>>>> ---
>>>>>  softmmu/dma-helpers.c | 2 +-
>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
>>>>> index 7820fec54c..cb81017928 100644
>>>>> --- a/softmmu/dma-helpers.c
>>>>> +++ b/softmmu/dma-helpers.c
>>>>> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>>>>>      aio_context_acquire(dbs->ctx);
>>>>>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>>>>>                              dma_blk_cb, dbs, dbs->io_func_opaque);
>>>>> -    aio_context_release(dbs->ctx);
>>>>>      assert(dbs->acb);
>>>>> +    aio_context_release(dbs->ctx);
>>>>>  }
>>>>>
>>>>>  static void dma_aio_cancel(BlockAIOCB *acb)
>>>>
>>>> I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
>>>> run after you reshuffled the code?
>>>>
>>>
>>> IMO if the assert is to test whether io_func returns a non-NULL value
>>> shouldn't it be immediately after calling io_func.
>>> Also... as suggested by commit 6bee44ea346aed24e12d525daf10542d695508db
>>>   >     dma: the passed io_func does not return NULL
>>
>> Yes, but I just don't see how it would fix the assertion you document in
>> the patch description. The locking change to fix the assertion doesn't
>> make any sense to me, and most probably I am missing something important :)
> 
> The other thread will invoke dma_blk_cb(), which modifies dbs->acb, when
> it can take the lock. Therefore dbs->acb may contain a value different
> from our io_func()'s return value by the time we perform the assertion
> check (that's the race).
> 
> This patch makes sense to me. Can you rephrase your concern?

The locking is around dbs->io_func().

aio_context_acquire(dbs->ctx);
dbs->acb = dbs->io_func()
aio_context_release(dbs->ctx);


So where exactly would the lock that's now still held stop someone from
modifying dbs->acb = NULL at the beginning of the function, which seems
to be not protected by that lock?

Maybe I'm missing some locking magic due to the lock being a recursive lock.

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-01 13:29           ` David Hildenbrand
@ 2022-06-01 13:55             ` Stefan Hajnoczi
  2022-06-02  1:04               ` Tong Zhang
  0 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2022-06-01 13:55 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Tong Zhang, Tong Zhang, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

On Wed, 1 Jun 2022 at 14:29, David Hildenbrand <david@redhat.com> wrote:
>
> On 01.06.22 15:24, Stefan Hajnoczi wrote:
> > On Wed, Jun 01, 2022 at 10:00:50AM +0200, David Hildenbrand wrote:
> >> On 01.06.22 02:20, Tong Zhang wrote:
> >>> Hi David,
> >>>
> >>> On Mon, May 30, 2022 at 9:19 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>
> >>>> On 27.04.22 22:51, Tong Zhang wrote:
> >>>>> assert(dbs->acb) is meant to check the return value of io_func per
> >>>>> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> >>>>> return NULL"). However, there is a chance that after calling
> >>>>> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> >>>>> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> >>>>> we run assert at line 181 it will fail.
> >>>>>
> >>>>>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> >>>>>
> >>>>> Reported-by: Francisco Londono <f.londono@samsung.com>
> >>>>> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> >>>>> ---
> >>>>>  softmmu/dma-helpers.c | 2 +-
> >>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>>>>
> >>>>> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> >>>>> index 7820fec54c..cb81017928 100644
> >>>>> --- a/softmmu/dma-helpers.c
> >>>>> +++ b/softmmu/dma-helpers.c
> >>>>> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
> >>>>>      aio_context_acquire(dbs->ctx);
> >>>>>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
> >>>>>                              dma_blk_cb, dbs, dbs->io_func_opaque);
> >>>>> -    aio_context_release(dbs->ctx);
> >>>>>      assert(dbs->acb);
> >>>>> +    aio_context_release(dbs->ctx);
> >>>>>  }
> >>>>>
> >>>>>  static void dma_aio_cancel(BlockAIOCB *acb)
> >>>>
> >>>> I'm fairly new to that code, but I wonder what prevents dma_blk_cb() to
> >>>> run after you reshuffled the code?
> >>>>
> >>>
> >>> IMO if the assert is to test whether io_func returns a non-NULL value
> >>> shouldn't it be immediately after calling io_func.
> >>> Also... as suggested by commit 6bee44ea346aed24e12d525daf10542d695508db
> >>>   >     dma: the passed io_func does not return NULL
> >>
> >> Yes, but I just don't see how it would fix the assertion you document in
> >> the patch description. The locking change to fix the assertion doesn't
> >> make any sense to me, and most probably I am missing something important :)
> >
> > The other thread will invoke dma_blk_cb(), which modifies dbs->acb, when
> > it can take the lock. Therefore dbs->acb may contain a value different
> > from our io_func()'s return value by the time we perform the assertion
> > check (that's the race).
> >
> > This patch makes sense to me. Can you rephrase your concern?
>
> The locking is around dbs->io_func().
>
> aio_context_acquire(dbs->ctx);
> dbs->acb = dbs->io_func()
> aio_context_release(dbs->ctx);
>
>
> So where exactly would the lock that's now still held stop someone from
> modifying dbs->acb = NULL at the beginning of the function, which seems
> to be not protected by that lock?
>
> Maybe I'm missing some locking magic due to the lock being a recursive lock.

Tong Zhang: Can you share a backtrace of all threads when the
assertion failure occurs?

Stefan


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-01 13:55             ` Stefan Hajnoczi
@ 2022-06-02  1:04               ` Tong Zhang
  2022-06-02  5:29                 ` Stefan Hajnoczi
  0 siblings, 1 reply; 15+ messages in thread
From: Tong Zhang @ 2022-06-02  1:04 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: David Hildenbrand, Tong Zhang, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

Hi Stefan,

On Wed, Jun 1, 2022 at 6:56 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> > > This patch makes sense to me. Can you rephrase your concern?
> >
> > The locking is around dbs->io_func().
> >
> > aio_context_acquire(dbs->ctx);
> > dbs->acb = dbs->io_func()
> > aio_context_release(dbs->ctx);
> >
> >
> > So where exactly would the lock that's now still held stop someone from
> > modifying dbs->acb = NULL at the beginning of the function, which seems
> > to be not protected by that lock?
> >
> > Maybe I'm missing some locking magic due to the lock being a recursive lock.
>
> Tong Zhang: Can you share a backtrace of all threads when the
> assertion failure occurs?
>
Sorry I couldn't get the trace now -- but I can tell that we have some
internal code uses
this dma related code and will grab dbs->ctx lock in another thread
and could overwrite dbs->acb.

From my understanding, one of the reasons that the lock is required
here is to protect dbs->acb,
we could not reliably test io_func()'s return value after releasing
the lock here.

Since this code affects our internal code base and I did not reproduce
on master branch,
feel free to ignore it.

- Tong

> Stefan


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-02  1:04               ` Tong Zhang
@ 2022-06-02  5:29                 ` Stefan Hajnoczi
  0 siblings, 0 replies; 15+ messages in thread
From: Stefan Hajnoczi @ 2022-06-02  5:29 UTC (permalink / raw)
  To: Tong Zhang
  Cc: David Hildenbrand, Tong Zhang, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	qemu-devel, Francisco Londono

On Thu, Jun 2, 2022, 02:04 Tong Zhang <ztong0001@gmail.com> wrote:
>
> Hi Stefan,
>
> On Wed, Jun 1, 2022 at 6:56 AM Stefan Hajnoczi <stefanha@gmail.com> wrote:
> >
> > > > This patch makes sense to me. Can you rephrase your concern?
> > >
> > > The locking is around dbs->io_func().
> > >
> > > aio_context_acquire(dbs->ctx);
> > > dbs->acb = dbs->io_func()
> > > aio_context_release(dbs->ctx);
> > >
> > >
> > > So where exactly would the lock that's now still held stop someone from
> > > modifying dbs->acb = NULL at the beginning of the function, which seems
> > > to be not protected by that lock?
> > >
> > > Maybe I'm missing some locking magic due to the lock being a recursive lock.
> >
> > Tong Zhang: Can you share a backtrace of all threads when the
> > assertion failure occurs?
> >
> Sorry I couldn't get the trace now -- but I can tell that we have some
> internal code uses
> this dma related code and will grab dbs->ctx lock in another thread
> and could overwrite dbs->acb.
>
> From my understanding, one of the reasons that the lock is required
> here is to protect dbs->acb,
> we could not reliably test io_func()'s return value after releasing
> the lock here.
>
> Since this code affects our internal code base and I did not reproduce
> on master branch,
> feel free to ignore it.

If this patch is unnecessary on qemu.git/master it raises the question
whether aio_context_acquire/release() should be removed from
dma_blk_cb(). It was added by:

commit 1919631e6b5562e474690853eca3c35610201e16
Author: Paolo Bonzini <pbonzini@redhat.com>
Date:   Mon Feb 13 14:52:31 2017 +0100

    block: explicitly acquire aiocontext in bottom halves that need it

Paolo: Is dma_blk_cb() called without the AioContext lock (by
virtio-scsi or any code path with IOThreads)? David pointed out that
if that's the case then the dbs->acb is accessed without the lock at
the start of dma_blk_cb().

Stefan


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-29  8:31     ` Tong Zhang
@ 2022-06-29  9:52       ` David Hildenbrand
  0 siblings, 0 replies; 15+ messages in thread
From: David Hildenbrand @ 2022-06-29  9:52 UTC (permalink / raw)
  To: Tong Zhang
  Cc: Francisco Londono, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	Stefan Hajnoczi, Tong Zhang, qemu-devel, qemu-trivial

On 29.06.22 10:31, Tong Zhang wrote:
> 
> 
> On Wed, Jun 29, 2022 at 12:29 AM David Hildenbrand <david@redhat.com
> <mailto:david@redhat.com>> wrote:
> 
>     On 06.05.22 18:31, Tong Zhang wrote:
>     > assert(dbs->acb) is meant to check the return value of io_func per
>     > documented in commit 6bee44ea34 ("dma: the passed io_func does not
>     > return NULL"). However, there is a chance that after calling
>     > aio_context_release(dbs->ctx); the dma_blk_cb function is called
>     before
>     > the assertion and dbs->acb is set to NULL again at line 121. Thus when
>     > we run assert at line 181 it will fail.
>     >
>     >   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
>     >
>     > Reported-by: Francisco Londono <f.londono@samsung.com
>     <mailto:f.londono@samsung.com>>
>     > Signed-off-by: Tong Zhang <t.zhang2@samsung.com
>     <mailto:t.zhang2@samsung.com>>
>     > ---
>     >  softmmu/dma-helpers.c | 2 +-
>     >  1 file changed, 1 insertion(+), 1 deletion(-)
>     >
>     > diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
>     > index 7820fec54c..cb81017928 100644
>     > --- a/softmmu/dma-helpers.c
>     > +++ b/softmmu/dma-helpers.c
>     > @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>     >      aio_context_acquire(dbs->ctx);
>     >      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>     >                              dma_blk_cb, dbs, dbs->io_func_opaque);
>     > -    aio_context_release(dbs->ctx);
>     >      assert(dbs->acb);
>     > +    aio_context_release(dbs->ctx);
>     >  }
>     > 
>     >  static void dma_aio_cancel(BlockAIOCB *acb)
> 
>     Please don't resend patches if the previous submission came to the
>     conclusion that it's unclear how this should help.
> 
>     https://lkml.kernel.org/r/CAJSP0QW396RY_g8LS1mncDZcOv5GamURy+xv+s8zMcdq03OOMA@mail.gmail.com
>     <https://lkml.kernel.org/r/CAJSP0QW396RY_g8LS1mncDZcOv5GamURy+xv+s8zMcdq03OOMA@mail.gmail.com>
> 
> 
>     I *still* don't understand the interaction between the lock and the
>     assertion and so far nobody was able to clarify.
> 
>     -- 
>     Thanks,
> 
>     David / dhildenb
> 
> hello
> 
> This message is sent way before the discussion 

Oh, I'm sorry. I was mislead by the reply from Laurent :)

BTW, do we now have an understanding why that patch helps and if it
applies to upstream?

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-06-29  7:28   ` David Hildenbrand
@ 2022-06-29  8:31     ` Tong Zhang
  2022-06-29  9:52       ` David Hildenbrand
  0 siblings, 1 reply; 15+ messages in thread
From: Tong Zhang @ 2022-06-29  8:31 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Francisco Londono, Paolo Bonzini, Peter Xu,
	Philippe Mathieu-Daudé,
	Stefan Hajnoczi, Tong Zhang, qemu-devel, qemu-trivial

[-- Attachment #1: Type: text/plain, Size: 1829 bytes --]

On Wed, Jun 29, 2022 at 12:29 AM David Hildenbrand <david@redhat.com> wrote:

> On 06.05.22 18:31, Tong Zhang wrote:
> > assert(dbs->acb) is meant to check the return value of io_func per
> > documented in commit 6bee44ea34 ("dma: the passed io_func does not
> > return NULL"). However, there is a chance that after calling
> > aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> > the assertion and dbs->acb is set to NULL again at line 121. Thus when
> > we run assert at line 181 it will fail.
> >
> >   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> >
> > Reported-by: Francisco Londono <f.londono@samsung.com>
> > Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> > ---
> >  softmmu/dma-helpers.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> > index 7820fec54c..cb81017928 100644
> > --- a/softmmu/dma-helpers.c
> > +++ b/softmmu/dma-helpers.c
> > @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
> >      aio_context_acquire(dbs->ctx);
> >      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
> >                              dma_blk_cb, dbs, dbs->io_func_opaque);
> > -    aio_context_release(dbs->ctx);
> >      assert(dbs->acb);
> > +    aio_context_release(dbs->ctx);
> >  }
> >
> >  static void dma_aio_cancel(BlockAIOCB *acb)
>
> Please don't resend patches if the previous submission came to the
> conclusion that it's unclear how this should help.
>
>
> https://lkml.kernel.org/r/CAJSP0QW396RY_g8LS1mncDZcOv5GamURy+xv+s8zMcdq03OOMA@mail.gmail.com
>
>
> I *still* don't understand the interaction between the lock and the
> assertion and so far nobody was able to clarify.
>
> --
> Thanks,
>
> David / dhildenb
>
hello

This message is sent way before the discussion


>

[-- Attachment #2: Type: text/html, Size: 3027 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-05-06 16:31 ` Tong Zhang
  2022-06-28 22:34   ` Laurent Vivier
@ 2022-06-29  7:28   ` David Hildenbrand
  2022-06-29  8:31     ` Tong Zhang
  1 sibling, 1 reply; 15+ messages in thread
From: David Hildenbrand @ 2022-06-29  7:28 UTC (permalink / raw)
  To: Tong Zhang, Paolo Bonzini, Peter Xu, Philippe Mathieu-Daudé,
	qemu-devel, Stefan Hajnoczi
  Cc: ztong0001, qemu-trivial, Francisco Londono

On 06.05.22 18:31, Tong Zhang wrote:
> assert(dbs->acb) is meant to check the return value of io_func per
> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> return NULL"). However, there is a chance that after calling
> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> we run assert at line 181 it will fail.
> 
>   softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> 
> Reported-by: Francisco Londono <f.londono@samsung.com>
> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> ---
>  softmmu/dma-helpers.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> index 7820fec54c..cb81017928 100644
> --- a/softmmu/dma-helpers.c
> +++ b/softmmu/dma-helpers.c
> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>      aio_context_acquire(dbs->ctx);
>      dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>                              dma_blk_cb, dbs, dbs->io_func_opaque);
> -    aio_context_release(dbs->ctx);
>      assert(dbs->acb);
> +    aio_context_release(dbs->ctx);
>  }
>  
>  static void dma_aio_cancel(BlockAIOCB *acb)

Please don't resend patches if the previous submission came to the
conclusion that it's unclear how this should help.

https://lkml.kernel.org/r/CAJSP0QW396RY_g8LS1mncDZcOv5GamURy+xv+s8zMcdq03OOMA@mail.gmail.com


I *still* don't understand the interaction between the lock and the
assertion and so far nobody was able to clarify.

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RESEND PATCH] hw/dma: fix crash caused by race condition
  2022-05-06 16:31 ` Tong Zhang
@ 2022-06-28 22:34   ` Laurent Vivier
  2022-06-29  7:28   ` David Hildenbrand
  1 sibling, 0 replies; 15+ messages in thread
From: Laurent Vivier @ 2022-06-28 22:34 UTC (permalink / raw)
  To: Tong Zhang, Paolo Bonzini, Peter Xu, David Hildenbrand,
	Philippe Mathieu-Daudé,
	qemu-devel
  Cc: ztong0001, qemu-trivial, Francisco Londono

Le 06/05/2022 à 18:31, Tong Zhang a écrit :
> assert(dbs->acb) is meant to check the return value of io_func per
> documented in commit 6bee44ea34 ("dma: the passed io_func does not
> return NULL"). However, there is a chance that after calling
> aio_context_release(dbs->ctx); the dma_blk_cb function is called before
> the assertion and dbs->acb is set to NULL again at line 121. Thus when
> we run assert at line 181 it will fail.
> 
>    softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.
> 
> Reported-by: Francisco Londono <f.londono@samsung.com>
> Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
> ---
>   softmmu/dma-helpers.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> index 7820fec54c..cb81017928 100644
> --- a/softmmu/dma-helpers.c
> +++ b/softmmu/dma-helpers.c
> @@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
>       aio_context_acquire(dbs->ctx);
>       dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
>                               dma_blk_cb, dbs, dbs->io_func_opaque);
> -    aio_context_release(dbs->ctx);
>       assert(dbs->acb);
> +    aio_context_release(dbs->ctx);
>   }
>   
>   static void dma_aio_cancel(BlockAIOCB *acb)

Fixes: 1919631e6b55 ("block: explicitly acquire aiocontext in bottom halves that need it")
Reviewed-by: Laurent Vivier <laurent@vivier.eu>



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RESEND PATCH] hw/dma: fix crash caused by race condition
       [not found] <CGME20220506163106uscas1p20aa8ba0a290a9b50be54df6ec4f9cee0@uscas1p2.samsung.com>
@ 2022-05-06 16:31 ` Tong Zhang
  2022-06-28 22:34   ` Laurent Vivier
  2022-06-29  7:28   ` David Hildenbrand
  0 siblings, 2 replies; 15+ messages in thread
From: Tong Zhang @ 2022-05-06 16:31 UTC (permalink / raw)
  To: Paolo Bonzini, Peter Xu, David Hildenbrand,
	Philippe Mathieu-Daudé,
	qemu-devel
  Cc: ztong0001, qemu-trivial, Tong Zhang, Francisco Londono

assert(dbs->acb) is meant to check the return value of io_func per
documented in commit 6bee44ea34 ("dma: the passed io_func does not
return NULL"). However, there is a chance that after calling
aio_context_release(dbs->ctx); the dma_blk_cb function is called before
the assertion and dbs->acb is set to NULL again at line 121. Thus when
we run assert at line 181 it will fail.

  softmmu/dma-helpers.c:181: dma_blk_cb: Assertion `dbs->acb' failed.

Reported-by: Francisco Londono <f.londono@samsung.com>
Signed-off-by: Tong Zhang <t.zhang2@samsung.com>
---
 softmmu/dma-helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
index 7820fec54c..cb81017928 100644
--- a/softmmu/dma-helpers.c
+++ b/softmmu/dma-helpers.c
@@ -177,8 +177,8 @@ static void dma_blk_cb(void *opaque, int ret)
     aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
-    aio_context_release(dbs->ctx);
     assert(dbs->acb);
+    aio_context_release(dbs->ctx);
 }
 
 static void dma_aio_cancel(BlockAIOCB *acb)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-06-29  9:55 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CGME20220427205118uscas1p25031437c0cdd4363c104be13033f366a@uscas1p2.samsung.com>
2022-04-27 20:51 ` [RESEND PATCH] hw/dma: fix crash caused by race condition Tong Zhang
2022-05-30 14:34   ` Philippe Mathieu-Daudé via
2022-05-30 16:19   ` David Hildenbrand
2022-06-01  0:20     ` Tong Zhang
2022-06-01  8:00       ` David Hildenbrand
2022-06-01 13:24         ` Stefan Hajnoczi
2022-06-01 13:29           ` David Hildenbrand
2022-06-01 13:55             ` Stefan Hajnoczi
2022-06-02  1:04               ` Tong Zhang
2022-06-02  5:29                 ` Stefan Hajnoczi
     [not found] <CGME20220506163106uscas1p20aa8ba0a290a9b50be54df6ec4f9cee0@uscas1p2.samsung.com>
2022-05-06 16:31 ` Tong Zhang
2022-06-28 22:34   ` Laurent Vivier
2022-06-29  7:28   ` David Hildenbrand
2022-06-29  8:31     ` Tong Zhang
2022-06-29  9:52       ` David Hildenbrand

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.