All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
@ 2022-07-04 12:00 Yang Jihong
  2022-07-04 15:26 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Yang Jihong @ 2022-07-04 12:00 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, linux-perf-users, linux-kernel
  Cc: yangjihong1

Data race exists between perf_event_set_output and perf_mmap_close.
The scenario is as follows:

                  CPU1                                                       CPU2
                                                                    perf_mmap_close(event2)
                                                                      if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
                                                                        detach_rest = true;
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
  perf_event_set_output(event1, event2)
                                                                      if (!detach_rest)
                                                                        goto out_put;
                                                                      list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
                                                                        ring_buffer_attach(event, NULL)
                                                                      // because event1 has not been added to event2->rb->event_list,
                                                                      // event1->rb is not set to NULL in these loops

    ring_buffer_attach(event1, event2->rb)
      list_add_rcu(&event1->rb_entry, &event2->rb->event_list)

The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:

again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
<SNIP>
                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close() through
                         * perf_event_set_output(). Try again, hope for better
                         * luck.
                         */
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }
<SNIP>

This problem has occurred on the Syzkaller, causing the softlockup. The call stack is as follows:
[ 3666.984385][    C2] watchdog: BUG: soft lockup - CPU#2 stuck for 23s! [syz-executor.2:32404]
[ 3666.986137][    C2] Modules linked in:
[ 3666.989581][    C2] CPU: 2 PID: 32404 Comm: syz-executor.2 Not tainted 5.10.0+ #4
[ 3666.990697][    C2] Hardware name: linux,dummy-virt (DT)
[ 3666.992270][    C2] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[ 3666.993787][    C2] pc : __kasan_check_write+0x0/0x40
[ 3666.994841][    C2] lr : perf_mmap+0x3c8/0xf80
[ 3666.995661][    C2] sp : ffff00001011f8f0
[ 3666.996598][    C2] x29: ffff00001011f8f0 x28: ffff0000cf644868
[ 3666.998488][    C2] x27: ffff000012cad2c0 x26: 0000000000000000
[ 3666.999888][    C2] x25: 0000000000000001 x24: ffff000012cad298
[ 3667.003511][    C2] x23: 0000000000000000 x22: ffff000012cad000
[ 3667.005504][    C2] x21: ffff0000cf644818 x20: ffff0000cf6d2400
[ 3667.006891][    C2] x19: ffff0000cf6d24c0 x18: 0000000000000000
[ 3667.008295][    C2] x17: 0000000000000000 x16: 0000000000000000
[ 3667.009528][    C2] x15: 0000000000000000 x14: 0000000000000000
[ 3667.010658][    C2] x13: 0000000000000000 x12: ffff800002023f17
[ 3667.012169][    C2] x11: 1fffe00002023f16 x10: ffff800002023f16
[ 3667.013780][    C2] x9 : dfffa00000000000 x8 : ffff00001011f8b7
[ 3667.015265][    C2] x7 : 0000000000000001 x6 : ffff800002023f16
[ 3667.016683][    C2] x5 : ffff0000c0f36400 x4 : 0000000000000000
[ 3667.018078][    C2] x3 : ffffa00010000000 x2 : ffffa000119a0000
[ 3667.019343][    C2] x1 : 0000000000000004 x0 : ffff0000cf6d24c0
[ 3667.021276][    C2] Call trace:
[ 3667.022598][    C2]  __kasan_check_write+0x0/0x40
[ 3667.023666][    C2]  __mmap_region+0x7a4/0xc90
[ 3667.024679][    C2]  __do_mmap_mm+0x600/0xa20
[ 3667.025700][    C2]  do_mmap+0x114/0x384
[ 3667.026583][    C2]  vm_mmap_pgoff+0x138/0x230
[ 3667.027532][    C2]  ksys_mmap_pgoff+0x1d8/0x570
[ 3667.028537][    C2]  __arm64_sys_mmap+0xa4/0xd0
[ 3667.029597][    C2]  el0_svc_common.constprop.0+0xf4/0x414
[ 3667.030682][    C2]  do_el0_svc+0x50/0x11c
[ 3667.031545][    C2]  el0_svc+0x20/0x30
[ 3667.032368][    C2]  el0_sync_handler+0xe4/0x1e0
[ 3667.033305][    C2]  el0_sync+0x148/0x180

perf_mmap_close and ring_buffer_attach involve complicated lock recursion.
The solution is to modify ring_buffer_attach function, that is,
after new event is added to rb->event_list, check whether rb->mmap_count is 0,
if the value is 0, another core performs perf_mmap_close operation
on ring buffer during this period. In this case, we set event->rb to NULL.

Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
---

Changes since v1:
 - Modify commit message description.

 kernel/events/core.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..c67c070f7b39 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
 			       struct perf_buffer *rb)
 {
 	struct perf_buffer *old_rb = NULL;
+	struct perf_buffer *new_rb = rb;
 	unsigned long flags;
 
 	WARN_ON_ONCE(event->parent);
@@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
 
 		spin_lock_irqsave(&rb->event_lock, flags);
 		list_add_rcu(&event->rb_entry, &rb->event_list);
+
+		/*
+		 * When perf_mmap_close traverses rb->event_list during
+		 * detach all other events, new event may not be added to
+		 * rb->event_list, let's check again, if rb->mmap_count is 0,
+		 * it indicates that perf_mmap_close is executed.
+		 * Manually delete event from rb->event_list and
+		 * set event->rb to null.
+		 */
+		if (!atomic_read(&rb->mmap_count)) {
+			list_del_rcu(&event->rb_entry);
+			new_rb = NULL;
+		}
+
 		spin_unlock_irqrestore(&rb->event_lock, flags);
 	}
 
@@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
 	if (has_aux(event))
 		perf_event_stop(event, 0);
 
-	rcu_assign_pointer(event->rb, rb);
+	rcu_assign_pointer(event->rb, new_rb);
 
 	if (old_rb) {
 		ring_buffer_put(old_rb);
@@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 			goto unlock;
 	}
 
+	/*
+	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
+	 * the ring buffer is about to be unmapped and cannot be attached.
+	 */
+	if (rb && !atomic_read(&rb->mmap_count))
+		goto unlock;
+
 	ring_buffer_attach(event, rb);
 
 	ret = 0;
-- 
2.30.GIT


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 12:00 [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close Yang Jihong
@ 2022-07-04 15:26 ` Peter Zijlstra
  2022-07-05  2:07   ` Yang Jihong
  2022-07-05 13:07   ` Peter Zijlstra
  0 siblings, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2022-07-04 15:26 UTC (permalink / raw)
  To: Yang Jihong
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> Data race exists between perf_event_set_output and perf_mmap_close.
> The scenario is as follows:
> 
>                   CPU1                                                       CPU2
>                                                                     perf_mmap_close(event2)
>                                                                       if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>                                                                         detach_rest = true;
> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>   perf_event_set_output(event1, event2)
>                                                                       if (!detach_rest)
>                                                                         goto out_put;
>                                                                       list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>                                                                         ring_buffer_attach(event, NULL)
>                                                                       // because event1 has not been added to event2->rb->event_list,
>                                                                       // event1->rb is not set to NULL in these loops
> 
>     ring_buffer_attach(event1, event2->rb)
>       list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
> 
> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
> 
> again:
>         mutex_lock(&event->mmap_mutex);
>         if (event->rb) {
> <SNIP>
>                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>                         /*
>                          * Raced against perf_mmap_close() through
>                          * perf_event_set_output(). Try again, hope for better
>                          * luck.
>                          */
>                         mutex_unlock(&event->mmap_mutex);
>                         goto again;
>                 }
> <SNIP>

Too tired, must look again tomorrow, little feeback below.

>  kernel/events/core.c | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 80782cddb1da..c67c070f7b39 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
>  			       struct perf_buffer *rb)
>  {
>  	struct perf_buffer *old_rb = NULL;
> +	struct perf_buffer *new_rb = rb;
>  	unsigned long flags;
>  
>  	WARN_ON_ONCE(event->parent);
> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>  
>  		spin_lock_irqsave(&rb->event_lock, flags);
>  		list_add_rcu(&event->rb_entry, &rb->event_list);
> +
> +		/*
> +		 * When perf_mmap_close traverses rb->event_list during
> +		 * detach all other events, new event may not be added to
> +		 * rb->event_list, let's check again, if rb->mmap_count is 0,
> +		 * it indicates that perf_mmap_close is executed.
> +		 * Manually delete event from rb->event_list and
> +		 * set event->rb to null.
> +		 */
> +		if (!atomic_read(&rb->mmap_count)) {
> +			list_del_rcu(&event->rb_entry);
> +			new_rb = NULL;
> +		}
> +
>  		spin_unlock_irqrestore(&rb->event_lock, flags);
>  	}
>  
> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
>  	if (has_aux(event))
>  		perf_event_stop(event, 0);
>  
> -	rcu_assign_pointer(event->rb, rb);
> +	rcu_assign_pointer(event->rb, new_rb);
>  
>  	if (old_rb) {
>  		ring_buffer_put(old_rb);

I'm confused by the above hunks; the below will avoid calling
ring_buffer_attach() when !rb->mmap_count, so how can the above ever
execute?

> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>  			goto unlock;
>  	}
>  
> +	/*
> +	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
> +	 * the ring buffer is about to be unmapped and cannot be attached.
> +	 */
> +	if (rb && !atomic_read(&rb->mmap_count))
> +		goto unlock;
> +
>  	ring_buffer_attach(event, rb);
>  
>  	ret = 0;

This is wrong I think, it'll leak ring_buffer_get().


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 15:26 ` Peter Zijlstra
@ 2022-07-05  2:07   ` Yang Jihong
  2022-07-05 13:07   ` Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: Yang Jihong @ 2022-07-05  2:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/4 23:26, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>> Data race exists between perf_event_set_output and perf_mmap_close.
>> The scenario is as follows:
>>
>>                    CPU1                                                       CPU2
>>                                                                      perf_mmap_close(event2)
>>                                                                        if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>                                                                          detach_rest = true;
>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>    perf_event_set_output(event1, event2)
>>                                                                        if (!detach_rest)
>>                                                                          goto out_put;
>>                                                                        list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>                                                                          ring_buffer_attach(event, NULL)
>>                                                                        // because event1 has not been added to event2->rb->event_list,
>>                                                                        // event1->rb is not set to NULL in these loops
>>
>>      ring_buffer_attach(event1, event2->rb)
>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>
>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>
>> again:
>>          mutex_lock(&event->mmap_mutex);
>>          if (event->rb) {
>> <SNIP>
>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>                          /*
>>                           * Raced against perf_mmap_close() through
>>                           * perf_event_set_output(). Try again, hope for better
>>                           * luck.
>>                           */
>>                          mutex_unlock(&event->mmap_mutex);
>>                          goto again;
>>                  }
>> <SNIP>
> 
> Too tired, must look again tomorrow, little feeback below.
Thanks for reviewing this patch. The perf_mmap_close, 
perf_event_set_output, and perf_mmap involve complex data race and lock 
relationships. Therefore, this simple fix is proposed.
> 
>>   kernel/events/core.c | 24 +++++++++++++++++++++++-
>>   1 file changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 80782cddb1da..c67c070f7b39 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
>>   			       struct perf_buffer *rb)
>>   {
>>   	struct perf_buffer *old_rb = NULL;
>> +	struct perf_buffer *new_rb = rb;
>>   	unsigned long flags;
>>   
>>   	WARN_ON_ONCE(event->parent);
>> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>>   
>>   		spin_lock_irqsave(&rb->event_lock, flags);
>>   		list_add_rcu(&event->rb_entry, &rb->event_list);
>> +
>> +		/*
>> +		 * When perf_mmap_close traverses rb->event_list during
>> +		 * detach all other events, new event may not be added to
>> +		 * rb->event_list, let's check again, if rb->mmap_count is 0,
>> +		 * it indicates that perf_mmap_close is executed.
>> +		 * Manually delete event from rb->event_list and
>> +		 * set event->rb to null.
>> +		 */
>> +		if (!atomic_read(&rb->mmap_count)) {
>> +			list_del_rcu(&event->rb_entry);
>> +			new_rb = NULL;
>> +		}
>> +
>>   		spin_unlock_irqrestore(&rb->event_lock, flags);
>>   	}
>>   
>> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
>>   	if (has_aux(event))
>>   		perf_event_stop(event, 0);
>>   
>> -	rcu_assign_pointer(event->rb, rb);
>> +	rcu_assign_pointer(event->rb, new_rb);
>>   
>>   	if (old_rb) {
>>   		ring_buffer_put(old_rb);
> 
> I'm confused by the above hunks; the below will avoid calling
> ring_buffer_attach() when !rb->mmap_count, so how can the above ever
> execute?
In this patch, !atomic_read(&rb->mmap_count) is checked before the 
perf_event_set_output function invokes ring_buffer_attach(event, rb). 
Therefore, !atomic_read(&rb->mmap_count) does not need to be checked in 
the ring_buffer_attach function.

Am I right to understand that?

Because there is no lock parallel protection between ioctl(event1, 
PERF_EVENT_IOC_SET_OUTPUT, event2) and perf_mmap_close(event2), they can 
be executed in parallel.

The following scenarios may exist:

                    CPU1 
        CPU2
 
perf_mmap_close(event2)
																	   ...
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
    perf_event_set_output(event1, event2)
      ...
      if (rb && !atomic_read(&rb->mmap_count))
	   goto unlock;
	 // Here rb->mmap_count = 1, Keep going.
	 ...
	                                                                  if 
(atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
 
  detach_rest = true;
                                                                       ...
 
list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
 
  ring_buffer_attach(event, NULL)
 
  // because event1 has not been added to event2->rb->event_list,
 
  // event1->rb is not set to NULL in these loops
																	  ...
	 ring_buffer_attach(event1, rb)
	   ...
	   list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
	   ...

In this case, the above problems arise.
> 
>> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>>   			goto unlock;
>>   	}
>>   
>> +	/*
>> +	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
>> +	 * the ring buffer is about to be unmapped and cannot be attached.
>> +	 */
>> +	if (rb && !atomic_read(&rb->mmap_count))
>> +		goto unlock;
>> +
>>   	ring_buffer_attach(event, rb);
>>   
>>   	ret = 0;
> 
> This is wrong I think, it'll leak ring_buffer_get().
Yes, ring_buffer_put(rb) needs to be added before goto unlock.
I'll fix in next version.

Thanks,
Yang
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 15:26 ` Peter Zijlstra
  2022-07-05  2:07   ` Yang Jihong
@ 2022-07-05 13:07   ` Peter Zijlstra
  2022-07-06 12:29     ` Yang Jihong
  2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra
  1 sibling, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2022-07-05 13:07 UTC (permalink / raw)
  To: Yang Jihong
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> > Data race exists between perf_event_set_output and perf_mmap_close.
> > The scenario is as follows:
> > 
> >                   CPU1                                                       CPU2
> >                                                                     perf_mmap_close(event2)
> >                                                                       if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
> >                                                                         detach_rest = true;
> > ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
> >   perf_event_set_output(event1, event2)
> >                                                                       if (!detach_rest)
> >                                                                         goto out_put;
> >                                                                       list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
> >                                                                         ring_buffer_attach(event, NULL)
> >                                                                       // because event1 has not been added to event2->rb->event_list,
> >                                                                       // event1->rb is not set to NULL in these loops
> > 
> >     ring_buffer_attach(event1, event2->rb)
> >       list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
> > 
> > The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> > If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
> > 
> > again:
> >         mutex_lock(&event->mmap_mutex);
> >         if (event->rb) {
> > <SNIP>
> >                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
> >                         /*
> >                          * Raced against perf_mmap_close() through
> >                          * perf_event_set_output(). Try again, hope for better
> >                          * luck.
> >                          */
> >                         mutex_unlock(&event->mmap_mutex);
> >                         goto again;
> >                 }
> > <SNIP>
> 
> Too tired, must look again tomorrow, little feeback below.

With brain more awake I ended up with the below. Does that work?

---
 kernel/events/core.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d8c335a07db..c9d32d4d2e20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6262,10 +6262,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
 			/*
-			 * Raced against perf_mmap_close() through
-			 * perf_event_set_output(). Try again, hope for better
-			 * luck.
+			 * Raced against perf_mmap_close(); remove the
+			 * event and try again.
 			 */
+			ring_buffer_attach(event, NULL);
 			mutex_unlock(&event->mmap_mutex);
 			goto again;
 		}
@@ -11840,14 +11840,25 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	goto out;
 }
 
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	mutex_lock(a);
+	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
 
-	if (!output_event)
+	if (!output_event) {
+		mutex_lock(&event->mmap_mutex);
 		goto set;
+	}
 
 	/* don't allow circular references */
 	if (event == output_event)
@@ -11885,8 +11896,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	    event->pmu != output_event->pmu)
 		goto out;
 
+	/*
+	 * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
+	 * output_event is already on rb->event_list, and the list iteration
+	 * restarts after every removal, it is guaranteed this new event is
+	 * observed *OR* if output_event is already removed, it's guaranteed we
+	 * observe !rb->mmap_count.
+	 */
+	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
-	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
@@ -11896,6 +11914,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
+
+		/* did we race against perf_mmap_close() */
+		if (!atomic_read(&rb->mmap_count)) {
+			ring_buffer_put(rb);
+			goto unlock;
+		}
 	}
 
 	ring_buffer_attach(event, rb);
@@ -11903,20 +11927,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
+	if (output_event)
+		mutex_unlock(&output_event->mmap_mutex);
 
 out:
 	return ret;
 }
 
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
-	if (b < a)
-		swap(a, b);
-
-	mutex_lock(a);
-	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 {
 	bool nmi_safe = false;

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-05 13:07   ` Peter Zijlstra
@ 2022-07-06 12:29     ` Yang Jihong
  2022-07-09  2:00       ` Yang Jihong
  2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Yang Jihong @ 2022-07-06 12:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/5 21:07, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>> The scenario is as follows:
>>>
>>>                    CPU1                                                       CPU2
>>>                                                                      perf_mmap_close(event2)
>>>                                                                        if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>>                                                                          detach_rest = true;
>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>>    perf_event_set_output(event1, event2)
>>>                                                                        if (!detach_rest)
>>>                                                                          goto out_put;
>>>                                                                        list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>>                                                                          ring_buffer_attach(event, NULL)
>>>                                                                        // because event1 has not been added to event2->rb->event_list,
>>>                                                                        // event1->rb is not set to NULL in these loops
>>>
>>>      ring_buffer_attach(event1, event2->rb)
>>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>
>>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>>
>>> again:
>>>          mutex_lock(&event->mmap_mutex);
>>>          if (event->rb) {
>>> <SNIP>
>>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>>                          /*
>>>                           * Raced against perf_mmap_close() through
>>>                           * perf_event_set_output(). Try again, hope for better
>>>                           * luck.
>>>                           */
>>>                          mutex_unlock(&event->mmap_mutex);
>>>                          goto again;
>>>                  }
>>> <SNIP>
>>
>> Too tired, must look again tomorrow, little feeback below.
> 
> With brain more awake I ended up with the below. Does that work?

Yes, I apply the patch on kernel versions 5.10 and mainline,
and it could fixed the problem.

Tested-by: Yang Jihong <yangjihong1@huawei.com>

Thanks,
Yang

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-06 12:29     ` Yang Jihong
@ 2022-07-09  2:00       ` Yang Jihong
  0 siblings, 0 replies; 7+ messages in thread
From: Yang Jihong @ 2022-07-09  2:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/6 20:29, Yang Jihong wrote:
> Hello,
> 
> On 2022/7/5 21:07, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>>> The scenario is as follows:
>>>>
>>>>                    
>>>> CPU1                                                       CPU2
>>>>                                                                      
>>>> perf_mmap_close(event2)
>>>>                                                                        
>>>> if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>>>                                                                          
>>>> detach_rest = true;
>>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>>>    perf_event_set_output(event1, event2)
>>>>                                                                        
>>>> if (!detach_rest)
>>>>                                                                          
>>>> goto out_put;
>>>>                                                                        
>>>> list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>>>                                                                          
>>>> ring_buffer_attach(event, NULL)
>>>>                                                                        
>>>> // because event1 has not been added to event2->rb->event_list,
>>>>                                                                        
>>>> // event1->rb is not set to NULL in these loops
>>>>
>>>>      ring_buffer_attach(event1, event2->rb)
>>>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>>
>>>> The above data race causes a problem, that is, event1->rb is not 
>>>> NULL, but event1->rb->mmap_count is 0.
>>>> If the perf_mmap interface is invoked for the fd of event1, the 
>>>> kernel keeps in the perf_mmap infinite loop:
>>>>
>>>> again:
>>>>          mutex_lock(&event->mmap_mutex);
>>>>          if (event->rb) {
>>>> <SNIP>
>>>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>>>                          /*
>>>>                           * Raced against perf_mmap_close() through
>>>>                           * perf_event_set_output(). Try again, hope 
>>>> for better
>>>>                           * luck.
>>>>                           */
>>>>                          mutex_unlock(&event->mmap_mutex);
>>>>                          goto again;
>>>>                  }
>>>> <SNIP>
>>>
>>> Too tired, must look again tomorrow, little feeback below.
>>
>> With brain more awake I ended up with the below. Does that work?
I have verified that this patch can solve the problem.

Do I submit this patch? Or do you submit it?

Thanks,
Yang

> 
> Yes, I apply the patch on kernel versions 5.10 and mainline,
> and it could fixed the problem.
> 
> Tested-by: Yang Jihong <yangjihong1@huawei.com>
> 
> Thanks,
> Yang
> .

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()
  2022-07-05 13:07   ` Peter Zijlstra
  2022-07-06 12:29     ` Yang Jihong
@ 2022-07-14 11:35     ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2022-07-14 11:35 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Yang Jihong, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     68e3c69803dada336893640110cb87221bb01dcf
Gitweb:        https://git.kernel.org/tip/68e3c69803dada336893640110cb87221bb01dcf
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 05 Jul 2022 15:07:26 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 13 Jul 2022 11:29:12 +02:00

perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()

Yang Jihing reported a race between perf_event_set_output() and
perf_mmap_close():

	CPU1					CPU2

	perf_mmap_close(e2)
	  if (atomic_dec_and_test(&e2->rb->mmap_count)) // 1 - > 0
	    detach_rest = true

						ioctl(e1, IOC_SET_OUTPUT, e2)
						  perf_event_set_output(e1, e2)

	  ...
	  list_for_each_entry_rcu(e, &e2->rb->event_list, rb_entry)
	    ring_buffer_attach(e, NULL);
	    // e1 isn't yet added and
	    // therefore not detached

						    ring_buffer_attach(e1, e2->rb)
						      list_add_rcu(&e1->rb_entry,
								   &e2->rb->event_list)

After this; e1 is attached to an unmapped rb and a subsequent
perf_mmap() will loop forever more:

	again:
		mutex_lock(&e->mmap_mutex);
		if (event->rb) {
			...
			if (!atomic_inc_not_zero(&e->rb->mmap_count)) {
				...
				mutex_unlock(&e->mmap_mutex);
				goto again;
			}
		}

The loop in perf_mmap_close() holds e2->mmap_mutex, while the attach
in perf_event_set_output() holds e1->mmap_mutex. As such there is no
serialization to avoid this race.

Change perf_event_set_output() to take both e1->mmap_mutex and
e2->mmap_mutex to alleviate that problem. Additionally, have the loop
in perf_mmap() detach the rb directly, this avoids having to wait for
the concurrent perf_mmap_close() to get around to doing it to make
progress.

Fixes: 9bb5d40cd93c ("perf: Fix mmap() accounting hole")
Reported-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Yang Jihong <yangjihong1@huawei.com>
Link: https://lkml.kernel.org/r/YsQ3jm2GR38SW7uD@worktop.programming.kicks-ass.net
---
 kernel/events/core.c | 45 +++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cd..d2b3549 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6253,10 +6253,10 @@ again:
 
 		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
 			/*
-			 * Raced against perf_mmap_close() through
-			 * perf_event_set_output(). Try again, hope for better
-			 * luck.
+			 * Raced against perf_mmap_close(); remove the
+			 * event and try again.
 			 */
+			ring_buffer_attach(event, NULL);
 			mutex_unlock(&event->mmap_mutex);
 			goto again;
 		}
@@ -11825,14 +11825,25 @@ err_size:
 	goto out;
 }
 
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	mutex_lock(a);
+	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
 
-	if (!output_event)
+	if (!output_event) {
+		mutex_lock(&event->mmap_mutex);
 		goto set;
+	}
 
 	/* don't allow circular references */
 	if (event == output_event)
@@ -11870,8 +11881,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	    event->pmu != output_event->pmu)
 		goto out;
 
+	/*
+	 * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
+	 * output_event is already on rb->event_list, and the list iteration
+	 * restarts after every removal, it is guaranteed this new event is
+	 * observed *OR* if output_event is already removed, it's guaranteed we
+	 * observe !rb->mmap_count.
+	 */
+	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
-	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
@@ -11881,6 +11899,12 @@ set:
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
+
+		/* did we race against perf_mmap_close() */
+		if (!atomic_read(&rb->mmap_count)) {
+			ring_buffer_put(rb);
+			goto unlock;
+		}
 	}
 
 	ring_buffer_attach(event, rb);
@@ -11888,20 +11912,13 @@ set:
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
+	if (output_event)
+		mutex_unlock(&output_event->mmap_mutex);
 
 out:
 	return ret;
 }
 
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
-	if (b < a)
-		swap(a, b);
-
-	mutex_lock(a);
-	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 {
 	bool nmi_safe = false;

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-07-14 11:35 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-04 12:00 [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close Yang Jihong
2022-07-04 15:26 ` Peter Zijlstra
2022-07-05  2:07   ` Yang Jihong
2022-07-05 13:07   ` Peter Zijlstra
2022-07-06 12:29     ` Yang Jihong
2022-07-09  2:00       ` Yang Jihong
2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.