linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
@ 2022-07-04 12:00 Yang Jihong
  2022-07-04 15:26 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Yang Jihong @ 2022-07-04 12:00 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, linux-perf-users, linux-kernel
  Cc: yangjihong1

Data race exists between perf_event_set_output and perf_mmap_close.
The scenario is as follows:

                  CPU1                                                       CPU2
                                                                    perf_mmap_close(event2)
                                                                      if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
                                                                        detach_rest = true;
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
  perf_event_set_output(event1, event2)
                                                                      if (!detach_rest)
                                                                        goto out_put;
                                                                      list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
                                                                        ring_buffer_attach(event, NULL)
                                                                      // because event1 has not been added to event2->rb->event_list,
                                                                      // event1->rb is not set to NULL in these loops

    ring_buffer_attach(event1, event2->rb)
      list_add_rcu(&event1->rb_entry, &event2->rb->event_list)

The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:

again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
<SNIP>
                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close() through
                         * perf_event_set_output(). Try again, hope for better
                         * luck.
                         */
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }
<SNIP>

This problem has occurred on the Syzkaller, causing the softlockup. The call stack is as follows:
[ 3666.984385][    C2] watchdog: BUG: soft lockup - CPU#2 stuck for 23s! [syz-executor.2:32404]
[ 3666.986137][    C2] Modules linked in:
[ 3666.989581][    C2] CPU: 2 PID: 32404 Comm: syz-executor.2 Not tainted 5.10.0+ #4
[ 3666.990697][    C2] Hardware name: linux,dummy-virt (DT)
[ 3666.992270][    C2] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[ 3666.993787][    C2] pc : __kasan_check_write+0x0/0x40
[ 3666.994841][    C2] lr : perf_mmap+0x3c8/0xf80
[ 3666.995661][    C2] sp : ffff00001011f8f0
[ 3666.996598][    C2] x29: ffff00001011f8f0 x28: ffff0000cf644868
[ 3666.998488][    C2] x27: ffff000012cad2c0 x26: 0000000000000000
[ 3666.999888][    C2] x25: 0000000000000001 x24: ffff000012cad298
[ 3667.003511][    C2] x23: 0000000000000000 x22: ffff000012cad000
[ 3667.005504][    C2] x21: ffff0000cf644818 x20: ffff0000cf6d2400
[ 3667.006891][    C2] x19: ffff0000cf6d24c0 x18: 0000000000000000
[ 3667.008295][    C2] x17: 0000000000000000 x16: 0000000000000000
[ 3667.009528][    C2] x15: 0000000000000000 x14: 0000000000000000
[ 3667.010658][    C2] x13: 0000000000000000 x12: ffff800002023f17
[ 3667.012169][    C2] x11: 1fffe00002023f16 x10: ffff800002023f16
[ 3667.013780][    C2] x9 : dfffa00000000000 x8 : ffff00001011f8b7
[ 3667.015265][    C2] x7 : 0000000000000001 x6 : ffff800002023f16
[ 3667.016683][    C2] x5 : ffff0000c0f36400 x4 : 0000000000000000
[ 3667.018078][    C2] x3 : ffffa00010000000 x2 : ffffa000119a0000
[ 3667.019343][    C2] x1 : 0000000000000004 x0 : ffff0000cf6d24c0
[ 3667.021276][    C2] Call trace:
[ 3667.022598][    C2]  __kasan_check_write+0x0/0x40
[ 3667.023666][    C2]  __mmap_region+0x7a4/0xc90
[ 3667.024679][    C2]  __do_mmap_mm+0x600/0xa20
[ 3667.025700][    C2]  do_mmap+0x114/0x384
[ 3667.026583][    C2]  vm_mmap_pgoff+0x138/0x230
[ 3667.027532][    C2]  ksys_mmap_pgoff+0x1d8/0x570
[ 3667.028537][    C2]  __arm64_sys_mmap+0xa4/0xd0
[ 3667.029597][    C2]  el0_svc_common.constprop.0+0xf4/0x414
[ 3667.030682][    C2]  do_el0_svc+0x50/0x11c
[ 3667.031545][    C2]  el0_svc+0x20/0x30
[ 3667.032368][    C2]  el0_sync_handler+0xe4/0x1e0
[ 3667.033305][    C2]  el0_sync+0x148/0x180

perf_mmap_close and ring_buffer_attach involve complicated lock recursion.
The solution is to modify ring_buffer_attach function, that is,
after new event is added to rb->event_list, check whether rb->mmap_count is 0,
if the value is 0, another core performs perf_mmap_close operation
on ring buffer during this period. In this case, we set event->rb to NULL.

Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
---

Changes since v1:
 - Modify commit message description.

 kernel/events/core.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..c67c070f7b39 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
 			       struct perf_buffer *rb)
 {
 	struct perf_buffer *old_rb = NULL;
+	struct perf_buffer *new_rb = rb;
 	unsigned long flags;
 
 	WARN_ON_ONCE(event->parent);
@@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
 
 		spin_lock_irqsave(&rb->event_lock, flags);
 		list_add_rcu(&event->rb_entry, &rb->event_list);
+
+		/*
+		 * When perf_mmap_close traverses rb->event_list during
+		 * detach all other events, new event may not be added to
+		 * rb->event_list, let's check again, if rb->mmap_count is 0,
+		 * it indicates that perf_mmap_close is executed.
+		 * Manually delete event from rb->event_list and
+		 * set event->rb to null.
+		 */
+		if (!atomic_read(&rb->mmap_count)) {
+			list_del_rcu(&event->rb_entry);
+			new_rb = NULL;
+		}
+
 		spin_unlock_irqrestore(&rb->event_lock, flags);
 	}
 
@@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
 	if (has_aux(event))
 		perf_event_stop(event, 0);
 
-	rcu_assign_pointer(event->rb, rb);
+	rcu_assign_pointer(event->rb, new_rb);
 
 	if (old_rb) {
 		ring_buffer_put(old_rb);
@@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 			goto unlock;
 	}
 
+	/*
+	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
+	 * the ring buffer is about to be unmapped and cannot be attached.
+	 */
+	if (rb && !atomic_read(&rb->mmap_count))
+		goto unlock;
+
 	ring_buffer_attach(event, rb);
 
 	ret = 0;
-- 
2.30.GIT


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 12:00 [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close Yang Jihong
@ 2022-07-04 15:26 ` Peter Zijlstra
  2022-07-05  2:07   ` Yang Jihong
  2022-07-05 13:07   ` Peter Zijlstra
  0 siblings, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2022-07-04 15:26 UTC (permalink / raw)
  To: Yang Jihong
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> Data race exists between perf_event_set_output and perf_mmap_close.
> The scenario is as follows:
> 
>                   CPU1                                                       CPU2
>                                                                     perf_mmap_close(event2)
>                                                                       if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>                                                                         detach_rest = true;
> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>   perf_event_set_output(event1, event2)
>                                                                       if (!detach_rest)
>                                                                         goto out_put;
>                                                                       list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>                                                                         ring_buffer_attach(event, NULL)
>                                                                       // because event1 has not been added to event2->rb->event_list,
>                                                                       // event1->rb is not set to NULL in these loops
> 
>     ring_buffer_attach(event1, event2->rb)
>       list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
> 
> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
> 
> again:
>         mutex_lock(&event->mmap_mutex);
>         if (event->rb) {
> <SNIP>
>                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>                         /*
>                          * Raced against perf_mmap_close() through
>                          * perf_event_set_output(). Try again, hope for better
>                          * luck.
>                          */
>                         mutex_unlock(&event->mmap_mutex);
>                         goto again;
>                 }
> <SNIP>

Too tired, must look again tomorrow, little feeback below.

>  kernel/events/core.c | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 80782cddb1da..c67c070f7b39 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
>  			       struct perf_buffer *rb)
>  {
>  	struct perf_buffer *old_rb = NULL;
> +	struct perf_buffer *new_rb = rb;
>  	unsigned long flags;
>  
>  	WARN_ON_ONCE(event->parent);
> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>  
>  		spin_lock_irqsave(&rb->event_lock, flags);
>  		list_add_rcu(&event->rb_entry, &rb->event_list);
> +
> +		/*
> +		 * When perf_mmap_close traverses rb->event_list during
> +		 * detach all other events, new event may not be added to
> +		 * rb->event_list, let's check again, if rb->mmap_count is 0,
> +		 * it indicates that perf_mmap_close is executed.
> +		 * Manually delete event from rb->event_list and
> +		 * set event->rb to null.
> +		 */
> +		if (!atomic_read(&rb->mmap_count)) {
> +			list_del_rcu(&event->rb_entry);
> +			new_rb = NULL;
> +		}
> +
>  		spin_unlock_irqrestore(&rb->event_lock, flags);
>  	}
>  
> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
>  	if (has_aux(event))
>  		perf_event_stop(event, 0);
>  
> -	rcu_assign_pointer(event->rb, rb);
> +	rcu_assign_pointer(event->rb, new_rb);
>  
>  	if (old_rb) {
>  		ring_buffer_put(old_rb);

I'm confused by the above hunks; the below will avoid calling
ring_buffer_attach() when !rb->mmap_count, so how can the above ever
execute?

> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>  			goto unlock;
>  	}
>  
> +	/*
> +	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
> +	 * the ring buffer is about to be unmapped and cannot be attached.
> +	 */
> +	if (rb && !atomic_read(&rb->mmap_count))
> +		goto unlock;
> +
>  	ring_buffer_attach(event, rb);
>  
>  	ret = 0;

This is wrong I think, it'll leak ring_buffer_get().


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 15:26 ` Peter Zijlstra
@ 2022-07-05  2:07   ` Yang Jihong
  2022-07-05 13:07   ` Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: Yang Jihong @ 2022-07-05  2:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/4 23:26, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>> Data race exists between perf_event_set_output and perf_mmap_close.
>> The scenario is as follows:
>>
>>                    CPU1                                                       CPU2
>>                                                                      perf_mmap_close(event2)
>>                                                                        if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>                                                                          detach_rest = true;
>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>    perf_event_set_output(event1, event2)
>>                                                                        if (!detach_rest)
>>                                                                          goto out_put;
>>                                                                        list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>                                                                          ring_buffer_attach(event, NULL)
>>                                                                        // because event1 has not been added to event2->rb->event_list,
>>                                                                        // event1->rb is not set to NULL in these loops
>>
>>      ring_buffer_attach(event1, event2->rb)
>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>
>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>
>> again:
>>          mutex_lock(&event->mmap_mutex);
>>          if (event->rb) {
>> <SNIP>
>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>                          /*
>>                           * Raced against perf_mmap_close() through
>>                           * perf_event_set_output(). Try again, hope for better
>>                           * luck.
>>                           */
>>                          mutex_unlock(&event->mmap_mutex);
>>                          goto again;
>>                  }
>> <SNIP>
> 
> Too tired, must look again tomorrow, little feeback below.
Thanks for reviewing this patch. The perf_mmap_close, 
perf_event_set_output, and perf_mmap involve complex data race and lock 
relationships. Therefore, this simple fix is proposed.
> 
>>   kernel/events/core.c | 24 +++++++++++++++++++++++-
>>   1 file changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 80782cddb1da..c67c070f7b39 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -5900,6 +5900,7 @@ static void ring_buffer_attach(struct perf_event *event,
>>   			       struct perf_buffer *rb)
>>   {
>>   	struct perf_buffer *old_rb = NULL;
>> +	struct perf_buffer *new_rb = rb;
>>   	unsigned long flags;
>>   
>>   	WARN_ON_ONCE(event->parent);
>> @@ -5928,6 +5929,20 @@ static void ring_buffer_attach(struct perf_event *event,
>>   
>>   		spin_lock_irqsave(&rb->event_lock, flags);
>>   		list_add_rcu(&event->rb_entry, &rb->event_list);
>> +
>> +		/*
>> +		 * When perf_mmap_close traverses rb->event_list during
>> +		 * detach all other events, new event may not be added to
>> +		 * rb->event_list, let's check again, if rb->mmap_count is 0,
>> +		 * it indicates that perf_mmap_close is executed.
>> +		 * Manually delete event from rb->event_list and
>> +		 * set event->rb to null.
>> +		 */
>> +		if (!atomic_read(&rb->mmap_count)) {
>> +			list_del_rcu(&event->rb_entry);
>> +			new_rb = NULL;
>> +		}
>> +
>>   		spin_unlock_irqrestore(&rb->event_lock, flags);
>>   	}
>>   
>> @@ -5944,7 +5959,7 @@ static void ring_buffer_attach(struct perf_event *event,
>>   	if (has_aux(event))
>>   		perf_event_stop(event, 0);
>>   
>> -	rcu_assign_pointer(event->rb, rb);
>> +	rcu_assign_pointer(event->rb, new_rb);
>>   
>>   	if (old_rb) {
>>   		ring_buffer_put(old_rb);
> 
> I'm confused by the above hunks; the below will avoid calling
> ring_buffer_attach() when !rb->mmap_count, so how can the above ever
> execute?
In this patch, !atomic_read(&rb->mmap_count) is checked before the 
perf_event_set_output function invokes ring_buffer_attach(event, rb). 
Therefore, !atomic_read(&rb->mmap_count) does not need to be checked in 
the ring_buffer_attach function.

Am I right to understand that?

Because there is no lock parallel protection between ioctl(event1, 
PERF_EVENT_IOC_SET_OUTPUT, event2) and perf_mmap_close(event2), they can 
be executed in parallel.

The following scenarios may exist:

                    CPU1 
        CPU2
 
perf_mmap_close(event2)
																	   ...
ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
    perf_event_set_output(event1, event2)
      ...
      if (rb && !atomic_read(&rb->mmap_count))
	   goto unlock;
	 // Here rb->mmap_count = 1, Keep going.
	 ...
	                                                                  if 
(atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
 
  detach_rest = true;
                                                                       ...
 
list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
 
  ring_buffer_attach(event, NULL)
 
  // because event1 has not been added to event2->rb->event_list,
 
  // event1->rb is not set to NULL in these loops
																	  ...
	 ring_buffer_attach(event1, rb)
	   ...
	   list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
	   ...

In this case, the above problems arise.
> 
>> @@ -11883,6 +11898,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>>   			goto unlock;
>>   	}
>>   
>> +	/*
>> +	 * If rb->mmap_count is 0, perf_mmap_close is being executed,
>> +	 * the ring buffer is about to be unmapped and cannot be attached.
>> +	 */
>> +	if (rb && !atomic_read(&rb->mmap_count))
>> +		goto unlock;
>> +
>>   	ring_buffer_attach(event, rb);
>>   
>>   	ret = 0;
> 
> This is wrong I think, it'll leak ring_buffer_get().
Yes, ring_buffer_put(rb) needs to be added before goto unlock.
I'll fix in next version.

Thanks,
Yang
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-04 15:26 ` Peter Zijlstra
  2022-07-05  2:07   ` Yang Jihong
@ 2022-07-05 13:07   ` Peter Zijlstra
  2022-07-06 12:29     ` Yang Jihong
  2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra
  1 sibling, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2022-07-05 13:07 UTC (permalink / raw)
  To: Yang Jihong
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
> > Data race exists between perf_event_set_output and perf_mmap_close.
> > The scenario is as follows:
> > 
> >                   CPU1                                                       CPU2
> >                                                                     perf_mmap_close(event2)
> >                                                                       if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
> >                                                                         detach_rest = true;
> > ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
> >   perf_event_set_output(event1, event2)
> >                                                                       if (!detach_rest)
> >                                                                         goto out_put;
> >                                                                       list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
> >                                                                         ring_buffer_attach(event, NULL)
> >                                                                       // because event1 has not been added to event2->rb->event_list,
> >                                                                       // event1->rb is not set to NULL in these loops
> > 
> >     ring_buffer_attach(event1, event2->rb)
> >       list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
> > 
> > The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
> > If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
> > 
> > again:
> >         mutex_lock(&event->mmap_mutex);
> >         if (event->rb) {
> > <SNIP>
> >                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
> >                         /*
> >                          * Raced against perf_mmap_close() through
> >                          * perf_event_set_output(). Try again, hope for better
> >                          * luck.
> >                          */
> >                         mutex_unlock(&event->mmap_mutex);
> >                         goto again;
> >                 }
> > <SNIP>
> 
> Too tired, must look again tomorrow, little feeback below.

With brain more awake I ended up with the below. Does that work?

---
 kernel/events/core.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d8c335a07db..c9d32d4d2e20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6262,10 +6262,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
 			/*
-			 * Raced against perf_mmap_close() through
-			 * perf_event_set_output(). Try again, hope for better
-			 * luck.
+			 * Raced against perf_mmap_close(); remove the
+			 * event and try again.
 			 */
+			ring_buffer_attach(event, NULL);
 			mutex_unlock(&event->mmap_mutex);
 			goto again;
 		}
@@ -11840,14 +11840,25 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	goto out;
 }
 
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	mutex_lock(a);
+	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
 
-	if (!output_event)
+	if (!output_event) {
+		mutex_lock(&event->mmap_mutex);
 		goto set;
+	}
 
 	/* don't allow circular references */
 	if (event == output_event)
@@ -11885,8 +11896,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	    event->pmu != output_event->pmu)
 		goto out;
 
+	/*
+	 * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
+	 * output_event is already on rb->event_list, and the list iteration
+	 * restarts after every removal, it is guaranteed this new event is
+	 * observed *OR* if output_event is already removed, it's guaranteed we
+	 * observe !rb->mmap_count.
+	 */
+	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
-	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
@@ -11896,6 +11914,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
+
+		/* did we race against perf_mmap_close() */
+		if (!atomic_read(&rb->mmap_count)) {
+			ring_buffer_put(rb);
+			goto unlock;
+		}
 	}
 
 	ring_buffer_attach(event, rb);
@@ -11903,20 +11927,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
+	if (output_event)
+		mutex_unlock(&output_event->mmap_mutex);
 
 out:
 	return ret;
 }
 
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
-	if (b < a)
-		swap(a, b);
-
-	mutex_lock(a);
-	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 {
 	bool nmi_safe = false;

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-05 13:07   ` Peter Zijlstra
@ 2022-07-06 12:29     ` Yang Jihong
  2022-07-09  2:00       ` Yang Jihong
  2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Yang Jihong @ 2022-07-06 12:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/5 21:07, Peter Zijlstra wrote:
> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>> The scenario is as follows:
>>>
>>>                    CPU1                                                       CPU2
>>>                                                                      perf_mmap_close(event2)
>>>                                                                        if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>>                                                                          detach_rest = true;
>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>>    perf_event_set_output(event1, event2)
>>>                                                                        if (!detach_rest)
>>>                                                                          goto out_put;
>>>                                                                        list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>>                                                                          ring_buffer_attach(event, NULL)
>>>                                                                        // because event1 has not been added to event2->rb->event_list,
>>>                                                                        // event1->rb is not set to NULL in these loops
>>>
>>>      ring_buffer_attach(event1, event2->rb)
>>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>
>>> The above data race causes a problem, that is, event1->rb is not NULL, but event1->rb->mmap_count is 0.
>>> If the perf_mmap interface is invoked for the fd of event1, the kernel keeps in the perf_mmap infinite loop:
>>>
>>> again:
>>>          mutex_lock(&event->mmap_mutex);
>>>          if (event->rb) {
>>> <SNIP>
>>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>>                          /*
>>>                           * Raced against perf_mmap_close() through
>>>                           * perf_event_set_output(). Try again, hope for better
>>>                           * luck.
>>>                           */
>>>                          mutex_unlock(&event->mmap_mutex);
>>>                          goto again;
>>>                  }
>>> <SNIP>
>>
>> Too tired, must look again tomorrow, little feeback below.
> 
> With brain more awake I ended up with the below. Does that work?

Yes, I apply the patch on kernel versions 5.10 and mainline,
and it could fixed the problem.

Tested-by: Yang Jihong <yangjihong1@huawei.com>

Thanks,
Yang

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close
  2022-07-06 12:29     ` Yang Jihong
@ 2022-07-09  2:00       ` Yang Jihong
  0 siblings, 0 replies; 7+ messages in thread
From: Yang Jihong @ 2022-07-09  2:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	linux-perf-users, linux-kernel

Hello,

On 2022/7/6 20:29, Yang Jihong wrote:
> Hello,
> 
> On 2022/7/5 21:07, Peter Zijlstra wrote:
>> On Mon, Jul 04, 2022 at 05:26:04PM +0200, Peter Zijlstra wrote:
>>> On Mon, Jul 04, 2022 at 08:00:06PM +0800, Yang Jihong wrote:
>>>> Data race exists between perf_event_set_output and perf_mmap_close.
>>>> The scenario is as follows:
>>>>
>>>>                    
>>>> CPU1                                                       CPU2
>>>>                                                                      
>>>> perf_mmap_close(event2)
>>>>                                                                        
>>>> if (atomic_dec_and_test(&event2->rb->mmap_count)  // mmap_count 1 -> 0
>>>>                                                                          
>>>> detach_rest = true;
>>>> ioctl(event1, PERF_EVENT_IOC_SET_OUTPUT, event2)
>>>>    perf_event_set_output(event1, event2)
>>>>                                                                        
>>>> if (!detach_rest)
>>>>                                                                          
>>>> goto out_put;
>>>>                                                                        
>>>> list_for_each_entry_rcu(event, &event2->rb->event_list, rb_entry)
>>>>                                                                          
>>>> ring_buffer_attach(event, NULL)
>>>>                                                                        
>>>> // because event1 has not been added to event2->rb->event_list,
>>>>                                                                        
>>>> // event1->rb is not set to NULL in these loops
>>>>
>>>>      ring_buffer_attach(event1, event2->rb)
>>>>        list_add_rcu(&event1->rb_entry, &event2->rb->event_list)
>>>>
>>>> The above data race causes a problem, that is, event1->rb is not 
>>>> NULL, but event1->rb->mmap_count is 0.
>>>> If the perf_mmap interface is invoked for the fd of event1, the 
>>>> kernel keeps in the perf_mmap infinite loop:
>>>>
>>>> again:
>>>>          mutex_lock(&event->mmap_mutex);
>>>>          if (event->rb) {
>>>> <SNIP>
>>>>                  if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
>>>>                          /*
>>>>                           * Raced against perf_mmap_close() through
>>>>                           * perf_event_set_output(). Try again, hope 
>>>> for better
>>>>                           * luck.
>>>>                           */
>>>>                          mutex_unlock(&event->mmap_mutex);
>>>>                          goto again;
>>>>                  }
>>>> <SNIP>
>>>
>>> Too tired, must look again tomorrow, little feeback below.
>>
>> With brain more awake I ended up with the below. Does that work?
I have verified that this patch can solve the problem.

Do I submit this patch? Or do you submit it?

Thanks,
Yang

> 
> Yes, I apply the patch on kernel versions 5.10 and mainline,
> and it could fixed the problem.
> 
> Tested-by: Yang Jihong <yangjihong1@huawei.com>
> 
> Thanks,
> Yang
> .

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()
  2022-07-05 13:07   ` Peter Zijlstra
  2022-07-06 12:29     ` Yang Jihong
@ 2022-07-14 11:35     ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2022-07-14 11:35 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Yang Jihong, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     68e3c69803dada336893640110cb87221bb01dcf
Gitweb:        https://git.kernel.org/tip/68e3c69803dada336893640110cb87221bb01dcf
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 05 Jul 2022 15:07:26 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 13 Jul 2022 11:29:12 +02:00

perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()

Yang Jihing reported a race between perf_event_set_output() and
perf_mmap_close():

	CPU1					CPU2

	perf_mmap_close(e2)
	  if (atomic_dec_and_test(&e2->rb->mmap_count)) // 1 - > 0
	    detach_rest = true

						ioctl(e1, IOC_SET_OUTPUT, e2)
						  perf_event_set_output(e1, e2)

	  ...
	  list_for_each_entry_rcu(e, &e2->rb->event_list, rb_entry)
	    ring_buffer_attach(e, NULL);
	    // e1 isn't yet added and
	    // therefore not detached

						    ring_buffer_attach(e1, e2->rb)
						      list_add_rcu(&e1->rb_entry,
								   &e2->rb->event_list)

After this; e1 is attached to an unmapped rb and a subsequent
perf_mmap() will loop forever more:

	again:
		mutex_lock(&e->mmap_mutex);
		if (event->rb) {
			...
			if (!atomic_inc_not_zero(&e->rb->mmap_count)) {
				...
				mutex_unlock(&e->mmap_mutex);
				goto again;
			}
		}

The loop in perf_mmap_close() holds e2->mmap_mutex, while the attach
in perf_event_set_output() holds e1->mmap_mutex. As such there is no
serialization to avoid this race.

Change perf_event_set_output() to take both e1->mmap_mutex and
e2->mmap_mutex to alleviate that problem. Additionally, have the loop
in perf_mmap() detach the rb directly, this avoids having to wait for
the concurrent perf_mmap_close() to get around to doing it to make
progress.

Fixes: 9bb5d40cd93c ("perf: Fix mmap() accounting hole")
Reported-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Yang Jihong <yangjihong1@huawei.com>
Link: https://lkml.kernel.org/r/YsQ3jm2GR38SW7uD@worktop.programming.kicks-ass.net
---
 kernel/events/core.c | 45 +++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cd..d2b3549 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6253,10 +6253,10 @@ again:
 
 		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
 			/*
-			 * Raced against perf_mmap_close() through
-			 * perf_event_set_output(). Try again, hope for better
-			 * luck.
+			 * Raced against perf_mmap_close(); remove the
+			 * event and try again.
 			 */
+			ring_buffer_attach(event, NULL);
 			mutex_unlock(&event->mmap_mutex);
 			goto again;
 		}
@@ -11825,14 +11825,25 @@ err_size:
 	goto out;
 }
 
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	mutex_lock(a);
+	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
 
-	if (!output_event)
+	if (!output_event) {
+		mutex_lock(&event->mmap_mutex);
 		goto set;
+	}
 
 	/* don't allow circular references */
 	if (event == output_event)
@@ -11870,8 +11881,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	    event->pmu != output_event->pmu)
 		goto out;
 
+	/*
+	 * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
+	 * output_event is already on rb->event_list, and the list iteration
+	 * restarts after every removal, it is guaranteed this new event is
+	 * observed *OR* if output_event is already removed, it's guaranteed we
+	 * observe !rb->mmap_count.
+	 */
+	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
 set:
-	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
@@ -11881,6 +11899,12 @@ set:
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
+
+		/* did we race against perf_mmap_close() */
+		if (!atomic_read(&rb->mmap_count)) {
+			ring_buffer_put(rb);
+			goto unlock;
+		}
 	}
 
 	ring_buffer_attach(event, rb);
@@ -11888,20 +11912,13 @@ set:
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
+	if (output_event)
+		mutex_unlock(&output_event->mmap_mutex);
 
 out:
 	return ret;
 }
 
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
-	if (b < a)
-		swap(a, b);
-
-	mutex_lock(a);
-	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 {
 	bool nmi_safe = false;

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-07-14 11:35 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-04 12:00 [PATCH v2] perf/core: Fix data race between perf_event_set_output and perf_mmap_close Yang Jihong
2022-07-04 15:26 ` Peter Zijlstra
2022-07-05  2:07   ` Yang Jihong
2022-07-05 13:07   ` Peter Zijlstra
2022-07-06 12:29     ` Yang Jihong
2022-07-09  2:00       ` Yang Jihong
2022-07-14 11:35     ` [tip: perf/urgent] perf/core: Fix data race between perf_event_set_output() and perf_mmap_close() tip-bot2 for Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).