* [PATCH bpf-next v2] bpf: using rcu_read_lock for bpf_sk_storage_map iterator
@ 2020-09-15 22:35 Yonghong Song
  2020-09-16  5:37 ` Martin KaFai Lau
  0 siblings, 1 reply; 3+ messages in thread
From: Yonghong Song @ 2020-09-15 22:35 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Martin KaFai Lau, Song Liu

If a bucket contains a lot of sockets, then while bpf_iter
is traversing that bucket, concurrent userspace
bpf_map_update_elem() calls and bpf program
bpf_sk_storage_{get,delete}() helpers may experience
undesirable delays as they compete with bpf_iter for the
bucket lock.

Note that the number of buckets for bpf_sk_storage_map
is roughly the same as the number of cpus. So if there
are lots of sockets in the system, each bucket could
contain lots of sockets.
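
For reference, the bucket array is sized roughly like this at map
allocation time (a sketch approximating the allocation path, not
part of this patch; the exact code may differ):

	/* sketch: bucket count scales with the number of possible CPUs */
	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* use at least 2 buckets */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);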

Different use cases may experience different delays.
Here, using the selftest bpf_iter subtest bpf_sk_storage_map,
I hacked the kernel with ktime_get_mono_fast_ns()
to collect how long a bucket stayed locked while the
bpf_iter program traversed it. This way, the maximum
incurred delay was measured w.r.t. the number of
elements in a bucket.
    # elems in each bucket          delay(ns)
      64                            17000
      256                           72512
      2048                          875246
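
The measurement hack was roughly of the following shape (sketch
only, not part of this patch; b and selem are as in the iterator
code, and max_delay is a scratch variable added just for the
experiment):

	u64 start, delay;

	raw_spin_lock_bh(&b->lock);
	start = ktime_get_mono_fast_ns();
	hlist_for_each_entry(selem, &b->list, map_node) {
		/* ... existing per-element work while the bucket is locked ... */
	}
	delay = ktime_get_mono_fast_ns() - start;
	raw_spin_unlock_bh(&b->lock);
	if (delay > max_delay)
		max_delay = delay;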

The potential delays will increase further if a bucket
holds even more elements. Using rcu_read_lock() is a
reasonable compromise here. It may lose some precision,
e.g., the iterator may access stale sockets, but it will
not hurt the performance of bpf programs or user space
applications which also try to get/delete or update map
elements.
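
For context, the kernel-side traversal is driven by a bpf_iter
program along these lines (a rough sketch modeled on the
bpf_sk_storage_map selftest; the program and variable names here
are illustrative, not the actual selftest code):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

__u32 nr_elems = 0;	/* read back by user space after the iteration */

SEC("iter/bpf_sk_storage_map")
int dump_sk_storage(struct bpf_iter__bpf_sk_storage_map *ctx)
{
	struct sock *sk = ctx->sk;
	__u32 *val = ctx->value;

	/* called once per (socket, storage value) pair in the map */
	if (!sk || !val)
		return 0;

	nr_elems++;
	return 0;
}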

Cc: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 net/core/bpf_sk_storage.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

Changelog:
 v1 -> v2:
   - added some performance numbers. (Song)
   - tried to silence some sparse complaints, but some remain, such as
       context imbalance in "..." - different lock contexts for basic block
     where the code is too hard for sparse to analyze. (Jakub)

diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 4a86ea34f29e..4fc6b03d3639 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -678,6 +678,7 @@ struct bpf_iter_seq_sk_storage_map_info {
 static struct bpf_local_storage_elem *
 bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 				 struct bpf_local_storage_elem *prev_selem)
+	__acquires(RCU) __releases(RCU)
 {
 	struct bpf_local_storage *sk_storage;
 	struct bpf_local_storage_elem *selem;
@@ -701,7 +702,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 		if (!selem) {
 			/* not found, unlock and go to the next bucket */
 			b = &smap->buckets[bucket_id++];
-			raw_spin_unlock_bh(&b->lock);
+			rcu_read_unlock();
 			skip_elems = 0;
 			break;
 		}
@@ -715,7 +716,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 
 	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
 		b = &smap->buckets[i];
-		raw_spin_lock_bh(&b->lock);
+		rcu_read_lock();
 		count = 0;
 		hlist_for_each_entry(selem, &b->list, map_node) {
 			sk_storage = rcu_dereference_raw(selem->local_storage);
@@ -726,7 +727,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 			}
 			count++;
 		}
-		raw_spin_unlock_bh(&b->lock);
+		rcu_read_unlock();
 		skip_elems = 0;
 	}
 
@@ -801,18 +802,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
 }
 
 static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
 {
-	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
-	struct bpf_local_storage_map *smap;
-	struct bpf_local_storage_map_bucket *b;
-
-	if (!v) {
+	if (!v)
 		(void)__bpf_sk_storage_map_seq_show(seq, v);
-	} else {
-		smap = (struct bpf_local_storage_map *)info->map;
-		b = &smap->buckets[info->bucket_id];
-		raw_spin_unlock_bh(&b->lock);
-	}
+	else
+		rcu_read_unlock();
 }
 
 static int bpf_iter_init_sk_storage_map(void *priv_data,
-- 
2.24.1



* Re: [PATCH bpf-next v2] bpf: using rcu_read_lock for bpf_sk_storage_map iterator
  2020-09-15 22:35 [PATCH bpf-next v2] bpf: using rcu_read_lock for bpf_sk_storage_map iterator Yonghong Song
@ 2020-09-16  5:37 ` Martin KaFai Lau
  2020-09-16  6:09   ` Yonghong Song
  0 siblings, 1 reply; 3+ messages in thread
From: Martin KaFai Lau @ 2020-09-16  5:37 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team, Song Liu

On Tue, Sep 15, 2020 at 03:35:07PM -0700, Yonghong Song wrote:
> If a bucket contains a lot of sockets, then while bpf_iter
> is traversing that bucket, concurrent userspace
> bpf_map_update_elem() calls and bpf program
> bpf_sk_storage_{get,delete}() helpers may experience
> undesirable delays as they compete with bpf_iter for the
> bucket lock.
> 
> Note that the number of buckets for bpf_sk_storage_map
> is roughly the same as the number of cpus. So if there
> are lots of sockets in the system, each bucket could
> contain lots of sockets.
> 
> Different use cases may experience different delays.
> Here, using the selftest bpf_iter subtest bpf_sk_storage_map,
> I hacked the kernel with ktime_get_mono_fast_ns()
> to collect how long a bucket stayed locked while the
> bpf_iter program traversed it. This way, the maximum
> incurred delay was measured w.r.t. the number of
> elements in a bucket.
>     # elems in each bucket          delay(ns)
>       64                            17000
>       256                           72512
>       2048                          875246
> 
> The potential delays will increase further if a bucket
> holds even more elements. Using rcu_read_lock() is a
> reasonable compromise here. It may lose some precision,
> e.g., the iterator may access stale sockets, but it will
> not hurt the performance of bpf programs or user space
> applications which also try to get/delete or update map
> elements.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Acked-by: Song Liu <songliubraving@fb.com>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  net/core/bpf_sk_storage.c | 21 ++++++++-------------
>  1 file changed, 8 insertions(+), 13 deletions(-)
> 
> Changelog:
>  v1 -> v2:
>    - added some performance numbers. (Song)
>    - tried to silence some sparse complaints, but some remain, such as
>        context imbalance in "..." - different lock contexts for basic block
>      where the code is too hard for sparse to analyze. (Jakub)
> 
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index 4a86ea34f29e..4fc6b03d3639 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -678,6 +678,7 @@ struct bpf_iter_seq_sk_storage_map_info {
>  static struct bpf_local_storage_elem *
>  bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>  				 struct bpf_local_storage_elem *prev_selem)
> +	__acquires(RCU) __releases(RCU)
>  {
>  	struct bpf_local_storage *sk_storage;
>  	struct bpf_local_storage_elem *selem;
> @@ -701,7 +702,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>  		if (!selem) {
>  			/* not found, unlock and go to the next bucket */
>  			b = &smap->buckets[bucket_id++];
> -			raw_spin_unlock_bh(&b->lock);
> +			rcu_read_unlock();
>  			skip_elems = 0;
>  			break;
>  		}
> @@ -715,7 +716,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>  
>  	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
>  		b = &smap->buckets[i];
> -		raw_spin_lock_bh(&b->lock);
> +		rcu_read_lock();
>  		count = 0;
>  		hlist_for_each_entry(selem, &b->list, map_node) {
hlist_for_each_entry_rcu()

>  			sk_storage = rcu_dereference_raw(selem->local_storage);
Does lockdep complain without "_raw"?


* Re: [PATCH bpf-next v2] bpf: using rcu_read_lock for bpf_sk_storage_map iterator
  2020-09-16  5:37 ` Martin KaFai Lau
@ 2020-09-16  6:09   ` Yonghong Song
  0 siblings, 0 replies; 3+ messages in thread
From: Yonghong Song @ 2020-09-16  6:09 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team, Song Liu



On 9/15/20 10:37 PM, Martin KaFai Lau wrote:
> On Tue, Sep 15, 2020 at 03:35:07PM -0700, Yonghong Song wrote:
>> If a bucket contains a lot of sockets, then while bpf_iter
>> is traversing that bucket, concurrent userspace
>> bpf_map_update_elem() calls and bpf program
>> bpf_sk_storage_{get,delete}() helpers may experience
>> undesirable delays as they compete with bpf_iter for the
>> bucket lock.
>>
>> Note that the number of buckets for bpf_sk_storage_map
>> is roughly the same as the number of cpus. So if there
>> are lots of sockets in the system, each bucket could
>> contain lots of sockets.
>>
>> Different use cases may experience different delays.
>> Here, using the selftest bpf_iter subtest bpf_sk_storage_map,
>> I hacked the kernel with ktime_get_mono_fast_ns()
>> to collect how long a bucket stayed locked while the
>> bpf_iter program traversed it. This way, the maximum
>> incurred delay was measured w.r.t. the number of
>> elements in a bucket.
>>      # elems in each bucket          delay(ns)
>>        64                            17000
>>        256                           72512
>>        2048                          875246
>>
>> The potential delays will increase further if a bucket
>> holds even more elements. Using rcu_read_lock() is a
>> reasonable compromise here. It may lose some precision,
>> e.g., the iterator may access stale sockets, but it will
>> not hurt the performance of bpf programs or user space
>> applications which also try to get/delete or update map
>> elements.
>>
>> Cc: Martin KaFai Lau <kafai@fb.com>
>> Acked-by: Song Liu <songliubraving@fb.com>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   net/core/bpf_sk_storage.c | 21 ++++++++-------------
>>   1 file changed, 8 insertions(+), 13 deletions(-)
>>
>> Changelog:
>>   v1 -> v2:
>>     - added some performance numbers. (Song)
>>     - tried to silence some sparse complaints, but some remain, such as
>>         context imbalance in "..." - different lock contexts for basic block
>>       where the code is too hard for sparse to analyze. (Jakub)
>>
>> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
>> index 4a86ea34f29e..4fc6b03d3639 100644
>> --- a/net/core/bpf_sk_storage.c
>> +++ b/net/core/bpf_sk_storage.c
>> @@ -678,6 +678,7 @@ struct bpf_iter_seq_sk_storage_map_info {
>>   static struct bpf_local_storage_elem *
>>   bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>>   				 struct bpf_local_storage_elem *prev_selem)
>> +	__acquires(RCU) __releases(RCU)
>>   {
>>   	struct bpf_local_storage *sk_storage;
>>   	struct bpf_local_storage_elem *selem;
>> @@ -701,7 +702,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>>   		if (!selem) {
>>   			/* not found, unlock and go to the next bucket */
>>   			b = &smap->buckets[bucket_id++];
>> -			raw_spin_unlock_bh(&b->lock);
>> +			rcu_read_unlock();
>>   			skip_elems = 0;
>>   			break;
>>   		}
>> @@ -715,7 +716,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
>>   
>>   	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
>>   		b = &smap->buckets[i];
>> -		raw_spin_lock_bh(&b->lock);
>> +		rcu_read_lock();
>>   		count = 0;
>>   		hlist_for_each_entry(selem, &b->list, map_node) {
> hlist_for_each_entry_rcu()

Good catch!

> 
>>   			sk_storage = rcu_dereference_raw(selem->local_storage);
> Does lockdep complain without "_raw"?

It didn't. But I think using rcu_dereference() is better as it provides 
extra checking.

Will fix and send v3.
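
The bucket walk would then look roughly like this (a sketch of the
planned change, not the final v3 code):

	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];
		rcu_read_lock();
		count = 0;
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			/* rcu_dereference() instead of _raw so the RCU
			 * usage is checked under CONFIG_PROVE_RCU
			 */
			sk_storage = rcu_dereference(selem->local_storage);
			/* same skip/count logic as in the v2 diff above */
		}
		rcu_read_unlock();
		skip_elems = 0;
	}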

