BPF Archive on lore.kernel.org
 help / color / Atom feed
* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
       [not found] ` <20210108231950.3844417-2-songliubraving@fb.com>
@ 2021-01-11  6:27   ` Yonghong Song
  2021-01-11 10:17     ` KP Singh
  2021-01-11 10:14   ` KP Singh
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 29+ messages in thread
From: Yonghong Song @ 2021-01-11  6:27 UTC (permalink / raw)
  To: Song Liu, bpf, netdev, linux-kernel
  Cc: mingo, peterz, ast, daniel, andrii, john.fastabend, kpsingh,
	kernel-team, haoluo, kernel test robot



On 1/8/21 3:19 PM, Song Liu wrote:
> To access per-task data, BPF program typically creates a hash table with
> pid as the key. This is not ideal because:
>   1. The use need to estimate requires size of the hash table, with may be
>      inaccurate;
>   2. Big hash tables are slow;
>   3. To clean up the data properly during task terminations, the user need
>      to write code.
> 
> Task local storage overcomes these issues and becomes a better option for
> these per-task data. Task local storage is only available to BPF_LSM. Now
> enable it for tracing programs.
> 
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   include/linux/bpf.h            |  7 +++++++
>   include/linux/bpf_lsm.h        | 22 ----------------------
>   include/linux/bpf_types.h      |  2 +-
>   include/linux/sched.h          |  5 +++++
>   kernel/bpf/Makefile            |  3 +--
>   kernel/bpf/bpf_local_storage.c | 28 +++++++++++++++++-----------
>   kernel/bpf/bpf_lsm.c           |  4 ----
>   kernel/bpf/bpf_task_storage.c  | 26 ++++++--------------------
>   kernel/fork.c                  |  5 +++++
>   kernel/trace/bpf_trace.c       |  4 ++++
>   10 files changed, 46 insertions(+), 60 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 07cb5d15e7439..cf16548f28f7b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1480,6 +1480,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
>   struct bpf_link *bpf_link_by_id(u32 id);
>   
>   const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
> +void bpf_task_storage_free(struct task_struct *task);
>   #else /* !CONFIG_BPF_SYSCALL */
>   static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>   {
> @@ -1665,6 +1666,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>   {
>   	return NULL;
>   }
> +
> +static inline void bpf_task_storage_free(struct task_struct *task)
> +{
> +}
>   #endif /* CONFIG_BPF_SYSCALL */
>   
>   static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
> @@ -1860,6 +1865,8 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
>   extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
>   extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
>   extern const struct bpf_func_proto bpf_sock_from_file_proto;
> +extern const struct bpf_func_proto bpf_task_storage_get_proto;
> +extern const struct bpf_func_proto bpf_task_storage_delete_proto;
>   
>   const struct bpf_func_proto *bpf_tracing_func_proto(
>   	enum bpf_func_id func_id, const struct bpf_prog *prog);
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index 0d1c33ace3987..479c101546ad1 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode(
>   	return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
>   }
>   
> -static inline struct bpf_storage_blob *bpf_task(
> -	const struct task_struct *task)
> -{
> -	if (unlikely(!task->security))
> -		return NULL;
> -
> -	return task->security + bpf_lsm_blob_sizes.lbs_task;
> -}
> -
>   extern const struct bpf_func_proto bpf_inode_storage_get_proto;
>   extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> -extern const struct bpf_func_proto bpf_task_storage_get_proto;
> -extern const struct bpf_func_proto bpf_task_storage_delete_proto;
>   void bpf_inode_storage_free(struct inode *inode);
> -void bpf_task_storage_free(struct task_struct *task);
>   
>   #else /* !CONFIG_BPF_LSM */
>   
> @@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode(
>   	return NULL;
>   }
>   
> -static inline struct bpf_storage_blob *bpf_task(
> -	const struct task_struct *task)
> -{
> -	return NULL;
> -}
> -
>   static inline void bpf_inode_storage_free(struct inode *inode)
>   {
>   }
>   
> -static inline void bpf_task_storage_free(struct task_struct *task)
> -{
> -}
> -
>   #endif /* CONFIG_BPF_LSM */
>   
>   #endif /* _LINUX_BPF_LSM_H */
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 99f7fd657d87a..b9edee336d804 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -109,8 +109,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
>   #endif
>   #ifdef CONFIG_BPF_LSM
>   BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
> -BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
>   #endif
> +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
>   #if defined(CONFIG_XDP_SOCKETS)
>   BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 51d535b69bd6f..4a173defa2010 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -42,6 +42,7 @@ struct audit_context;
>   struct backing_dev_info;
>   struct bio_list;
>   struct blk_plug;
> +struct bpf_local_storage;
>   struct capture_control;
>   struct cfs_rq;
>   struct fs_struct;
> @@ -1348,6 +1349,10 @@ struct task_struct {
>   	/* Used by LSM modules for access restriction: */
>   	void				*security;
>   #endif
> +#ifdef CONFIG_BPF_SYSCALL
> +	/* Used by BPF task local storage */
> +	struct bpf_local_storage	*bpf_storage;
> +#endif

I remember there was a discussion where KP initially wanted to put
bpf_local_storage in task_struct, but later changed to
use the lsm security blob, as his use case was mostly for lsm. Does anybody
remember the details of that discussion? I just want to be
sure what concerns people had with putting bpf_local_storage
in task_struct, and whether the use case presented by
Song justifies it.

>   
>   #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>   	unsigned long			lowest_stack;
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index d1249340fd6ba..ca995fdfa45e7 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -8,9 +8,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>   
>   obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_task_storage.o
>   obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
> -obj-${CONFIG_BPF_LSM}	  += bpf_task_storage.o
>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>   obj-$(CONFIG_BPF_JIT) += trampoline.o
>   obj-$(CONFIG_BPF_SYSCALL) += btf.o
[...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
       [not found] ` <20210108231950.3844417-2-songliubraving@fb.com>
  2021-01-11  6:27   ` [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs Yonghong Song
@ 2021-01-11 10:14   ` KP Singh
  2021-01-11 23:16     ` Song Liu
  2021-01-11 17:16   ` Yonghong Song
  2021-01-11 18:56   ` Martin KaFai Lau
  3 siblings, 1 reply; 29+ messages in thread
From: KP Singh @ 2021-01-11 10:14 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot

On Sat, Jan 9, 2021 at 12:35 AM Song Liu <songliubraving@fb.com> wrote:
>
> To access per-task data, BPF program typically creates a hash table with
> pid as the key. This is not ideal because:
>  1. The use need to estimate requires size of the hash table, with may be
>     inaccurate;
>  2. Big hash tables are slow;
>  3. To clean up the data properly during task terminations, the user need
>     to write code.
>
> Task local storage overcomes these issues and becomes a better option for
> these per-task data. Task local storage is only available to BPF_LSM. Now
> enable it for tracing programs.

Also mention here that you changed the pointer from being part of the
security blob to a dedicated member in the task struct. I assume this is
because you want to use it without CONFIG_BPF_LSM?

>

Can you also mention the reasons for changing the
raw_spin_lock_bh to raw_spin_lock_irqsave in the commit log?


> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/linux/bpf.h            |  7 +++++++
>  include/linux/bpf_lsm.h        | 22 ----------------------
>  include/linux/bpf_types.h      |  2 +-
>  include/linux/sched.h          |  5 +++++
>  kernel/bpf/Makefile            |  3 +--
>  kernel/bpf/bpf_local_storage.c | 28 +++++++++++++++++-----------
>  kernel/bpf/bpf_lsm.c           |  4 ----
>  kernel/bpf/bpf_task_storage.c  | 26 ++++++--------------------
>  kernel/fork.c                  |  5 +++++
>  kernel/trace/bpf_trace.c       |  4 ++++
>  10 files changed, 46 insertions(+), 60 deletions(-)
>

[...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11  6:27   ` [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs Yonghong Song
@ 2021-01-11 10:17     ` KP Singh
  2021-01-11 15:56       ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: KP Singh @ 2021-01-11 10:17 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Song Liu, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot

On Mon, Jan 11, 2021 at 7:27 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 1/8/21 3:19 PM, Song Liu wrote:
> > To access per-task data, BPF program typically creates a hash table with
> > pid as the key. This is not ideal because:
> >   1. The use need to estimate requires size of the hash table, with may be
> >      inaccurate;
> >   2. Big hash tables are slow;
> >   3. To clean up the data properly during task terminations, the user need
> >      to write code.
> >
> > Task local storage overcomes these issues and becomes a better option for
> > these per-task data. Task local storage is only available to BPF_LSM. Now
> > enable it for tracing programs.
> >
> > Reported-by: kernel test robot <lkp@intel.com>
> > Signed-off-by: Song Liu <songliubraving@fb.com>
> > ---

[...]

> >   struct cfs_rq;
> >   struct fs_struct;
> > @@ -1348,6 +1349,10 @@ struct task_struct {
> >       /* Used by LSM modules for access restriction: */
> >       void                            *security;
> >   #endif
> > +#ifdef CONFIG_BPF_SYSCALL
> > +     /* Used by BPF task local storage */
> > +     struct bpf_local_storage        *bpf_storage;
> > +#endif
>
> I remembered there is a discussion where KP initially wanted to put
> bpf_local_storage in task_struct, but later on changed to
> use in lsm as his use case mostly for lsm. Did anybody
> remember the details of the discussion? Just want to be
> sure what is the concern people has with putting bpf_local_storage
> in task_struct and whether the use case presented by
> Song will justify it.
>

If I recall correctly, the discussion was about inode local storage and
it was decided to use the security blob since the use-case was only LSM
programs. Since we now plan to use it in tracing,
detangling the dependency from CONFIG_BPF_LSM
sounds logical to me.


> >
> >   #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
> >       unsigned long                   lowest_stack;
> > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> > index d1249340fd6ba..ca995fdfa45e7 100644
> > --- a/kernel/bpf/Makefile
> > +++ b/kernel/bpf/Makefile
> > @@ -8,9 +8,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
> >
> >   obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
> >   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> > -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> > +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_task_storage.o
> >   obj-${CONFIG_BPF_LSM}         += bpf_inode_storage.o
> > -obj-${CONFIG_BPF_LSM}          += bpf_task_storage.o
> >   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> >   obj-$(CONFIG_BPF_JIT) += trampoline.o
> >   obj-$(CONFIG_BPF_SYSCALL) += btf.o
> [...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 10:17     ` KP Singh
@ 2021-01-11 15:56       ` Yonghong Song
  0 siblings, 0 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-11 15:56 UTC (permalink / raw)
  To: KP Singh
  Cc: Song Liu, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot



On 1/11/21 2:17 AM, KP Singh wrote:
> On Mon, Jan 11, 2021 at 7:27 AM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 1/8/21 3:19 PM, Song Liu wrote:
>>> To access per-task data, BPF program typically creates a hash table with
>>> pid as the key. This is not ideal because:
>>>    1. The use need to estimate requires size of the hash table, with may be
>>>       inaccurate;
>>>    2. Big hash tables are slow;
>>>    3. To clean up the data properly during task terminations, the user need
>>>       to write code.
>>>
>>> Task local storage overcomes these issues and becomes a better option for
>>> these per-task data. Task local storage is only available to BPF_LSM. Now
>>> enable it for tracing programs.
>>>
>>> Reported-by: kernel test robot <lkp@intel.com>
>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>> ---
> 
> [...]
> 
>>>    struct cfs_rq;
>>>    struct fs_struct;
>>> @@ -1348,6 +1349,10 @@ struct task_struct {
>>>        /* Used by LSM modules for access restriction: */
>>>        void                            *security;
>>>    #endif
>>> +#ifdef CONFIG_BPF_SYSCALL
>>> +     /* Used by BPF task local storage */
>>> +     struct bpf_local_storage        *bpf_storage;
>>> +#endif
>>
>> I remembered there is a discussion where KP initially wanted to put
>> bpf_local_storage in task_struct, but later on changed to
>> use in lsm as his use case mostly for lsm. Did anybody
>> remember the details of the discussion? Just want to be
>> sure what is the concern people has with putting bpf_local_storage
>> in task_struct and whether the use case presented by
>> Song will justify it.
>>
> 
> If I recall correctly, the discussion was about inode local storage and
> it was decided to use the security blob since the use-case was only LSM
> programs. Since we now plan to use it in tracing,
> detangling the dependency from CONFIG_BPF_LSM
> sounds logical to me.

Sounds good. Thanks for explanation.

> 
> 
>>>
>>>    #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
>>>        unsigned long                   lowest_stack;
>>> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
>>> index d1249340fd6ba..ca995fdfa45e7 100644
>>> --- a/kernel/bpf/Makefile
>>> +++ b/kernel/bpf/Makefile
>>> @@ -8,9 +8,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>>>
>>>    obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
>>>    obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>>> -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
>>> +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_task_storage.o
>>>    obj-${CONFIG_BPF_LSM}         += bpf_inode_storage.o
>>> -obj-${CONFIG_BPF_LSM}          += bpf_task_storage.o
>>>    obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>>>    obj-$(CONFIG_BPF_JIT) += trampoline.o
>>>    obj-$(CONFIG_BPF_SYSCALL) += btf.o
>> [...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
       [not found] ` <20210108231950.3844417-2-songliubraving@fb.com>
  2021-01-11  6:27   ` [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs Yonghong Song
  2021-01-11 10:14   ` KP Singh
@ 2021-01-11 17:16   ` Yonghong Song
  2021-01-11 18:56   ` Martin KaFai Lau
  3 siblings, 0 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-11 17:16 UTC (permalink / raw)
  To: Song Liu, bpf, netdev, linux-kernel
  Cc: mingo, peterz, ast, daniel, andrii, john.fastabend, kpsingh,
	kernel-team, haoluo, kernel test robot



On 1/8/21 3:19 PM, Song Liu wrote:
> To access per-task data, BPF program typically creates a hash table with
> pid as the key. This is not ideal because:
>   1. The use need to estimate requires size of the hash table, with may be
>      inaccurate;
>   2. Big hash tables are slow;
>   3. To clean up the data properly during task terminations, the user need
>      to write code.
> 
> Task local storage overcomes these issues and becomes a better option for
> these per-task data. Task local storage is only available to BPF_LSM. Now
> enable it for tracing programs.
> 
> Reported-by: kernel test robot <lkp@intel.com>

The kernel test robot did not report the whole patch, so I think we
should drop this tag.

> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   include/linux/bpf.h            |  7 +++++++
>   include/linux/bpf_lsm.h        | 22 ----------------------
>   include/linux/bpf_types.h      |  2 +-
>   include/linux/sched.h          |  5 +++++
>   kernel/bpf/Makefile            |  3 +--
>   kernel/bpf/bpf_local_storage.c | 28 +++++++++++++++++-----------
>   kernel/bpf/bpf_lsm.c           |  4 ----
>   kernel/bpf/bpf_task_storage.c  | 26 ++++++--------------------
>   kernel/fork.c                  |  5 +++++
>   kernel/trace/bpf_trace.c       |  4 ++++
>   10 files changed, 46 insertions(+), 60 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 07cb5d15e7439..cf16548f28f7b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1480,6 +1480,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
>   struct bpf_link *bpf_link_by_id(u32 id);
>   
>   const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
> +void bpf_task_storage_free(struct task_struct *task);
>   #else /* !CONFIG_BPF_SYSCALL */
>   static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>   {
> @@ -1665,6 +1666,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>   {
>   	return NULL;
>   }
> +
> +static inline void bpf_task_storage_free(struct task_struct *task)
> +{
> +}
>   #endif /* CONFIG_BPF_SYSCALL */
[...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
       [not found] ` <20210108231950.3844417-3-songliubraving@fb.com>
@ 2021-01-11 17:30   ` Yonghong Song
  2021-01-11 17:44     ` KP Singh
  2021-01-11 22:49     ` Song Liu
  2021-01-12  7:06   ` Andrii Nakryiko
  1 sibling, 2 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-11 17:30 UTC (permalink / raw)
  To: Song Liu, bpf, netdev, linux-kernel
  Cc: mingo, peterz, ast, daniel, andrii, john.fastabend, kpsingh,
	kernel-team, haoluo



On 1/8/21 3:19 PM, Song Liu wrote:
> Task local storage is enabled for tracing programs. Add a test for it
> without CONFIG_BPF_LSM.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   .../bpf/prog_tests/test_task_local_storage.c  | 34 +++++++++++++++++
>   .../selftests/bpf/progs/task_local_storage.c  | 37 +++++++++++++++++++
>   2 files changed, 71 insertions(+)
>   create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
>   create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> new file mode 100644
> index 0000000000000..7de7a154ebbe6
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> @@ -0,0 +1,34 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */

2020 -> 2021

> +
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <test_progs.h>
> +#include "task_local_storage.skel.h"
> +
> +static unsigned int duration;
> +
> +void test_test_task_local_storage(void)
> +{
> +	struct task_local_storage *skel;
> +	const int count = 10;
> +	int i, err;
> +
> +	skel = task_local_storage__open_and_load();
> +

Extra line is unnecessary here.

> +	if (CHECK(!skel, "skel_open_and_load", "skeleton open and load failed\n"))
> +		return;
> +
> +	err = task_local_storage__attach(skel);
> +

ditto.

> +	if (CHECK(err, "skel_attach", "skeleton attach failed\n"))
> +		goto out;
> +
> +	for (i = 0; i < count; i++)
> +		usleep(1000);

Would a smaller usleep value work? If so, I recommend using a
smaller value here to reduce the test_progs running time.

> +	CHECK(skel->bss->value < count, "task_local_storage_value",
> +	      "task local value too small\n");
> +
> +out:
> +	task_local_storage__destroy(skel);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c
> new file mode 100644
> index 0000000000000..807255c5c162d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c
> @@ -0,0 +1,37 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */

2020 -> 2021

> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +struct local_data {
> +	__u64 val;
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct local_data);
> +} task_storage_map SEC(".maps");
> +
> +int value = 0;
> +
> +SEC("tp_btf/sched_switch")
> +int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
> +	     struct task_struct *next)
> +{
> +	struct local_data *storage;

Is it possible to do some filtering based on the test_progs pid,
so that the bpf_task_storage_get below is only called for the test_progs
process? This is more targeted: it avoids counter contributions from
other, unrelated processes and makes the test_task_local_storage.c result
comparison more meaningful.

> +
> +	storage = bpf_task_storage_get(&task_storage_map,
> +				       next, 0,
> +				       BPF_LOCAL_STORAGE_GET_F_CREATE);
> +	if (storage) {
> +		storage->val++;
> +		value = storage->val;
> +	}
> +	return 0;
> +}
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 3/4] bpf: runqslower: prefer use local vmlinux
       [not found] ` <20210108231950.3844417-4-songliubraving@fb.com>
@ 2021-01-11 17:37   ` Yonghong Song
  0 siblings, 0 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-11 17:37 UTC (permalink / raw)
  To: Song Liu, bpf, netdev, linux-kernel
  Cc: mingo, peterz, ast, daniel, andrii, john.fastabend, kpsingh,
	kernel-team, haoluo



On 1/8/21 3:19 PM, Song Liu wrote:
> Update the Makefile to prefer using ../../../vmlinux, which has latest
> definitions for vmlinux.h
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   tools/bpf/runqslower/Makefile | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile
> index 4d5ca54fcd4c8..306f1ce5a97b2 100644
> --- a/tools/bpf/runqslower/Makefile
> +++ b/tools/bpf/runqslower/Makefile
> @@ -19,7 +19,8 @@ CFLAGS := -g -Wall
>   
>   # Try to detect best kernel BTF source
>   KERNEL_REL := $(shell uname -r)
> -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL)
> +VMLINUX_BTF_PATHS := ../../../vmlinux /sys/kernel/btf/vmlinux \
> +	/boot/vmlinux-$(KERNEL_REL)

selftests/bpf Makefile has:

VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)                            \
                      $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)    \
                      ../../../../vmlinux                                \
                      /sys/kernel/btf/vmlinux                            \
                      /boot/vmlinux-$(shell uname -r)

If you intend to add ../../../vmlinux, I think we should also
add $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux).

>   VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword			       \
>   					  $(wildcard $(VMLINUX_BTF_PATHS))))
>   
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
  2021-01-11 17:30   ` [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for " Yonghong Song
@ 2021-01-11 17:44     ` KP Singh
  2021-01-11 22:50       ` Song Liu
  2021-01-11 22:49     ` Song Liu
  1 sibling, 1 reply; 29+ messages in thread
From: KP Singh @ 2021-01-11 17:44 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Song Liu, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo

On Mon, Jan 11, 2021 at 6:31 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 1/8/21 3:19 PM, Song Liu wrote:
> > Task local storage is enabled for tracing programs. Add a test for it
> > without CONFIG_BPF_LSM.

Can you also explain what the test does in the commit log?

It would also be nicer to have a somewhat more realistic selftest which
represents a simple tracing + task local storage use case.

> >
> > Signed-off-by: Song Liu <songliubraving@fb.com>
> > ---
> >   .../bpf/prog_tests/test_task_local_storage.c  | 34 +++++++++++++++++
> >   .../selftests/bpf/progs/task_local_storage.c  | 37 +++++++++++++++++++
> >   2 files changed, 71 insertions(+)
> >   create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> >   create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> > new file mode 100644
> > index 0000000000000..7de7a154ebbe6
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> > @@ -0,0 +1,34 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2020 Facebook */
>
> 2020 -> 2021
>
> > +
> > +#include <sys/types.h>
> > +#include <unistd.h>
> > +#include <test_progs.h>
> > +#include "task_local_storage.skel.h"
> > +
> > +static unsigned int duration;
> > +
> > +void test_test_task_local_storage(void)
> > +{
> > +     struct task_local_storage *skel;
> > +     const int count = 10;
> > +     int i, err;
> > +
> > +     skel = task_local_storage__open_and_load();
> > +
>
> Extra line is unnecessary here.
>
> > +     if (CHECK(!skel, "skel_open_and_load", "skeleton open and load failed\n"))
> > +             return;
> > +
> > +     err = task_local_storage__attach(skel);
> > +
>
> ditto.
>
> > +     if (CHECK(err, "skel_attach", "skeleton attach failed\n"))
> > +             goto out;
> > +
> > +     for (i = 0; i < count; i++)
> > +             usleep(1000);
>
> Does a smaller usleep value will work? If it is, recommend to have a
> smaller value here to reduce test_progs running time.
>
> > +     CHECK(skel->bss->value < count, "task_local_storage_value",
> > +           "task local value too small\n");

[...]

> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2020 Facebook */
>
> 2020 -> 2021
>
> > +
> > +#include "vmlinux.h"
> > +#include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_tracing.h>
> > +
> > +char _license[] SEC("license") = "GPL";

[...]

> > +{
> > +     struct local_data *storage;
>
> If it possible that we do some filtering based on test_progs pid
> so below bpf_task_storage_get is only called for test_progs process?
> This is more targeted and can avoid counter contributions from
> other unrelated processes and make test_task_local_storage.c result
> comparison more meaningful.

Indeed, have a look at the monitored_pid approach some of the LSM programs
do.

>
> > +
> > +     storage = bpf_task_storage_get(&task_storage_map,
> > +                                    next, 0,
> > +                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
> > +     if (storage) {
> > +             storage->val++;
> > +             value = storage->val;
> > +     }
> > +     return 0;
> > +}
> >

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
       [not found] ` <20210108231950.3844417-5-songliubraving@fb.com>
@ 2021-01-11 17:49   ` Yonghong Song
  2021-01-11 22:54     ` Song Liu
  0 siblings, 1 reply; 29+ messages in thread
From: Yonghong Song @ 2021-01-11 17:49 UTC (permalink / raw)
  To: Song Liu, bpf, netdev, linux-kernel
  Cc: mingo, peterz, ast, daniel, andrii, john.fastabend, kpsingh,
	kernel-team, haoluo



On 1/8/21 3:19 PM, Song Liu wrote:
> Replace hashtab with task local storage in runqslower. This improves the
> performance of these BPF programs. The following table summarizes average
> runtime of these programs, in nanoseconds:
> 
>                            task-local   hash-prealloc   hash-no-prealloc
> handle__sched_wakeup             125             340               3124
> handle__sched_wakeup_new        2812            1510               2998
> handle__sched_switch             151             208                991
> 
> Note that, task local storage gives better performance than hashtab for
> handle__sched_wakeup and handle__sched_switch. On the other hand, for
> handle__sched_wakeup_new, task local storage is slower than hashtab with
> prealloc. This is because handle__sched_wakeup_new accesses the data for
> the first time, so it has to allocate the data for task local storage.
> Once the initial allocation is done, subsequent accesses, as those in
> handle__sched_wakeup, are much faster with task local storage. If we
> disable hashtab prealloc, task local storage is much faster for all 3
> functions.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>   tools/bpf/runqslower/runqslower.bpf.c | 26 +++++++++++++++-----------
>   1 file changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
> index 1f18a409f0443..c4de4179a0a17 100644
> --- a/tools/bpf/runqslower/runqslower.bpf.c
> +++ b/tools/bpf/runqslower/runqslower.bpf.c
> @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
>   const volatile pid_t targ_pid = 0;
>   
>   struct {
> -	__uint(type, BPF_MAP_TYPE_HASH);
> -	__uint(max_entries, 10240);
> -	__type(key, u32);
> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
>   	__type(value, u64);
>   } start SEC(".maps");
>   
> @@ -25,15 +25,19 @@ struct {
>   
>   /* record enqueue timestamp */
>   __always_inline
> -static int trace_enqueue(u32 tgid, u32 pid)
> +static int trace_enqueue(struct task_struct *t)
>   {
> -	u64 ts;
> +	u32 pid = t->pid;
> +	u64 ts, *ptr;
>   
>   	if (!pid || (targ_pid && targ_pid != pid))
>   		return 0;
>   
>   	ts = bpf_ktime_get_ns();
> -	bpf_map_update_elem(&start, &pid, &ts, 0);
> +	ptr = bpf_task_storage_get(&start, t, 0,
> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
> +	if (ptr)
> +		*ptr = ts;
>   	return 0;
>   }
>   
> @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx)
>   	/* TP_PROTO(struct task_struct *p) */
>   	struct task_struct *p = (void *)ctx[0];
>   
> -	return trace_enqueue(p->tgid, p->pid);
> +	return trace_enqueue(p);
>   }
>   
>   SEC("tp_btf/sched_wakeup_new")
> @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx)
>   	/* TP_PROTO(struct task_struct *p) */
>   	struct task_struct *p = (void *)ctx[0];
>   
> -	return trace_enqueue(p->tgid, p->pid);
> +	return trace_enqueue(p);
>   }
>   
>   SEC("tp_btf/sched_switch")
> @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx)
>   
>   	/* ivcsw: treat like an enqueue event and store timestamp */
>   	if (prev->state == TASK_RUNNING)
> -		trace_enqueue(prev->tgid, prev->pid);
> +		trace_enqueue(prev);
>   
>   	pid = next->pid;
>   
>   	/* fetch timestamp and calculate delta */
> -	tsp = bpf_map_lookup_elem(&start, &pid);
> +	tsp = bpf_task_storage_get(&start, next, 0, 0);
>   	if (!tsp)
>   		return 0;   /* missed enqueue */

Previously, the hash table could overflow, so we may have missed an
enqueue. Here, with task local storage, is it possible to add additional
pid filtering at the beginning of handle__sched_switch, so that a missed
enqueue here can be treated as an error?

>   
> @@ -91,7 +95,7 @@ int handle__sched_switch(u64 *ctx)
>   	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
>   			      &event, sizeof(event));
>   
> -	bpf_map_delete_elem(&start, &pid);
> +	bpf_task_storage_delete(&start, next);
>   	return 0;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
       [not found] ` <20210108231950.3844417-2-songliubraving@fb.com>
                     ` (2 preceding siblings ...)
  2021-01-11 17:16   ` Yonghong Song
@ 2021-01-11 18:56   ` Martin KaFai Lau
  2021-01-11 21:35     ` KP Singh
  2021-01-11 23:41     ` Song Liu
  3 siblings, 2 replies; 29+ messages in thread
From: Martin KaFai Lau @ 2021-01-11 18:56 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, linux-kernel, mingo, peterz, ast, daniel, andrii,
	john.fastabend, kpsingh, kernel-team, haoluo, kernel test robot

On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:

[ ... ]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index dd5aedee99e73..9bd47ad2b26f1 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>  {
>  	struct bpf_local_storage *local_storage;
>  	bool free_local_storage = false;
> +	unsigned long flags;
>  
>  	if (unlikely(!selem_linked_to_storage(selem)))
>  		/* selem has already been unlinked from sk */
>  		return;
>  
>  	local_storage = rcu_dereference(selem->local_storage);
> -	raw_spin_lock_bh(&local_storage->lock);
> +	raw_spin_lock_irqsave(&local_storage->lock, flags);
It would be useful to have a few words in the commit message on this change
for future reference.

Please also remove the in_irq() check from bpf_sk_storage.c
to avoid confusion in the future.  It probably should
be in a separate patch.

[ ... ]

> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index 4ef1959a78f27..f654b56907b69 100644
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7425b3224891d..3d65c8ebfd594 100644
[ ... ]

> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -96,6 +96,7 @@
>  #include <linux/kasan.h>
>  #include <linux/scs.h>
>  #include <linux/io_uring.h>
> +#include <linux/bpf.h>
>  
>  #include <asm/pgalloc.h>
>  #include <linux/uaccess.h>
> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>  	cgroup_free(tsk);
>  	task_numa_free(tsk, true);
>  	security_task_free(tsk);
> +	bpf_task_storage_free(tsk);
>  	exit_creds(tsk);
If exit_creds() is traced by a bpf program and that program is doing
bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
a new task storage will be created after bpf_task_storage_free().

I recall there was an earlier discussion with KP, and KP mentioned that
BPF_LSM will not be called with a task that is going away.
It seems enabling bpf task storage in bpf tracing will break
this assumption and needs to be addressed?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 18:56   ` Martin KaFai Lau
@ 2021-01-11 21:35     ` KP Singh
  2021-01-11 21:58       ` Martin KaFai Lau
  2021-01-11 23:41     ` Song Liu
  1 sibling, 1 reply; 29+ messages in thread
From: KP Singh @ 2021-01-11 21:35 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: Song Liu, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot

On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>
> [ ... ]
>
> > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > index dd5aedee99e73..9bd47ad2b26f1 100644
> > --- a/kernel/bpf/bpf_local_storage.c
> > +++ b/kernel/bpf/bpf_local_storage.c
> > @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> >  {
> >       struct bpf_local_storage *local_storage;
> >       bool free_local_storage = false;
> > +     unsigned long flags;
> >
> >       if (unlikely(!selem_linked_to_storage(selem)))
> >               /* selem has already been unlinked from sk */
> >               return;
> >
> >       local_storage = rcu_dereference(selem->local_storage);
> > -     raw_spin_lock_bh(&local_storage->lock);
> > +     raw_spin_lock_irqsave(&local_storage->lock, flags);
> It will be useful to have a few words in commit message on this change
> for future reference purpose.
>
> Please also remove the in_irq() check from bpf_sk_storage.c
> to avoid confusion in the future.  It probably should
> be in a separate patch.
>
> [ ... ]
>
> > diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> > index 4ef1959a78f27..f654b56907b69 100644
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 7425b3224891d..3d65c8ebfd594 100644
> [ ... ]
>
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -96,6 +96,7 @@
> >  #include <linux/kasan.h>
> >  #include <linux/scs.h>
> >  #include <linux/io_uring.h>
> > +#include <linux/bpf.h>
> >
> >  #include <asm/pgalloc.h>
> >  #include <linux/uaccess.h>
> > @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
> >       cgroup_free(tsk);
> >       task_numa_free(tsk, true);
> >       security_task_free(tsk);
> > +     bpf_task_storage_free(tsk);
> >       exit_creds(tsk);
> If exit_creds() is traced by a bpf and this bpf is doing
> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
> new task storage will be created after bpf_task_storage_free().
>
> I recalled there was an earlier discussion with KP and KP mentioned
> BPF_LSM will not be called with a task that is going away.
> It seems enabling bpf task storage in bpf tracing will break
> this assumption and needs to be addressed?

For tracing programs, I think we will need an allow list where
task local storage can be used.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 21:35     ` KP Singh
@ 2021-01-11 21:58       ` Martin KaFai Lau
  2021-01-11 23:45         ` Song Liu
  0 siblings, 1 reply; 29+ messages in thread
From: Martin KaFai Lau @ 2021-01-11 21:58 UTC (permalink / raw)
  To: KP Singh
  Cc: Song Liu, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot

On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> >
> > [ ... ]
> >
> > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > index dd5aedee99e73..9bd47ad2b26f1 100644
> > > --- a/kernel/bpf/bpf_local_storage.c
> > > +++ b/kernel/bpf/bpf_local_storage.c
> > > @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> > >  {
> > >       struct bpf_local_storage *local_storage;
> > >       bool free_local_storage = false;
> > > +     unsigned long flags;
> > >
> > >       if (unlikely(!selem_linked_to_storage(selem)))
> > >               /* selem has already been unlinked from sk */
> > >               return;
> > >
> > >       local_storage = rcu_dereference(selem->local_storage);
> > > -     raw_spin_lock_bh(&local_storage->lock);
> > > +     raw_spin_lock_irqsave(&local_storage->lock, flags);
> > It will be useful to have a few words in commit message on this change
> > for future reference purpose.
> >
> > Please also remove the in_irq() check from bpf_sk_storage.c
> > to avoid confusion in the future.  It probably should
> > be in a separate patch.
> >
> > [ ... ]
> >
> > > diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> > > index 4ef1959a78f27..f654b56907b69 100644
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 7425b3224891d..3d65c8ebfd594 100644
> > [ ... ]
> >
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -96,6 +96,7 @@
> > >  #include <linux/kasan.h>
> > >  #include <linux/scs.h>
> > >  #include <linux/io_uring.h>
> > > +#include <linux/bpf.h>
> > >
> > >  #include <asm/pgalloc.h>
> > >  #include <linux/uaccess.h>
> > > @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
> > >       cgroup_free(tsk);
> > >       task_numa_free(tsk, true);
> > >       security_task_free(tsk);
> > > +     bpf_task_storage_free(tsk);
> > >       exit_creds(tsk);
> > If exit_creds() is traced by a bpf and this bpf is doing
> > bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
> > new task storage will be created after bpf_task_storage_free().
> >
> > I recalled there was an earlier discussion with KP and KP mentioned
> > BPF_LSM will not be called with a task that is going away.
> > It seems enabling bpf task storage in bpf tracing will break
> > this assumption and needs to be addressed?
> 
> For tracing programs, I think we will need an allow list where
> task local storage can be used.
Instead of a whitelist, can refcount_inc_not_zero(&tsk->usage) be used?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
  2021-01-11 17:30   ` [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for " Yonghong Song
  2021-01-11 17:44     ` KP Singh
@ 2021-01-11 22:49     ` Song Liu
  1 sibling, 0 replies; 29+ messages in thread
From: Song Liu @ 2021-01-11 22:49 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, netdev, lkml, Ingo Molnar, Peter Ziljstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	john fastabend, KP Singh, Kernel Team, haoluo



> On Jan 11, 2021, at 9:30 AM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 1/8/21 3:19 PM, Song Liu wrote:
>> Task local storage is enabled for tracing programs. Add a test for it
>> without CONFIG_BPF_LSM.
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>>  .../bpf/prog_tests/test_task_local_storage.c  | 34 +++++++++++++++++
>>  .../selftests/bpf/progs/task_local_storage.c  | 37 +++++++++++++++++++
>>  2 files changed, 71 insertions(+)
>>  create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
>>  create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c
>> diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
>> new file mode 100644
>> index 0000000000000..7de7a154ebbe6
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
>> @@ -0,0 +1,34 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2020 Facebook */
> 
> 2020 -> 2021
> 
>> +
>> +#include <sys/types.h>
>> +#include <unistd.h>
>> +#include <test_progs.h>
>> +#include "task_local_storage.skel.h"
>> +
>> +static unsigned int duration;
>> +
>> +void test_test_task_local_storage(void)
>> +{
>> +	struct task_local_storage *skel;
>> +	const int count = 10;
>> +	int i, err;
>> +
>> +	skel = task_local_storage__open_and_load();
>> +
> 
> Extra line is unnecessary here.
> 
>> +	if (CHECK(!skel, "skel_open_and_load", "skeleton open and load failed\n"))
>> +		return;
>> +
>> +	err = task_local_storage__attach(skel);
>> +
> 
> ditto.
> 
>> +	if (CHECK(err, "skel_attach", "skeleton attach failed\n"))
>> +		goto out;
>> +
>> +	for (i = 0; i < count; i++)
>> +		usleep(1000);
> 
> Does a smaller usleep value will work? If it is, recommend to have a smaller value here to reduce test_progs running time.

I thought 10ms total was acceptable. But yeah, smaller value should still work. 

> 
>> +	CHECK(skel->bss->value < count, "task_local_storage_value",
>> +	      "task local value too small\n");
>> +
>> +out:
>> +	task_local_storage__destroy(skel);
>> +}
>> diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c
>> new file mode 100644
>> index 0000000000000..807255c5c162d
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c
>> @@ -0,0 +1,37 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2020 Facebook */
> 
> 2020 -> 2021
> 
>> +
>> +#include "vmlinux.h"
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +struct local_data {
>> +	__u64 val;
>> +};
>> +
>> +struct {
>> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>> +	__uint(map_flags, BPF_F_NO_PREALLOC);
>> +	__type(key, int);
>> +	__type(value, struct local_data);
>> +} task_storage_map SEC(".maps");
>> +
>> +int value = 0;
>> +
>> +SEC("tp_btf/sched_switch")
>> +int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
>> +	     struct task_struct *next)
>> +{
>> +	struct local_data *storage;
> 
> If it possible that we do some filtering based on test_progs pid
> so below bpf_task_storage_get is only called for test_progs process?
> This is more targeted and can avoid counter contributions from
> other unrelated processes and make test_task_local_storage.c result
> comparison more meaningful.

Make sense. Will fix in the next version. 

> 
>> +
>> +	storage = bpf_task_storage_get(&task_storage_map,
>> +				       next, 0,
>> +				       BPF_LOCAL_STORAGE_GET_F_CREATE);
>> +	if (storage) {
>> +		storage->val++;
>> +		value = storage->val;
>> +	}
>> +	return 0;
>> +}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
  2021-01-11 17:44     ` KP Singh
@ 2021-01-11 22:50       ` Song Liu
  0 siblings, 0 replies; 29+ messages in thread
From: Song Liu @ 2021-01-11 22:50 UTC (permalink / raw)
  To: KP Singh
  Cc: Yonghong Song, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo



> On Jan 11, 2021, at 9:44 AM, KP Singh <kpsingh@kernel.org> wrote:
> 
> On Mon, Jan 11, 2021 at 6:31 PM Yonghong Song <yhs@fb.com> wrote:
>> 
>> 
>> 
>> On 1/8/21 3:19 PM, Song Liu wrote:
>>> Task local storage is enabled for tracing programs. Add a test for it
>>> without CONFIG_BPF_LSM.
> 
> Can you also explain what the test does in the commit log?
> 
> It would also be nicer to have a somewhat more realistic selftest which
> represents a simple tracing + task local storage use case.

Let me try to make this more realistic. 

Thanks,
Song

[...]


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
  2021-01-11 17:49   ` [PATCH bpf-next 4/4] bpf: runqslower: use task local storage Yonghong Song
@ 2021-01-11 22:54     ` Song Liu
  2021-01-12  3:24       ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: Song Liu @ 2021-01-11 22:54 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Networking, linux-kernel, mingo, peterz, ast, daniel,
	andrii, john.fastabend, kpsingh, Kernel Team, haoluo



> On Jan 11, 2021, at 9:49 AM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 1/8/21 3:19 PM, Song Liu wrote:
>> Replace hashtab with task local storage in runqslower. This improves the
>> performance of these BPF programs. The following table summarizes average
>> runtime of these programs, in nanoseconds:
>>                           task-local   hash-prealloc   hash-no-prealloc
>> handle__sched_wakeup             125             340               3124
>> handle__sched_wakeup_new        2812            1510               2998
>> handle__sched_switch             151             208                991
>> Note that, task local storage gives better performance than hashtab for
>> handle__sched_wakeup and handle__sched_switch. On the other hand, for
>> handle__sched_wakeup_new, task local storage is slower than hashtab with
>> prealloc. This is because handle__sched_wakeup_new accesses the data for
>> the first time, so it has to allocate the data for task local storage.
>> Once the initial allocation is done, subsequent accesses, as those in
>> handle__sched_wakeup, are much faster with task local storage. If we
>> disable hashtab prealloc, task local storage is much faster for all 3
>> functions.
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> ---
>>  tools/bpf/runqslower/runqslower.bpf.c | 26 +++++++++++++++-----------
>>  1 file changed, 15 insertions(+), 11 deletions(-)
>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
>> index 1f18a409f0443..c4de4179a0a17 100644
>> --- a/tools/bpf/runqslower/runqslower.bpf.c
>> +++ b/tools/bpf/runqslower/runqslower.bpf.c
>> @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
>>  const volatile pid_t targ_pid = 0;
>>    struct {
>> -	__uint(type, BPF_MAP_TYPE_HASH);
>> -	__uint(max_entries, 10240);
>> -	__type(key, u32);
>> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>> +	__uint(map_flags, BPF_F_NO_PREALLOC);
>> +	__type(key, int);
>>  	__type(value, u64);
>>  } start SEC(".maps");
>>  @@ -25,15 +25,19 @@ struct {
>>    /* record enqueue timestamp */
>>  __always_inline
>> -static int trace_enqueue(u32 tgid, u32 pid)
>> +static int trace_enqueue(struct task_struct *t)
>>  {
>> -	u64 ts;
>> +	u32 pid = t->pid;
>> +	u64 ts, *ptr;
>>    	if (!pid || (targ_pid && targ_pid != pid))
>>  		return 0;
>>    	ts = bpf_ktime_get_ns();
>> -	bpf_map_update_elem(&start, &pid, &ts, 0);
>> +	ptr = bpf_task_storage_get(&start, t, 0,
>> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
>> +	if (ptr)
>> +		*ptr = ts;
>>  	return 0;
>>  }
>>  @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx)
>>  	/* TP_PROTO(struct task_struct *p) */
>>  	struct task_struct *p = (void *)ctx[0];
>>  -	return trace_enqueue(p->tgid, p->pid);
>> +	return trace_enqueue(p);
>>  }
>>    SEC("tp_btf/sched_wakeup_new")
>> @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx)
>>  	/* TP_PROTO(struct task_struct *p) */
>>  	struct task_struct *p = (void *)ctx[0];
>>  -	return trace_enqueue(p->tgid, p->pid);
>> +	return trace_enqueue(p);
>>  }
>>    SEC("tp_btf/sched_switch")
>> @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx)
>>    	/* ivcsw: treat like an enqueue event and store timestamp */
>>  	if (prev->state == TASK_RUNNING)
>> -		trace_enqueue(prev->tgid, prev->pid);
>> +		trace_enqueue(prev);
>>    	pid = next->pid;
>>    	/* fetch timestamp and calculate delta */
>> -	tsp = bpf_map_lookup_elem(&start, &pid);
>> +	tsp = bpf_task_storage_get(&start, next, 0, 0);
>>  	if (!tsp)
>>  		return 0;   /* missed enqueue */
> 
> Previously, hash table may overflow so we may have missed enqueue.
> Here with task local storage, is it possible to add additional pid
> filtering in the beginning of handle__sched_switch such that
> missed enqueue here can be treated as an error?

IIUC, hashtab overflow is not the only reason for a missed enqueue. If the
wakeup (which calls trace_enqueue) happens before runqslower starts, we
may still get a missed enqueue in sched_switch, no?

Thanks,
Song

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 10:14   ` KP Singh
@ 2021-01-11 23:16     ` Song Liu
  0 siblings, 0 replies; 29+ messages in thread
From: Song Liu @ 2021-01-11 23:16 UTC (permalink / raw)
  To: KP Singh
  Cc: bpf, Networking, open list, Ingo Molnar, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot



> On Jan 11, 2021, at 2:14 AM, KP Singh <kpsingh@kernel.org> wrote:
> 
> On Sat, Jan 9, 2021 at 12:35 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> To access per-task data, BPF program typically creates a hash table with
>> pid as the key. This is not ideal because:
>> 1. The use need to estimate requires size of the hash table, with may be
>>    inaccurate;
>> 2. Big hash tables are slow;
>> 3. To clean up the data properly during task terminations, the user need
>>    to write code.
>> 
>> Task local storage overcomes these issues and becomes a better option for
>> these per-task data. Task local storage is only available to BPF_LSM. Now
>> enable it for tracing programs.
> 
> Also mention here that you change the pointer from being a security blob to a
> dedicated member in the task struct. I assume this is because you want to
> use it without CONFIG_BPF_LSM?

Yes, exactly. I will add this to the commit log. 

> 
>> 
> 
> Can you also mention the reasons for changing the
> raw_spin_lock_bh to raw_spin_lock_irqsave in the commit log?

This is because we will use these in irq context. I will add this to the 
commit log. 

Thanks,
Song


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 18:56   ` Martin KaFai Lau
  2021-01-11 21:35     ` KP Singh
@ 2021-01-11 23:41     ` Song Liu
  2021-01-12 18:21       ` Martin KaFai Lau
  1 sibling, 1 reply; 29+ messages in thread
From: Song Liu @ 2021-01-11 23:41 UTC (permalink / raw)
  To: Martin Lau
  Cc: bpf, Networking, open list, mingo, peterz, ast, daniel, andrii,
	john.fastabend, kpsingh, Kernel Team, haoluo, kernel test robot



> On Jan 11, 2021, at 10:56 AM, Martin Lau <kafai@fb.com> wrote:
> 
> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> 
> [ ... ]
> 
>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>> index dd5aedee99e73..9bd47ad2b26f1 100644
>> --- a/kernel/bpf/bpf_local_storage.c
>> +++ b/kernel/bpf/bpf_local_storage.c
>> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>> {
>> 	struct bpf_local_storage *local_storage;
>> 	bool free_local_storage = false;
>> +	unsigned long flags;
>> 
>> 	if (unlikely(!selem_linked_to_storage(selem)))
>> 		/* selem has already been unlinked from sk */
>> 		return;
>> 
>> 	local_storage = rcu_dereference(selem->local_storage);
>> -	raw_spin_lock_bh(&local_storage->lock);
>> +	raw_spin_lock_irqsave(&local_storage->lock, flags);
> It will be useful to have a few words in commit message on this change
> for future reference purpose.
> 
> Please also remove the in_irq() check from bpf_sk_storage.c
> to avoid confusion in the future.  It probably should
> be in a separate patch.

Do you mean we allow bpf_sk_storage_get_tracing() and 
bpf_sk_storage_delete_tracing() in irq context? Like

diff --git i/net/core/bpf_sk_storage.c w/net/core/bpf_sk_storage.c
index 4edd033e899c0..14dd5e3c67402 100644
--- i/net/core/bpf_sk_storage.c
+++ w/net/core/bpf_sk_storage.c
@@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
           struct sock *, sk)
 {
-       if (in_irq() || in_nmi())
+       if (in_nmi())
                return -EPERM;

        return ____bpf_sk_storage_delete(map, sk);

[...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 21:58       ` Martin KaFai Lau
@ 2021-01-11 23:45         ` Song Liu
  2021-01-12 16:32           ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: Song Liu @ 2021-01-11 23:45 UTC (permalink / raw)
  To: Martin Lau
  Cc: KP Singh, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot



> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
> 
> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>> 
>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>> 
>>> [ ... ]
>>> 
>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>> +++ b/kernel/bpf/bpf_local_storage.c
>>>> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>>>> {
>>>>      struct bpf_local_storage *local_storage;
>>>>      bool free_local_storage = false;
>>>> +     unsigned long flags;
>>>> 
>>>>      if (unlikely(!selem_linked_to_storage(selem)))
>>>>              /* selem has already been unlinked from sk */
>>>>              return;
>>>> 
>>>>      local_storage = rcu_dereference(selem->local_storage);
>>>> -     raw_spin_lock_bh(&local_storage->lock);
>>>> +     raw_spin_lock_irqsave(&local_storage->lock, flags);
>>> It will be useful to have a few words in commit message on this change
>>> for future reference purpose.
>>> 
>>> Please also remove the in_irq() check from bpf_sk_storage.c
>>> to avoid confusion in the future.  It probably should
>>> be in a separate patch.
>>> 
>>> [ ... ]
>>> 
>>>> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
>>>> index 4ef1959a78f27..f654b56907b69 100644
>>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>>> index 7425b3224891d..3d65c8ebfd594 100644
>>> [ ... ]
>>> 
>>>> --- a/kernel/fork.c
>>>> +++ b/kernel/fork.c
>>>> @@ -96,6 +96,7 @@
>>>> #include <linux/kasan.h>
>>>> #include <linux/scs.h>
>>>> #include <linux/io_uring.h>
>>>> +#include <linux/bpf.h>
>>>> 
>>>> #include <asm/pgalloc.h>
>>>> #include <linux/uaccess.h>
>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>      cgroup_free(tsk);
>>>>      task_numa_free(tsk, true);
>>>>      security_task_free(tsk);
>>>> +     bpf_task_storage_free(tsk);
>>>>      exit_creds(tsk);
>>> If exit_creds() is traced by a bpf and this bpf is doing
>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>> new task storage will be created after bpf_task_storage_free().
>>> 
>>> I recalled there was an earlier discussion with KP and KP mentioned
>>> BPF_LSM will not be called with a task that is going away.
>>> It seems enabling bpf task storage in bpf tracing will break
>>> this assumption and needs to be addressed?
>> 
>> For tracing programs, I think we will need an allow list where
>> task local storage can be used.
> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?

I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:

diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
index f654b56907b69..93d01b0a010e6 100644
--- i/kernel/bpf/bpf_task_storage.c
+++ w/kernel/bpf/bpf_task_storage.c
@@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
         * by an RCU read-side critical section.
         */
        if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+               if (!refcount_inc_not_zero(&task->usage))
+                       return -EBUSY;
+
                sdata = bpf_local_storage_update(
                        task, (struct bpf_local_storage_map *)map, value,
                        BPF_NOEXIST);

But where shall we add the refcount_dec()? IIUC, we cannot add it to 
__put_task_struct(). 

Thanks,
Song

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
  2021-01-11 22:54     ` Song Liu
@ 2021-01-12  3:24       ` Yonghong Song
  2021-01-12  7:14         ` Andrii Nakryiko
  0 siblings, 1 reply; 29+ messages in thread
From: Yonghong Song @ 2021-01-12  3:24 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, linux-kernel, mingo, peterz, ast, daniel,
	andrii, john.fastabend, kpsingh, Kernel Team, haoluo



On 1/11/21 2:54 PM, Song Liu wrote:
> 
> 
>> On Jan 11, 2021, at 9:49 AM, Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 1/8/21 3:19 PM, Song Liu wrote:
>>> Replace hashtab with task local storage in runqslower. This improves the
>>> performance of these BPF programs. The following table summarizes average
>>> runtime of these programs, in nanoseconds:
>>>                            task-local   hash-prealloc   hash-no-prealloc
>>> handle__sched_wakeup             125             340               3124
>>> handle__sched_wakeup_new        2812            1510               2998
>>> handle__sched_switch             151             208                991
>>> Note that, task local storage gives better performance than hashtab for
>>> handle__sched_wakeup and handle__sched_switch. On the other hand, for
>>> handle__sched_wakeup_new, task local storage is slower than hashtab with
>>> prealloc. This is because handle__sched_wakeup_new accesses the data for
>>> the first time, so it has to allocate the data for task local storage.
>>> Once the initial allocation is done, subsequent accesses, as those in
>>> handle__sched_wakeup, are much faster with task local storage. If we
>>> disable hashtab prealloc, task local storage is much faster for all 3
>>> functions.
>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>> ---
>>>   tools/bpf/runqslower/runqslower.bpf.c | 26 +++++++++++++++-----------
>>>   1 file changed, 15 insertions(+), 11 deletions(-)
>>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
>>> index 1f18a409f0443..c4de4179a0a17 100644
>>> --- a/tools/bpf/runqslower/runqslower.bpf.c
>>> +++ b/tools/bpf/runqslower/runqslower.bpf.c
>>> @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
>>>   const volatile pid_t targ_pid = 0;
>>>     struct {
>>> -	__uint(type, BPF_MAP_TYPE_HASH);
>>> -	__uint(max_entries, 10240);
>>> -	__type(key, u32);
>>> +	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>>> +	__uint(map_flags, BPF_F_NO_PREALLOC);
>>> +	__type(key, int);
>>>   	__type(value, u64);
>>>   } start SEC(".maps");
>>>   @@ -25,15 +25,19 @@ struct {
>>>     /* record enqueue timestamp */
>>>   __always_inline
>>> -static int trace_enqueue(u32 tgid, u32 pid)
>>> +static int trace_enqueue(struct task_struct *t)
>>>   {
>>> -	u64 ts;
>>> +	u32 pid = t->pid;
>>> +	u64 ts, *ptr;
>>>     	if (!pid || (targ_pid && targ_pid != pid))
>>>   		return 0;
>>>     	ts = bpf_ktime_get_ns();
>>> -	bpf_map_update_elem(&start, &pid, &ts, 0);
>>> +	ptr = bpf_task_storage_get(&start, t, 0,
>>> +				   BPF_LOCAL_STORAGE_GET_F_CREATE);
>>> +	if (ptr)
>>> +		*ptr = ts;
>>>   	return 0;
>>>   }
>>>   @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx)
>>>   	/* TP_PROTO(struct task_struct *p) */
>>>   	struct task_struct *p = (void *)ctx[0];
>>>   -	return trace_enqueue(p->tgid, p->pid);
>>> +	return trace_enqueue(p);
>>>   }
>>>     SEC("tp_btf/sched_wakeup_new")
>>> @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx)
>>>   	/* TP_PROTO(struct task_struct *p) */
>>>   	struct task_struct *p = (void *)ctx[0];
>>>   -	return trace_enqueue(p->tgid, p->pid);
>>> +	return trace_enqueue(p);
>>>   }
>>>     SEC("tp_btf/sched_switch")
>>> @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx)
>>>     	/* ivcsw: treat like an enqueue event and store timestamp */
>>>   	if (prev->state == TASK_RUNNING)
>>> -		trace_enqueue(prev->tgid, prev->pid);
>>> +		trace_enqueue(prev);
>>>     	pid = next->pid;
>>>     	/* fetch timestamp and calculate delta */
>>> -	tsp = bpf_map_lookup_elem(&start, &pid);
>>> +	tsp = bpf_task_storage_get(&start, next, 0, 0);
>>>   	if (!tsp)
>>>   		return 0;   /* missed enqueue */
>>
>> Previously, hash table may overflow so we may have missed enqueue.
>> Here with task local storage, is it possible to add additional pid
>> filtering in the beginning of handle__sched_switch such that
>> missed enqueue here can be treated as an error?
> 
> IIUC, hashtab overflow is not the only reason of missed enqueue. If the
> wakeup (which calls trace_enqueue) happens before runqslower starts, we
> may still get missed enqueue in sched_switch, no?

The wakeup won't happen before runqslower starts, since runqslower needs
to start and perform the attachment first before trace_enqueue() can run.

In the current implementation, trace_enqueue() will run for any non-0
pid before the test_progs tgid is set, and for any non-0 pid matching the
test_progs tgid once it is set, so this should be okay if we do filtering
in handle__sched_switch. Maybe you can do an experiment to confirm whether
my point is correct.

> 
> Thanks,
> Song
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
       [not found] ` <20210108231950.3844417-3-songliubraving@fb.com>
  2021-01-11 17:30   ` [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for " Yonghong Song
@ 2021-01-12  7:06   ` Andrii Nakryiko
  1 sibling, 0 replies; 29+ messages in thread
From: Andrii Nakryiko @ 2021-01-12  7:06 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, open list, Ingo Molnar, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	john fastabend, KP Singh, Kernel Team, Hao Luo

On Fri, Jan 8, 2021 at 3:30 PM Song Liu <songliubraving@fb.com> wrote:
>
> Task local storage is enabled for tracing programs. Add a test for it
> without CONFIG_BPF_LSM.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  .../bpf/prog_tests/test_task_local_storage.c  | 34 +++++++++++++++++
>  .../selftests/bpf/progs/task_local_storage.c  | 37 +++++++++++++++++++
>  2 files changed, 71 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
>  create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> new file mode 100644
> index 0000000000000..7de7a154ebbe6
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c
> @@ -0,0 +1,34 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <test_progs.h>
> +#include "task_local_storage.skel.h"
> +
> +static unsigned int duration;
> +
> +void test_test_task_local_storage(void)

nit: let's not use "test_test_" tautology. It can be simply
test_task_local_storage() and this file itself should be called
task_local_storage.c.

> +{
> +       struct task_local_storage *skel;
> +       const int count = 10;
> +       int i, err;
> +
> +       skel = task_local_storage__open_and_load();
> +
> +       if (CHECK(!skel, "skel_open_and_load", "skeleton open and load failed\n"))

btw, can you ASSERT_OK_PTR(skel, "skel_open_and_load"); and save
yourself a bunch of typing

> +               return;
> +
> +       err = task_local_storage__attach(skel);
> +
> +       if (CHECK(err, "skel_attach", "skeleton attach failed\n"))
> +               goto out;

similarly, ASSERT_OK(err, "skel_attach")

> +
> +       for (i = 0; i < count; i++)
> +               usleep(1000);
> +       CHECK(skel->bss->value < count, "task_local_storage_value",
> +             "task local value too small\n");
> +
> +out:
> +       task_local_storage__destroy(skel);
> +}

[...]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
  2021-01-12  3:24       ` Yonghong Song
@ 2021-01-12  7:14         ` Andrii Nakryiko
  2021-01-12  7:33           ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: Andrii Nakryiko @ 2021-01-12  7:14 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Song Liu, bpf, Networking, linux-kernel, mingo, peterz, ast,
	daniel, andrii, john.fastabend, kpsingh, Kernel Team, haoluo

On Mon, Jan 11, 2021 at 7:24 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 1/11/21 2:54 PM, Song Liu wrote:
> >
> >
> >> On Jan 11, 2021, at 9:49 AM, Yonghong Song <yhs@fb.com> wrote:
> >>
> >>
> >>
> >> On 1/8/21 3:19 PM, Song Liu wrote:
> >>> Replace hashtab with task local storage in runqslower. This improves the
> >>> performance of these BPF programs. The following table summarizes average
> >>> runtime of these programs, in nanoseconds:
> >>>                            task-local   hash-prealloc   hash-no-prealloc
> >>> handle__sched_wakeup             125             340               3124
> >>> handle__sched_wakeup_new        2812            1510               2998
> >>> handle__sched_switch             151             208                991
> >>> Note that, task local storage gives better performance than hashtab for
> >>> handle__sched_wakeup and handle__sched_switch. On the other hand, for
> >>> handle__sched_wakeup_new, task local storage is slower than hashtab with
> >>> prealloc. This is because handle__sched_wakeup_new accesses the data for
> >>> the first time, so it has to allocate the data for task local storage.
> >>> Once the initial allocation is done, subsequent accesses, as those in
> >>> handle__sched_wakeup, are much faster with task local storage. If we
> >>> disable hashtab prealloc, task local storage is much faster for all 3
> >>> functions.
> >>> Signed-off-by: Song Liu <songliubraving@fb.com>
> >>> ---
> >>>   tools/bpf/runqslower/runqslower.bpf.c | 26 +++++++++++++++-----------
> >>>   1 file changed, 15 insertions(+), 11 deletions(-)
> >>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
> >>> index 1f18a409f0443..c4de4179a0a17 100644
> >>> --- a/tools/bpf/runqslower/runqslower.bpf.c
> >>> +++ b/tools/bpf/runqslower/runqslower.bpf.c
> >>> @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
> >>>   const volatile pid_t targ_pid = 0;
> >>>     struct {
> >>> -   __uint(type, BPF_MAP_TYPE_HASH);
> >>> -   __uint(max_entries, 10240);
> >>> -   __type(key, u32);
> >>> +   __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> >>> +   __uint(map_flags, BPF_F_NO_PREALLOC);
> >>> +   __type(key, int);
> >>>     __type(value, u64);
> >>>   } start SEC(".maps");
> >>>   @@ -25,15 +25,19 @@ struct {
> >>>     /* record enqueue timestamp */
> >>>   __always_inline
> >>> -static int trace_enqueue(u32 tgid, u32 pid)
> >>> +static int trace_enqueue(struct task_struct *t)
> >>>   {
> >>> -   u64 ts;
> >>> +   u32 pid = t->pid;
> >>> +   u64 ts, *ptr;
> >>>             if (!pid || (targ_pid && targ_pid != pid))
> >>>             return 0;
> >>>             ts = bpf_ktime_get_ns();
> >>> -   bpf_map_update_elem(&start, &pid, &ts, 0);
> >>> +   ptr = bpf_task_storage_get(&start, t, 0,
> >>> +                              BPF_LOCAL_STORAGE_GET_F_CREATE);
> >>> +   if (ptr)
> >>> +           *ptr = ts;
> >>>     return 0;
> >>>   }
> >>>   @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx)
> >>>     /* TP_PROTO(struct task_struct *p) */
> >>>     struct task_struct *p = (void *)ctx[0];
> >>>   - return trace_enqueue(p->tgid, p->pid);
> >>> +   return trace_enqueue(p);
> >>>   }
> >>>     SEC("tp_btf/sched_wakeup_new")
> >>> @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx)
> >>>     /* TP_PROTO(struct task_struct *p) */
> >>>     struct task_struct *p = (void *)ctx[0];
> >>>   - return trace_enqueue(p->tgid, p->pid);
> >>> +   return trace_enqueue(p);
> >>>   }
> >>>     SEC("tp_btf/sched_switch")
> >>> @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx)
> >>>             /* ivcsw: treat like an enqueue event and store timestamp */
> >>>     if (prev->state == TASK_RUNNING)
> >>> -           trace_enqueue(prev->tgid, prev->pid);
> >>> +           trace_enqueue(prev);
> >>>             pid = next->pid;
> >>>             /* fetch timestamp and calculate delta */
> >>> -   tsp = bpf_map_lookup_elem(&start, &pid);
> >>> +   tsp = bpf_task_storage_get(&start, next, 0, 0);
> >>>     if (!tsp)
> >>>             return 0;   /* missed enqueue */
> >>
> >> Previously, hash table may overflow so we may have missed enqueue.
> >> Here with task local storage, is it possible to add additional pid
> >> filtering in the beginning of handle__sched_switch such that
> >> missed enqueue here can be treated as an error?
> >
> > IIUC, hashtab overflow is not the only reason of missed enqueue. If the
> > wakeup (which calls trace_enqueue) happens before runqslower starts, we
> > may still get missed enqueue in sched_switch, no?
>
> the wakeup won't happen before runqslower starts since runqslower needs
> to start to do attachment first and then trace_enqueue() can run.

I think Song is right. Given wakeup and sched_switch need to be
matched, depending at which exact time we attach BPF programs, we can
end up missing wakeup, but not missing sched_switch, no? So it's not
an error.

>
> For the current implementation trace_enqueue() will happen for any non-0
> pid before setting test_progs tgid, and will happen for any non-0 and
> test_progs tgid if it is set, so this should be okay if we do filtering
> in handle__sched_switch. Maybe you can do an experiment to prove whether
> my point is correct or not.
>
> >
> > Thanks,
> > Song
> >

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
  2021-01-12  7:14         ` Andrii Nakryiko
@ 2021-01-12  7:33           ` Yonghong Song
  0 siblings, 0 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-12  7:33 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Song Liu, bpf, Networking, linux-kernel, mingo, peterz, ast,
	daniel, andrii, john.fastabend, kpsingh, Kernel Team, haoluo



On 1/11/21 11:14 PM, Andrii Nakryiko wrote:
> On Mon, Jan 11, 2021 at 7:24 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 1/11/21 2:54 PM, Song Liu wrote:
>>>
>>>
>>>> On Jan 11, 2021, at 9:49 AM, Yonghong Song <yhs@fb.com> wrote:
>>>>
>>>>
>>>>
>>>> On 1/8/21 3:19 PM, Song Liu wrote:
>>>>> Replace hashtab with task local storage in runqslower. This improves the
>>>>> performance of these BPF programs. The following table summarizes average
>>>>> runtime of these programs, in nanoseconds:
>>>>>                             task-local   hash-prealloc   hash-no-prealloc
>>>>> handle__sched_wakeup             125             340               3124
>>>>> handle__sched_wakeup_new        2812            1510               2998
>>>>> handle__sched_switch             151             208                991
>>>>> Note that, task local storage gives better performance than hashtab for
>>>>> handle__sched_wakeup and handle__sched_switch. On the other hand, for
>>>>> handle__sched_wakeup_new, task local storage is slower than hashtab with
>>>>> prealloc. This is because handle__sched_wakeup_new accesses the data for
>>>>> the first time, so it has to allocate the data for task local storage.
>>>>> Once the initial allocation is done, subsequent accesses, as those in
>>>>> handle__sched_wakeup, are much faster with task local storage. If we
>>>>> disable hashtab prealloc, task local storage is much faster for all 3
>>>>> functions.
>>>>> Signed-off-by: Song Liu <songliubraving@fb.com>
>>>>> ---
>>>>>    tools/bpf/runqslower/runqslower.bpf.c | 26 +++++++++++++++-----------
>>>>>    1 file changed, 15 insertions(+), 11 deletions(-)
>>>>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
>>>>> index 1f18a409f0443..c4de4179a0a17 100644
>>>>> --- a/tools/bpf/runqslower/runqslower.bpf.c
>>>>> +++ b/tools/bpf/runqslower/runqslower.bpf.c
>>>>> @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0;
>>>>>    const volatile pid_t targ_pid = 0;
>>>>>      struct {
>>>>> -   __uint(type, BPF_MAP_TYPE_HASH);
>>>>> -   __uint(max_entries, 10240);
>>>>> -   __type(key, u32);
>>>>> +   __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>>>>> +   __uint(map_flags, BPF_F_NO_PREALLOC);
>>>>> +   __type(key, int);
>>>>>      __type(value, u64);
>>>>>    } start SEC(".maps");
>>>>>    @@ -25,15 +25,19 @@ struct {
>>>>>      /* record enqueue timestamp */
>>>>>    __always_inline
>>>>> -static int trace_enqueue(u32 tgid, u32 pid)
>>>>> +static int trace_enqueue(struct task_struct *t)
>>>>>    {
>>>>> -   u64 ts;
>>>>> +   u32 pid = t->pid;
>>>>> +   u64 ts, *ptr;
>>>>>              if (!pid || (targ_pid && targ_pid != pid))
>>>>>              return 0;
>>>>>              ts = bpf_ktime_get_ns();
>>>>> -   bpf_map_update_elem(&start, &pid, &ts, 0);
>>>>> +   ptr = bpf_task_storage_get(&start, t, 0,
>>>>> +                              BPF_LOCAL_STORAGE_GET_F_CREATE);
>>>>> +   if (ptr)
>>>>> +           *ptr = ts;
>>>>>      return 0;
>>>>>    }
>>>>>    @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx)
>>>>>      /* TP_PROTO(struct task_struct *p) */
>>>>>      struct task_struct *p = (void *)ctx[0];
>>>>>    - return trace_enqueue(p->tgid, p->pid);
>>>>> +   return trace_enqueue(p);
>>>>>    }
>>>>>      SEC("tp_btf/sched_wakeup_new")
>>>>> @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx)
>>>>>      /* TP_PROTO(struct task_struct *p) */
>>>>>      struct task_struct *p = (void *)ctx[0];
>>>>>    - return trace_enqueue(p->tgid, p->pid);
>>>>> +   return trace_enqueue(p);
>>>>>    }
>>>>>      SEC("tp_btf/sched_switch")
>>>>> @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx)
>>>>>              /* ivcsw: treat like an enqueue event and store timestamp */
>>>>>      if (prev->state == TASK_RUNNING)
>>>>> -           trace_enqueue(prev->tgid, prev->pid);
>>>>> +           trace_enqueue(prev);
>>>>>              pid = next->pid;
>>>>>              /* fetch timestamp and calculate delta */
>>>>> -   tsp = bpf_map_lookup_elem(&start, &pid);
>>>>> +   tsp = bpf_task_storage_get(&start, next, 0, 0);
>>>>>      if (!tsp)
>>>>>              return 0;   /* missed enqueue */
>>>>
>>>> Previously, hash table may overflow so we may have missed enqueue.
>>>> Here with task local storage, is it possible to add additional pid
>>>> filtering in the beginning of handle__sched_switch such that
>>>> missed enqueue here can be treated as an error?
>>>
>>> IIUC, hashtab overflow is not the only reason of missed enqueue. If the
>>> wakeup (which calls trace_enqueue) happens before runqslower starts, we
>>> may still get missed enqueue in sched_switch, no?
>>
>> the wakeup won't happen before runqslower starts since runqslower needs
>> to start to do attachment first and then trace_enqueue() can run.
> 
> I think Song is right. Given wakeup and sched_switch need to be
> matched, depending at which exact time we attach BPF programs, we can
> end up missing wakeup, but not missing sched_switch, no? So it's not
> an error.

The current approach works fine. What I suggested is to
tighten sched_switch only for target_pid. wakeup (doing queuing) will
be more relaxed than sched_switch to ensure task local storage creation
is always there for target_pid regardless of attachment timing.
I think it should work, but we have to experiment to see actual
results...

> 
>>
>> For the current implementation trace_enqueue() will happen for any non-0
>> pid before setting test_progs tgid, and will happen for any non-0 and
>> test_progs tgid if it is set, so this should be okay if we do filtering
>> in handle__sched_switch. Maybe you can do an experiment to prove whether
>> my point is correct or not.
>>
>>>
>>> Thanks,
>>> Song
>>>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 23:45         ` Song Liu
@ 2021-01-12 16:32           ` Yonghong Song
  2021-01-12 16:53             ` KP Singh
  0 siblings, 1 reply; 29+ messages in thread
From: Yonghong Song @ 2021-01-12 16:32 UTC (permalink / raw)
  To: Song Liu, Martin Lau
  Cc: KP Singh, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot



On 1/11/21 3:45 PM, Song Liu wrote:
> 
> 
>> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
>>
>> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>>
>>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>>>
>>>> [ ... ]
>>>>
>>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>>> +++ b/kernel/bpf/bpf_local_storage.c
>>>>> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
>>>>> {
>>>>>       struct bpf_local_storage *local_storage;
>>>>>       bool free_local_storage = false;
>>>>> +     unsigned long flags;
>>>>>
>>>>>       if (unlikely(!selem_linked_to_storage(selem)))
>>>>>               /* selem has already been unlinked from sk */
>>>>>               return;
>>>>>
>>>>>       local_storage = rcu_dereference(selem->local_storage);
>>>>> -     raw_spin_lock_bh(&local_storage->lock);
>>>>> +     raw_spin_lock_irqsave(&local_storage->lock, flags);
>>>> It will be useful to have a few words in commit message on this change
>>>> for future reference purpose.
>>>>
>>>> Please also remove the in_irq() check from bpf_sk_storage.c
>>>> to avoid confusion in the future.  It probably should
>>>> be in a separate patch.
>>>>
>>>> [ ... ]
>>>>
>>>>> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
>>>>> index 4ef1959a78f27..f654b56907b69 100644
>>>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>>>> index 7425b3224891d..3d65c8ebfd594 100644
>>>> [ ... ]
>>>>
>>>>> --- a/kernel/fork.c
>>>>> +++ b/kernel/fork.c
>>>>> @@ -96,6 +96,7 @@
>>>>> #include <linux/kasan.h>
>>>>> #include <linux/scs.h>
>>>>> #include <linux/io_uring.h>
>>>>> +#include <linux/bpf.h>
>>>>>
>>>>> #include <asm/pgalloc.h>
>>>>> #include <linux/uaccess.h>
>>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>>       cgroup_free(tsk);
>>>>>       task_numa_free(tsk, true);
>>>>>       security_task_free(tsk);
>>>>> +     bpf_task_storage_free(tsk);
>>>>>       exit_creds(tsk);
>>>> If exit_creds() is traced by a bpf and this bpf is doing
>>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>>> new task storage will be created after bpf_task_storage_free().
>>>>
>>>> I recalled there was an earlier discussion with KP and KP mentioned
>>>> BPF_LSM will not be called with a task that is going away.
>>>> It seems enabling bpf task storage in bpf tracing will break
>>>> this assumption and needs to be addressed?
>>>
>>> For tracing programs, I think we will need an allow list where
>>> task local storage can be used.
>> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
> 
> I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
> 
> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
> index f654b56907b69..93d01b0a010e6 100644
> --- i/kernel/bpf/bpf_task_storage.c
> +++ w/kernel/bpf/bpf_task_storage.c
> @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>           * by an RCU read-side critical section.
>           */
>          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> +               if (!refcount_inc_not_zero(&task->usage))
> +                       return -EBUSY;
> +
>                  sdata = bpf_local_storage_update(
>                          task, (struct bpf_local_storage_map *)map, value,
>                          BPF_NOEXIST);
> 
> But where shall we add the refcount_dec()? IIUC, we cannot add it to
> __put_task_struct().

Maybe put_task_struct()?

> Thanks,
> Song
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-12 16:32           ` Yonghong Song
@ 2021-01-12 16:53             ` KP Singh
  2021-01-15 23:34               ` Song Liu
  0 siblings, 1 reply; 29+ messages in thread
From: KP Singh @ 2021-01-12 16:53 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Song Liu, Martin Lau, bpf, Networking, open list, mingo,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Kernel Team, Hao Luo,
	kernel test robot

On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 1/11/21 3:45 PM, Song Liu wrote:
> >
> >
> >> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
> >>
> >> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
> >>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >>>>
> >>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> >>>>
> >>>> [ ... ]
> >>>>
> >>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> >>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
> >>>>> --- a/kernel/bpf/bpf_local_storage.c
> >>>>> +++ b/kernel/bpf/bpf_local_storage.c

[...]

> >>>>> +#include <linux/bpf.h>
> >>>>>
> >>>>> #include <asm/pgalloc.h>
> >>>>> #include <linux/uaccess.h>
> >>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
> >>>>>       cgroup_free(tsk);
> >>>>>       task_numa_free(tsk, true);
> >>>>>       security_task_free(tsk);
> >>>>> +     bpf_task_storage_free(tsk);
> >>>>>       exit_creds(tsk);
> >>>> If exit_creds() is traced by a bpf and this bpf is doing
> >>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
> >>>> new task storage will be created after bpf_task_storage_free().
> >>>>
> >>>> I recalled there was an earlier discussion with KP and KP mentioned
> >>>> BPF_LSM will not be called with a task that is going away.
> >>>> It seems enabling bpf task storage in bpf tracing will break
> >>>> this assumption and needs to be addressed?
> >>>
> >>> For tracing programs, I think we will need an allow list where
> >>> task local storage can be used.
> >> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
> >
> > I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
> >
> > diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
> > index f654b56907b69..93d01b0a010e6 100644
> > --- i/kernel/bpf/bpf_task_storage.c
> > +++ w/kernel/bpf/bpf_task_storage.c
> > @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> >           * by an RCU read-side critical section.
> >           */
> >          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> > +               if (!refcount_inc_not_zero(&task->usage))
> > +                       return -EBUSY;
> > +
> >                  sdata = bpf_local_storage_update(
> >                          task, (struct bpf_local_storage_map *)map, value,
> >                          BPF_NOEXIST);
> >
> > But where shall we add the refcount_dec()? IIUC, we cannot add it to
> > __put_task_struct().
>
> Maybe put_task_struct()?

Yeah, something like this, or if you find a more elegant alternative :)

--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t);

 static inline void put_task_struct(struct task_struct *t)
 {
-       if (refcount_dec_and_test(&t->usage))
+
+       if (rcu_access_pointer(t->bpf_storage)) {
+               if (refcount_sub_and_test(2, &t->usage))
+                       __put_task_struct(t);
+       } else if (refcount_dec_and_test(&t->usage))
                __put_task_struct(t);
 }

 static inline void put_task_struct_many(struct task_struct *t, int nr)
 {
-       if (refcount_sub_and_test(nr, &t->usage))
+       if (rcu_access_pointer(t->bpf_storage)) {
+               if (refcount_sub_and_test(nr + 1, &t->usage))
+                       __put_task_struct(t);
+       } else if (refcount_sub_and_test(nr, &t->usage))
                __put_task_struct(t);
 }


I may be missing something but shouldn't bpf_storage be an __rcu
member like we have for sk_bpf_storage?

#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif


>
> > Thanks,
> > Song
> >

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-11 23:41     ` Song Liu
@ 2021-01-12 18:21       ` Martin KaFai Lau
  0 siblings, 0 replies; 29+ messages in thread
From: Martin KaFai Lau @ 2021-01-12 18:21 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Networking, open list, mingo, peterz, ast, daniel, andrii,
	john.fastabend, kpsingh, Kernel Team, haoluo, kernel test robot

On Mon, Jan 11, 2021 at 03:41:26PM -0800, Song Liu wrote:
> 
> 
> > On Jan 11, 2021, at 10:56 AM, Martin Lau <kafai@fb.com> wrote:
> > 
> > On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
> > 
> > [ ... ]
> > 
> >> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> >> index dd5aedee99e73..9bd47ad2b26f1 100644
> >> --- a/kernel/bpf/bpf_local_storage.c
> >> +++ b/kernel/bpf/bpf_local_storage.c
> >> @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
> >> {
> >> 	struct bpf_local_storage *local_storage;
> >> 	bool free_local_storage = false;
> >> +	unsigned long flags;
> >> 
> >> 	if (unlikely(!selem_linked_to_storage(selem)))
> >> 		/* selem has already been unlinked from sk */
> >> 		return;
> >> 
> >> 	local_storage = rcu_dereference(selem->local_storage);
> >> -	raw_spin_lock_bh(&local_storage->lock);
> >> +	raw_spin_lock_irqsave(&local_storage->lock, flags);
> > It will be useful to have a few words in commit message on this change
> > for future reference purpose.
> > 
> > Please also remove the in_irq() check from bpf_sk_storage.c
> > to avoid confusion in the future.  It probably should
> > be in a separate patch.
> 
> Do you mean we allow bpf_sk_storage_get_tracing() and 
> bpf_sk_storage_delete_tracing() in irq context? Like
Right.

However, on second thought, maybe let's skip that for now
until a use case comes up and a test can be written.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-12 16:53             ` KP Singh
@ 2021-01-15 23:34               ` Song Liu
  2021-01-16  0:55                 ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: Song Liu @ 2021-01-15 23:34 UTC (permalink / raw)
  To: KP Singh
  Cc: Yonghong Song, Martin Lau, bpf, Networking, open list, mingo,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Kernel Team, Hao Luo,
	kernel test robot



> On Jan 12, 2021, at 8:53 AM, KP Singh <kpsingh@kernel.org> wrote:
> 
> On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song <yhs@fb.com> wrote:
>> 
>> 
>> 
>> On 1/11/21 3:45 PM, Song Liu wrote:
>>> 
>>> 
>>>> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
>>>> 
>>>> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>>>>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>>>> 
>>>>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>>>>> 
>>>>>> [ ... ]
>>>>>> 
>>>>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>>>>> +++ b/kernel/bpf/bpf_local_storage.c
> 
> [...]
> 
>>>>>>> +#include <linux/bpf.h>
>>>>>>> 
>>>>>>> #include <asm/pgalloc.h>
>>>>>>> #include <linux/uaccess.h>
>>>>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>>>>      cgroup_free(tsk);
>>>>>>>      task_numa_free(tsk, true);
>>>>>>>      security_task_free(tsk);
>>>>>>> +     bpf_task_storage_free(tsk);
>>>>>>>      exit_creds(tsk);
>>>>>> If exit_creds() is traced by a bpf and this bpf is doing
>>>>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>>>>> new task storage will be created after bpf_task_storage_free().
>>>>>> 
>>>>>> I recalled there was an earlier discussion with KP and KP mentioned
>>>>>> BPF_LSM will not be called with a task that is going away.
>>>>>> It seems enabling bpf task storage in bpf tracing will break
>>>>>> this assumption and needs to be addressed?
>>>>> 
>>>>> For tracing programs, I think we will need an allow list where
>>>>> task local storage can be used.
>>>> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
>>> 
>>> I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
>>> 
>>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>>> index f654b56907b69..93d01b0a010e6 100644
>>> --- i/kernel/bpf/bpf_task_storage.c
>>> +++ w/kernel/bpf/bpf_task_storage.c
>>> @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>>          * by an RCU read-side critical section.
>>>          */
>>>         if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>>> +               if (!refcount_inc_not_zero(&task->usage))
>>> +                       return -EBUSY;
>>> +
>>>                 sdata = bpf_local_storage_update(
>>>                         task, (struct bpf_local_storage_map *)map, value,
>>>                         BPF_NOEXIST);
>>> 
>>> But where shall we add the refcount_dec()? IIUC, we cannot add it to
>>> __put_task_struct().
>> 
>> Maybe put_task_struct()?
> 
> Yeah, something like, or if you find a more elegant alternative :)
> 
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t);
> 
> static inline void put_task_struct(struct task_struct *t)
> {
> -       if (refcount_dec_and_test(&t->usage))
> +
> +       if (rcu_access_pointer(t->bpf_storage)) {
> +               if (refcount_sub_and_test(2, &t->usage))
> +                       __put_task_struct(t);
> +       } else if (refcount_dec_and_test(&t->usage))
>                __put_task_struct(t);
> }
> 
> static inline void put_task_struct_many(struct task_struct *t, int nr)
> {
> -       if (refcount_sub_and_test(nr, &t->usage))
> +       if (rcu_access_pointer(t->bpf_storage)) {
> +               if (refcount_sub_and_test(nr + 1, &t->usage))
> +                       __put_task_struct(t);
> +       } else if (refcount_sub_and_test(nr, &t->usage))
>                __put_task_struct(t);
> }

It is not ideal to leak bpf_storage here. How about we only add the
following:

diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
index f654b56907b69..2811b9fc47233 100644
--- i/kernel/bpf/bpf_task_storage.c
+++ w/kernel/bpf/bpf_task_storage.c
@@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
         * by an RCU read-side critical section.
         */
        if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+               /* the task_struct is being freed, fail over */
+               if (!refcount_read(&task->usage))
+                       return -EBUSY;
+
                sdata = bpf_local_storage_update(
                        task, (struct bpf_local_storage_map *)map, value,
                        BPF_NOEXIST);

> 
> 
> I may be missing something but shouldn't bpf_storage be an __rcu
> member like we have for sk_bpf_storage?

Good catch! I will fix this in v2. 

Thanks,
Song

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-15 23:34               ` Song Liu
@ 2021-01-16  0:55                 ` Yonghong Song
  2021-01-16  1:12                   ` Song Liu
  0 siblings, 1 reply; 29+ messages in thread
From: Yonghong Song @ 2021-01-16  0:55 UTC (permalink / raw)
  To: Song Liu, KP Singh
  Cc: Martin Lau, bpf, Networking, open list, mingo, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Kernel Team, Hao Luo, kernel test robot



On 1/15/21 3:34 PM, Song Liu wrote:
> 
> 
>> On Jan 12, 2021, at 8:53 AM, KP Singh <kpsingh@kernel.org> wrote:
>>
>> On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song <yhs@fb.com> wrote:
>>>
>>>
>>>
>>> On 1/11/21 3:45 PM, Song Liu wrote:
>>>>
>>>>
>>>>> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
>>>>>
>>>>> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>>>>>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>>>>>
>>>>>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>>>>>>
>>>>>>> [ ... ]
>>>>>>>
>>>>>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>>>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>>>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>>>>>> +++ b/kernel/bpf/bpf_local_storage.c
>>
>> [...]
>>
>>>>>>>> +#include <linux/bpf.h>
>>>>>>>>
>>>>>>>> #include <asm/pgalloc.h>
>>>>>>>> #include <linux/uaccess.h>
>>>>>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>>>>>       cgroup_free(tsk);
>>>>>>>>       task_numa_free(tsk, true);
>>>>>>>>       security_task_free(tsk);
>>>>>>>> +     bpf_task_storage_free(tsk);
>>>>>>>>       exit_creds(tsk);
>>>>>>> If exit_creds() is traced by a bpf and this bpf is doing
>>>>>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>>>>>> new task storage will be created after bpf_task_storage_free().
>>>>>>>
>>>>>>> I recalled there was an earlier discussion with KP and KP mentioned
>>>>>>> BPF_LSM will not be called with a task that is going away.
>>>>>>> It seems enabling bpf task storage in bpf tracing will break
>>>>>>> this assumption and needs to be addressed?
>>>>>>
>>>>>> For tracing programs, I think we will need an allow list where
>>>>>> task local storage can be used.
>>>>> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
>>>>
>>>> I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
>>>>
>>>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>>>> index f654b56907b69..93d01b0a010e6 100644
>>>> --- i/kernel/bpf/bpf_task_storage.c
>>>> +++ w/kernel/bpf/bpf_task_storage.c
>>>> @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>>>           * by an RCU read-side critical section.
>>>>           */
>>>>          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>>>> +               if (!refcount_inc_not_zero(&task->usage))
>>>> +                       return -EBUSY;
>>>> +
>>>>                  sdata = bpf_local_storage_update(
>>>>                          task, (struct bpf_local_storage_map *)map, value,
>>>>                          BPF_NOEXIST);
>>>>
>>>> But where shall we add the refcount_dec()? IIUC, we cannot add it to
>>>> __put_task_struct().
>>>
>>> Maybe put_task_struct()?
>>
>> Yeah, something like, or if you find a more elegant alternative :)
>>
>> --- a/include/linux/sched/task.h
>> +++ b/include/linux/sched/task.h
>> @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t);
>>
>> static inline void put_task_struct(struct task_struct *t)
>> {
>> -       if (refcount_dec_and_test(&t->usage))
>> +
>> +       if (rcu_access_pointer(t->bpf_storage)) {
>> +               if (refcount_sub_and_test(2, &t->usage))
>> +                       __put_task_struct(t);
>> +       } else if (refcount_dec_and_test(&t->usage))
>>                 __put_task_struct(t);
>> }
>>
>> static inline void put_task_struct_many(struct task_struct *t, int nr)
>> {
>> -       if (refcount_sub_and_test(nr, &t->usage))
>> +       if (rcu_access_pointer(t->bpf_storage)) {
>> +               if (refcount_sub_and_test(nr + 1, &t->usage))
>> +                       __put_task_struct(t);
>> +       } else if (refcount_sub_and_test(nr, &t->usage))
>>                 __put_task_struct(t);
>> }
> 
> It is not ideal to leak bpf_storage here. How about we only add the
> following:
> 
> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
> index f654b56907b69..2811b9fc47233 100644
> --- i/kernel/bpf/bpf_task_storage.c
> +++ w/kernel/bpf/bpf_task_storage.c
> @@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>           * by an RCU read-side critical section.
>           */
>          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
> +               /* the task_struct is being freed, fail over*/
> +               if (!refcount_read(&task->usage))
> +                       return -EBUSY;

This may not work? Even if we check here and task->usage is not 0, it could 
still become 0 immediately after the above refcount_read, right?

> +
>                  sdata = bpf_local_storage_update(
>                          task, (struct bpf_local_storage_map *)map, value,
>                          BPF_NOEXIST);
> 
>>
>>
>> I may be missing something but shouldn't bpf_storage be an __rcu
>> member like we have for sk_bpf_storage?
> 
> Good catch! I will fix this in v2.
> 
> Thanks,
> Song
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-16  0:55                 ` Yonghong Song
@ 2021-01-16  1:12                   ` Song Liu
  2021-01-16  1:50                     ` Yonghong Song
  0 siblings, 1 reply; 29+ messages in thread
From: Song Liu @ 2021-01-16  1:12 UTC (permalink / raw)
  To: Yonghong Song
  Cc: KP Singh, Martin Lau, bpf, Networking, open list, mingo,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Kernel Team, Hao Luo,
	kernel test robot



> On Jan 15, 2021, at 4:55 PM, Yonghong Song <yhs@fb.com> wrote:
> 
> 
> 
> On 1/15/21 3:34 PM, Song Liu wrote:
>>> On Jan 12, 2021, at 8:53 AM, KP Singh <kpsingh@kernel.org> wrote:
>>> 
>>> On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song <yhs@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>> On 1/11/21 3:45 PM, Song Liu wrote:
>>>>> 
>>>>> 
>>>>>> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
>>>>>> 
>>>>>> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>>>>>>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>>>>>> 
>>>>>>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>>>>>>> 
>>>>>>>> [ ... ]
>>>>>>>> 
>>>>>>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>>>>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>>>>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>>>>>>> +++ b/kernel/bpf/bpf_local_storage.c
>>> 
>>> [...]
>>> 
>>>>>>>>> +#include <linux/bpf.h>
>>>>>>>>> 
>>>>>>>>> #include <asm/pgalloc.h>
>>>>>>>>> #include <linux/uaccess.h>
>>>>>>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>>>>>>      cgroup_free(tsk);
>>>>>>>>>      task_numa_free(tsk, true);
>>>>>>>>>      security_task_free(tsk);
>>>>>>>>> +     bpf_task_storage_free(tsk);
>>>>>>>>>      exit_creds(tsk);
>>>>>>>> If exit_creds() is traced by a bpf and this bpf is doing
>>>>>>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>>>>>>> new task storage will be created after bpf_task_storage_free().
>>>>>>>> 
>>>>>>>> I recalled there was an earlier discussion with KP and KP mentioned
>>>>>>>> BPF_LSM will not be called with a task that is going away.
>>>>>>>> It seems enabling bpf task storage in bpf tracing will break
>>>>>>>> this assumption and needs to be addressed?
>>>>>>> 
>>>>>>> For tracing programs, I think we will need an allow list where
>>>>>>> task local storage can be used.
>>>>>> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
>>>>> 
>>>>> I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
>>>>> 
>>>>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>>>>> index f654b56907b69..93d01b0a010e6 100644
>>>>> --- i/kernel/bpf/bpf_task_storage.c
>>>>> +++ w/kernel/bpf/bpf_task_storage.c
>>>>> @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>>>>          * by an RCU read-side critical section.
>>>>>          */
>>>>>         if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>>>>> +               if (!refcount_inc_not_zero(&task->usage))
>>>>> +                       return -EBUSY;
>>>>> +
>>>>>                 sdata = bpf_local_storage_update(
>>>>>                         task, (struct bpf_local_storage_map *)map, value,
>>>>>                         BPF_NOEXIST);
>>>>> 
>>>>> But where shall we add the refcount_dec()? IIUC, we cannot add it to
>>>>> __put_task_struct().
>>>> 
>>>> Maybe put_task_struct()?
>>> 
>>> Yeah, something like, or if you find a more elegant alternative :)
>>> 
>>> --- a/include/linux/sched/task.h
>>> +++ b/include/linux/sched/task.h
>>> @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t);
>>> 
>>> static inline void put_task_struct(struct task_struct *t)
>>> {
>>> -       if (refcount_dec_and_test(&t->usage))
>>> +
>>> +       if (rcu_access_pointer(t->bpf_storage)) {
>>> +               if (refcount_sub_and_test(2, &t->usage))
>>> +                       __put_task_struct(t);
>>> +       } else if (refcount_dec_and_test(&t->usage))
>>>                __put_task_struct(t);
>>> }
>>> 
>>> static inline void put_task_struct_many(struct task_struct *t, int nr)
>>> {
>>> -       if (refcount_sub_and_test(nr, &t->usage))
>>> +       if (rcu_access_pointer(t->bpf_storage)) {
>>> +               if (refcount_sub_and_test(nr + 1, &t->usage))
>>> +                       __put_task_struct(t);
>>> +       } else if (refcount_sub_and_test(nr, &t->usage))
>>>                __put_task_struct(t);
>>> }
>> It is not ideal to leak bpf_storage here. How about we only add the
>> following:
>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>> index f654b56907b69..2811b9fc47233 100644
>> --- i/kernel/bpf/bpf_task_storage.c
>> +++ w/kernel/bpf/bpf_task_storage.c
>> @@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>          * by an RCU read-side critical section.
>>          */
>>         if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>> +               /* the task_struct is being freed, fail over*/
>> +               if (!refcount_read(&task->usage))
>> +                       return -EBUSY;
> 
> This may not work? Even we check here and task->usage is not 0, it could still become 0 immediately after the above refcount_read, right?

We call bpf_task_storage_get() with a "task" that has valid BTF, so "task"
should not go away during the BPF program? Whatever mechanism
triggers the BPF program should either hold a reference to the task (usage > 0)
or be the only one owning it (usage == 0, in __put_task_struct). Did I miss
anything?

Thanks,
Song

> 
>> +
>>                 sdata = bpf_local_storage_update(
>>                         task, (struct bpf_local_storage_map *)map, value,
>>                         BPF_NOEXIST);
>>> 
>>> 
>>> I may be missing something but shouldn't bpf_storage be an __rcu
>>> member like we have for sk_bpf_storage?
>> Good catch! I will fix this in v2.
>> Thanks,
>> Song


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
  2021-01-16  1:12                   ` Song Liu
@ 2021-01-16  1:50                     ` Yonghong Song
  0 siblings, 0 replies; 29+ messages in thread
From: Yonghong Song @ 2021-01-16  1:50 UTC (permalink / raw)
  To: Song Liu
  Cc: KP Singh, Martin Lau, bpf, Networking, open list, mingo,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Kernel Team, Hao Luo,
	kernel test robot



On 1/15/21 5:12 PM, Song Liu wrote:
> 
> 
>> On Jan 15, 2021, at 4:55 PM, Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 1/15/21 3:34 PM, Song Liu wrote:
>>>> On Jan 12, 2021, at 8:53 AM, KP Singh <kpsingh@kernel.org> wrote:
>>>>
>>>> On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song <yhs@fb.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 1/11/21 3:45 PM, Song Liu wrote:
>>>>>>
>>>>>>
>>>>>>> On Jan 11, 2021, at 1:58 PM, Martin Lau <kafai@fb.com> wrote:
>>>>>>>
>>>>>>> On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote:
>>>>>>>> On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>>>>>>>>
>>>>>>>>> On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote:
>>>>>>>>>
>>>>>>>>> [ ... ]
>>>>>>>>>
>>>>>>>>>> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
>>>>>>>>>> index dd5aedee99e73..9bd47ad2b26f1 100644
>>>>>>>>>> --- a/kernel/bpf/bpf_local_storage.c
>>>>>>>>>> +++ b/kernel/bpf/bpf_local_storage.c
>>>>
>>>> [...]
>>>>
>>>>>>>>>> +#include <linux/bpf.h>
>>>>>>>>>>
>>>>>>>>>> #include <asm/pgalloc.h>
>>>>>>>>>> #include <linux/uaccess.h>
>>>>>>>>>> @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
>>>>>>>>>>       cgroup_free(tsk);
>>>>>>>>>>       task_numa_free(tsk, true);
>>>>>>>>>>       security_task_free(tsk);
>>>>>>>>>> +     bpf_task_storage_free(tsk);
>>>>>>>>>>       exit_creds(tsk);
>>>>>>>>> If exit_creds() is traced by a bpf and this bpf is doing
>>>>>>>>> bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE),
>>>>>>>>> new task storage will be created after bpf_task_storage_free().
>>>>>>>>>
>>>>>>>>> I recalled there was an earlier discussion with KP and KP mentioned
>>>>>>>>> BPF_LSM will not be called with a task that is going away.
>>>>>>>>> It seems enabling bpf task storage in bpf tracing will break
>>>>>>>>> this assumption and needs to be addressed?
>>>>>>>>
>>>>>>>> For tracing programs, I think we will need an allow list where
>>>>>>>> task local storage can be used.
>>>>>>> Instead of whitelist, can refcount_inc_not_zero(&tsk->usage) be used?
>>>>>>
>>>>>> I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like:
>>>>>>
>>>>>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>>>>>> index f654b56907b69..93d01b0a010e6 100644
>>>>>> --- i/kernel/bpf/bpf_task_storage.c
>>>>>> +++ w/kernel/bpf/bpf_task_storage.c
>>>>>> @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>>>>>           * by an RCU read-side critical section.
>>>>>>           */
>>>>>>          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>>>>>> +               if (!refcount_inc_not_zero(&task->usage))
>>>>>> +                       return -EBUSY;
>>>>>> +
>>>>>>                  sdata = bpf_local_storage_update(
>>>>>>                          task, (struct bpf_local_storage_map *)map, value,
>>>>>>                          BPF_NOEXIST);
>>>>>>
>>>>>> But where shall we add the refcount_dec()? IIUC, we cannot add it to
>>>>>> __put_task_struct().
>>>>>
>>>>> Maybe put_task_struct()?
>>>>
>>>> Yeah, something like, or if you find a more elegant alternative :)
>>>>
>>>> --- a/include/linux/sched/task.h
>>>> +++ b/include/linux/sched/task.h
>>>> @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t);
>>>>
>>>> static inline void put_task_struct(struct task_struct *t)
>>>> {
>>>> -       if (refcount_dec_and_test(&t->usage))
>>>> +
>>>> +       if (rcu_access_pointer(t->bpf_storage)) {
>>>> +               if (refcount_sub_and_test(2, &t->usage))
>>>> +                       __put_task_struct(t);
>>>> +       } else if (refcount_dec_and_test(&t->usage))
>>>>                 __put_task_struct(t);
>>>> }
>>>>
>>>> static inline void put_task_struct_many(struct task_struct *t, int nr)
>>>> {
>>>> -       if (refcount_sub_and_test(nr, &t->usage))
>>>> +       if (rcu_access_pointer(t->bpf_storage)) {
>>>> +               if (refcount_sub_and_test(nr + 1, &t->usage))
>>>> +                       __put_task_struct(t);
>>>> +       } else if (refcount_sub_and_test(nr, &t->usage))
>>>>                 __put_task_struct(t);
>>>> }
>>> It is not ideal to leak bpf_storage here. How about we only add the
>>> following:
>>> diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c
>>> index f654b56907b69..2811b9fc47233 100644
>>> --- i/kernel/bpf/bpf_task_storage.c
>>> +++ w/kernel/bpf/bpf_task_storage.c
>>> @@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>>>           * by an RCU read-side critical section.
>>>           */
>>>          if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>>> +               /* the task_struct is being freed, fail over*/
>>> +               if (!refcount_read(&task->usage))
>>> +                       return -EBUSY;
>>
>> This may not work? Even we check here and task->usage is not 0, it could still become 0 immediately after the above refcount_read, right?
> 
> We call bpf_task_storage_get() with "task" that has valid BTF, so "task"
> should not go away during the BPF program? Whatever mechanism that

Oh, right. this is true. Otherwise, we cannot use task ptr in the helper.

> triggers the BPF program should either hold a reference to task (usage > 0)
> or be the only one owning it (usage == 0, in __put_task_struct). Did I miss
> anything?

Sorry. I think you are right. Not sure about the lsm requirement. There are
two more possible ways to check whether a task is exiting, which happens
before __put_task_struct():
   . check task->exit_state
   . check task->flags & PF_EXITING (used in bpf_trace.c)

Not sure which condition is the correct one to check.

> 
> Thanks,
> Song
> 
>>
>>> +
>>>                  sdata = bpf_local_storage_update(
>>>                          task, (struct bpf_local_storage_map *)map, value,
>>>                          BPF_NOEXIST);
>>>>
>>>>
>>>> I may be missing something but shouldn't bpf_storage be an __rcu
>>>> member like we have for sk_bpf_storage?
>>> Good catch! I will fix this in v2.
>>> Thanks,
>>> Song
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, back to index

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20210108231950.3844417-1-songliubraving@fb.com>
     [not found] ` <20210108231950.3844417-2-songliubraving@fb.com>
2021-01-11  6:27   ` [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs Yonghong Song
2021-01-11 10:17     ` KP Singh
2021-01-11 15:56       ` Yonghong Song
2021-01-11 10:14   ` KP Singh
2021-01-11 23:16     ` Song Liu
2021-01-11 17:16   ` Yonghong Song
2021-01-11 18:56   ` Martin KaFai Lau
2021-01-11 21:35     ` KP Singh
2021-01-11 21:58       ` Martin KaFai Lau
2021-01-11 23:45         ` Song Liu
2021-01-12 16:32           ` Yonghong Song
2021-01-12 16:53             ` KP Singh
2021-01-15 23:34               ` Song Liu
2021-01-16  0:55                 ` Yonghong Song
2021-01-16  1:12                   ` Song Liu
2021-01-16  1:50                     ` Yonghong Song
2021-01-11 23:41     ` Song Liu
2021-01-12 18:21       ` Martin KaFai Lau
     [not found] ` <20210108231950.3844417-4-songliubraving@fb.com>
2021-01-11 17:37   ` [PATCH bpf-next 3/4] bpf: runqslower: prefer use local vmlinux Yonghong Song
     [not found] ` <20210108231950.3844417-5-songliubraving@fb.com>
2021-01-11 17:49   ` [PATCH bpf-next 4/4] bpf: runqslower: use task local storage Yonghong Song
2021-01-11 22:54     ` Song Liu
2021-01-12  3:24       ` Yonghong Song
2021-01-12  7:14         ` Andrii Nakryiko
2021-01-12  7:33           ` Yonghong Song
     [not found] ` <20210108231950.3844417-3-songliubraving@fb.com>
2021-01-11 17:30   ` [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for " Yonghong Song
2021-01-11 17:44     ` KP Singh
2021-01-11 22:50       ` Song Liu
2021-01-11 22:49     ` Song Liu
2021-01-12  7:06   ` Andrii Nakryiko

BPF Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/bpf/0 bpf/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 bpf bpf/ https://lore.kernel.org/bpf \
		bpf@vger.kernel.org
	public-inbox-index bpf

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.bpf


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git