[RFC] bpf: Add helpers to read useful task_struct members
diff mbox series

Message ID 20171103065833.8076-1-sandipan@linux.vnet.ibm.com
State New, archived
Headers show
Series
  • [RFC] bpf: Add helpers to read useful task_struct members
Related show

Commit Message

Sandipan Das Nov. 3, 2017, 6:58 a.m. UTC
For added security, the layout of some structures can be
randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
such structure is task_struct. To build BPF programs, we
use Clang which does not support this feature. So, if we
attempt to read a field of a structure with a randomized
layout within a BPF program, we do not get the expected
value because of incorrect offsets. To observe this, it
is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
enabled because the structure annotations/members added
for this purpose are enough to cause this. So, all kernel
builds are affected.

For example, considering samples/bpf/offwaketime_kern.c,
if we try to print the values of pid and comm inside the
task_struct passed to waker() by adding the following
lines of code at the appropriate place

  char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
  bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));

it is seen that upon rebuilding and running this sample
followed by inspecting /sys/kernel/debug/tracing/trace,
the output looks like the following

                               _-----=> irqs-off
                              / _----=> need-resched
                             | / _---=> hardirq/softirq
                             || / _--=> preempt-depth
                             ||| /     delay
            TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
               | |       |   ||||       |         |
          <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
          <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
 systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =

To avoid this, we add new BPF helpers that read the
correct values for some of the important task_struct
members such as pid, tgid, comm and flags which are
extensively used in BPF-based analysis tools such as
bcc. Since these helpers are built with GCC, they use
the correct offsets when referencing a member.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 include/linux/bpf.h                       |  3 ++
 include/uapi/linux/bpf.h                  | 13 ++++++
 kernel/bpf/core.c                         |  3 ++
 kernel/bpf/helpers.c                      | 75 +++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c                  |  6 +++
 tools/testing/selftests/bpf/bpf_helpers.h |  9 ++++
 6 files changed, 109 insertions(+)

Comments

Alexei Starovoitov Nov. 4, 2017, 9:34 a.m. UTC | #1
On 11/3/17 3:58 PM, Sandipan Das wrote:
> For added security, the layout of some structures can be
> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
> such structure is task_struct. To build BPF programs, we
> use Clang which does not support this feature. So, if we
> attempt to read a field of a structure with a randomized
> layout within a BPF program, we do not get the expected
> value because of incorrect offsets. To observe this, it
> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
> enabled because the structure annotations/members added
> for this purpose are enough to cause this. So, all kernel
> builds are affected.
>
> For example, considering samples/bpf/offwaketime_kern.c,
> if we try to print the values of pid and comm inside the
> task_struct passed to waker() by adding the following
> lines of code at the appropriate place
>
>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>
> it is seen that upon rebuilding and running this sample
> followed by inspecting /sys/kernel/debug/tracing/trace,
> the output looks like the following
>
>                                _-----=> irqs-off
>                               / _----=> need-resched
>                              | / _---=> hardirq/softirq
>                              || / _--=> preempt-depth
>                              ||| /     delay
>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>                | |       |   ||||       |         |
>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =
>
> To avoid this, we add new BPF helpers that read the
> correct values for some of the important task_struct
> members such as pid, tgid, comm and flags which are
> extensively used in BPF-based analysis tools such as
> bcc. Since these helpers are built with GCC, they use
> the correct offsets when referencing a member.
>
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
...
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f90860d1f897..324508d27bd2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -338,6 +338,16 @@ union bpf_attr {
>   *     @skb: pointer to skb
>   *     Return: classid if != 0
>   *
> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
> + *     Return: task->tgid << 32 | task->pid
> + *
> + * int bpf_get_task_comm(struct task_struct *task)
> + *     Stores task->comm into buf
> + *     Return: 0 on success or negative error
> + *
> + * u32 bpf_get_task_flags(struct task_struct *task)
> + *     Return: task->flags
> + *

I don't think it's a solution.
Tracing scripts read other fields too.
Making it work for these 3 fields is a drop in a bucket.
If randomization is used I think we have to accept
that existing bpf scripts won't be usable.
Long term solution is to support 'BPF Type Format' or BTF
(which is old C-Type Format) for kernel data structures,
so bcc scripts wouldn't need to use kernel headers and clang.
The proper offsets will be described in BTF.
We were planning to use it initially to describe map key/value,
but it applies for this case as well.
There will be a tool that will take dwarf from vmlinux and
compress it into BTF. Kernel will also be able to verify
that BTF is a valid BTF.
I'm assuming that gcc randomization plugin produces dwarf
with correct offsets, if not, it would have to be fixed.
Naveen N. Rao Nov. 4, 2017, 5:31 p.m. UTC | #2
Hi Alexei,

Alexei Starovoitov wrote:
> On 11/3/17 3:58 PM, Sandipan Das wrote:
>> For added security, the layout of some structures can be
>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>> such structure is task_struct. To build BPF programs, we
>> use Clang which does not support this feature. So, if we
>> attempt to read a field of a structure with a randomized
>> layout within a BPF program, we do not get the expected
>> value because of incorrect offsets. To observe this, it
>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>> enabled because the structure annotations/members added
>> for this purpose are enough to cause this. So, all kernel
>> builds are affected.
>>
>> For example, considering samples/bpf/offwaketime_kern.c,
>> if we try to print the values of pid and comm inside the
>> task_struct passed to waker() by adding the following
>> lines of code at the appropriate place
>>
>>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>>
>> it is seen that upon rebuilding and running this sample
>> followed by inspecting /sys/kernel/debug/tracing/trace,
>> the output looks like the following
>>
>>                                _-----=> irqs-off
>>                               / _----=> need-resched
>>                              | / _---=> hardirq/softirq
>>                              || / _--=> preempt-depth
>>                              ||| /     delay
>>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>>                | |       |   ||||       |         |
>>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): 
>>  p->pid = 0, p->comm =
>>
>> To avoid this, we add new BPF helpers that read the
>> correct values for some of the important task_struct
>> members such as pid, tgid, comm and flags which are
>> extensively used in BPF-based analysis tools such as
>> bcc. Since these helpers are built with GCC, they use
>> the correct offsets when referencing a member.
>>
>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ...
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index f90860d1f897..324508d27bd2 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -338,6 +338,16 @@ union bpf_attr {
>>   *     @skb: pointer to skb
>>   *     Return: classid if != 0
>>   *
>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>> + *     Return: task->tgid << 32 | task->pid
>> + *
>> + * int bpf_get_task_comm(struct task_struct *task)
>> + *     Stores task->comm into buf
>> + *     Return: 0 on success or negative error
>> + *
>> + * u32 bpf_get_task_flags(struct task_struct *task)
>> + *     Return: task->flags
>> + *
> 
> I don't think it's a solution.
> Tracing scripts read other fields too.
> Making it work for these 3 fields is a drop in a bucket.

Indeed. However...

> If randomization is used I think we have to accept
> that existing bpf scripts won't be usable.

... the actual issue is that randomization isn't necessary for this to 
show up. The annotations added to mark off the structure members results 
in some structure members being moved into an anonymous structure, which 
would then get padded differently. So, *all* kernels since v4.13 are 
affected, afaict.

As such, we wanted to propose this as a short term solution, but I do 
agree that this doesn't solve the real issue.

> Long term solution is to support 'BPF Type Format' or BTF
> (which is old C-Type Format) for kernel data structures,
> so bcc scripts wouldn't need to use kernel headers and clang.
> The proper offsets will be described in BTF.
> We were planning to use it initially to describe map key/value,
> but it applies for this case as well.
> There will be a tool that will take dwarf from vmlinux and
> compress it into BTF. Kernel will also be able to verify
> that BTF is a valid BTF.

This is the first that I've heard about BTF. Can you share more details 
about it, or point me to some place where it has been discussed?

We considered having tools derive the structure offsets from debuginfo, 
but debuginfo may not always be present on production systems. So, it 
isn't clear if having that dependency is fine. I'm not sure how BTF will
be different.

> I'm assuming that gcc randomization plugin produces dwarf
> with correct offsets, if not, it would have to be fixed.

I think the offsets described in dwarf were incorrect with 
CONFIG_GCC_PLUGIN_RANDSTRUCT, but I'll let Sandipan confirm that.


- Naveen
Alexei Starovoitov Nov. 4, 2017, 9:10 p.m. UTC | #3
On 11/5/17 2:31 AM, Naveen N. Rao wrote:
> Hi Alexei,
>
> Alexei Starovoitov wrote:
>> On 11/3/17 3:58 PM, Sandipan Das wrote:
>>> For added security, the layout of some structures can be
>>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>>> such structure is task_struct. To build BPF programs, we
>>> use Clang which does not support this feature. So, if we
>>> attempt to read a field of a structure with a randomized
>>> layout within a BPF program, we do not get the expected
>>> value because of incorrect offsets. To observe this, it
>>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>>> enabled because the structure annotations/members added
>>> for this purpose are enough to cause this. So, all kernel
>>> builds are affected.
>>>
>>> For example, considering samples/bpf/offwaketime_kern.c,
>>> if we try to print the values of pid and comm inside the
>>> task_struct passed to waker() by adding the following
>>> lines of code at the appropriate place
>>>
>>>   char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>>>   bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
>>>
>>> it is seen that upon rebuilding and running this sample
>>> followed by inspecting /sys/kernel/debug/tracing/trace,
>>> the output looks like the following
>>>
>>>                                _-----=> irqs-off
>>>                               / _----=> need-resched
>>>                              | / _---=> hardirq/softirq
>>>                              || / _--=> preempt-depth
>>>                              ||| /     delay
>>>             TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>>>                | |       |   ||||       |         |
>>>           <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>           <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker():
>>> p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>  systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker():
>>>  p->pid = 0, p->comm =
>>>
>>> To avoid this, we add new BPF helpers that read the
>>> correct values for some of the important task_struct
>>> members such as pid, tgid, comm and flags which are
>>> extensively used in BPF-based analysis tools such as
>>> bcc. Since these helpers are built with GCC, they use
>>> the correct offsets when referencing a member.
>>>
>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>> ...
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index f90860d1f897..324508d27bd2 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -338,6 +338,16 @@ union bpf_attr {
>>>   *     @skb: pointer to skb
>>>   *     Return: classid if != 0
>>>   *
>>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>>> + *     Return: task->tgid << 32 | task->pid
>>> + *
>>> + * int bpf_get_task_comm(struct task_struct *task)
>>> + *     Stores task->comm into buf
>>> + *     Return: 0 on success or negative error
>>> + *
>>> + * u32 bpf_get_task_flags(struct task_struct *task)
>>> + *     Return: task->flags
>>> + *
>>
>> I don't think it's a solution.
>> Tracing scripts read other fields too.
>> Making it work for these 3 fields is a drop in a bucket.
>
> Indeed. However...
>
>> If randomization is used I think we have to accept
>> that existing bpf scripts won't be usable.
>
> ... the actual issue is that randomization isn't necessary for this to
> show up. The annotations added to mark off the structure members results
> in some structure members being moved into an anonymous structure, which
> would then get padded differently. So, *all* kernels since v4.13 are
> affected, afaict.

hmm. why would all 4.13+ be affected?
It's just an anonymous struct inside task_struct.
Are you saying that due to clang not adding this 'struct { };' treatment 
to task_struct?
I thought such struct shouldn't change layout.
If it is we need to fix include/linux/compiler-clang.h to do that
anon struct as well.

> As such, we wanted to propose this as a short term solution, but I do
> agree that this doesn't solve the real issue.
>
>> Long term solution is to support 'BPF Type Format' or BTF
>> (which is old C-Type Format) for kernel data structures,
>> so bcc scripts wouldn't need to use kernel headers and clang.
>> The proper offsets will be described in BTF.
>> We were planning to use it initially to describe map key/value,
>> but it applies for this case as well.
>> There will be a tool that will take dwarf from vmlinux and
>> compress it into BTF. Kernel will also be able to verify
>> that BTF is a valid BTF.
>
> This is the first that I've heard about BTF. Can you share more details
> about it, or point me to some place where it has been discussed?
>
> We considered having tools derive the structure offsets from debuginfo,
> but debuginfo may not always be present on production systems. So, it
> isn't clear if having that dependency is fine. I'm not sure how BTF will
> be different.

It was discussed at this year plumbers:
https://lwn.net/Articles/734453/

btw the name BTF is work in progress. We started with CTF, but
it conflicts with all other meanings of this abbreviation.
Likely we will call it something different at the end.

Initial goal was to describe key/map values of bpf maps to
make debugging easier, but now we want to use this compressed
type format for tracing as well, since installing kernel headers
everywhere doesn't scale well while CTF can be embedded in vmlinux

We were also thinking to improve verifier with CTF knowledge too.
Like if CTF describes that map value is two u32, but bpf program
is doing 8-byte access then something is wrong and either warn
or reject such program.
Sandipan Das Nov. 6, 2017, 5:16 a.m. UTC | #4
Hi Alexei, Naveen,

On 11/04/2017 11:01 PM, Naveen N. Rao wrote:
> 
> I think the offsets described in dwarf were incorrect with CONFIG_GCC_PLUGIN_RANDSTRUCT, but I'll let Sandipan confirm that.
> 

I think that the offsets described in dwarf are probably incorrect when
CONFIG_GCC_PLUGIN_RANDSTRUCT is enabled. To verify this, I used perf
to attach a probe to try_to_wake_up() which is the also the function to
which waker() is attached in the previously mentioned kernel sample. So,
if the run the following:

# perf probe "try_to_wake_up" "p->pid"
# perf record -a -e probe:try_to_wake_up
# perf script

The value of p->pid is reported as 0. Similarly, if I try to read
p->comm, it is reported to be an empty string. The same problem is
seen with systemtap as well.

Also, if I do a printk with offsetof(struct task_struct, pid) and
offsetof(struct task_struct, comm) inside the kernel code and then
compare the values with the offsets reported by pahole, they are
completely different.

- Sandipan
Naveen N. Rao Nov. 6, 2017, 3:55 p.m. UTC | #5
Alexei Starovoitov wrote:
> On 11/5/17 2:31 AM, Naveen N. Rao wrote:
>> Hi Alexei,
>>
>> Alexei Starovoitov wrote:
>>> On 11/3/17 3:58 PM, Sandipan Das wrote:
>>>> For added security, the layout of some structures can be
>>>> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
>>>> such structure is task_struct. To build BPF programs, we
>>>> use Clang which does not support this feature. So, if we
>>>> attempt to read a field of a structure with a randomized
>>>> layout within a BPF program, we do not get the expected
>>>> value because of incorrect offsets. To observe this, it
>>>> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
>>>> enabled because the structure annotations/members added
>>>> for this purpose are enough to cause this. So, all kernel
>>>> builds are affected.
>>>>

...

>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index f90860d1f897..324508d27bd2 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -338,6 +338,16 @@ union bpf_attr {
>>>>   *     @skb: pointer to skb
>>>>   *     Return: classid if != 0
>>>>   *
>>>> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
>>>> + *     Return: task->tgid << 32 | task->pid
>>>> + *
>>>> + * int bpf_get_task_comm(struct task_struct *task)
>>>> + *     Stores task->comm into buf
>>>> + *     Return: 0 on success or negative error
>>>> + *
>>>> + * u32 bpf_get_task_flags(struct task_struct *task)
>>>> + *     Return: task->flags
>>>> + *
>>>
>>> I don't think it's a solution.
>>> Tracing scripts read other fields too.
>>> Making it work for these 3 fields is a drop in a bucket.
>>
>> Indeed. However...
>>
>>> If randomization is used I think we have to accept
>>> that existing bpf scripts won't be usable.
>>
>> ... the actual issue is that randomization isn't necessary for this to
>> show up. The annotations added to mark off the structure members results
>> in some structure members being moved into an anonymous structure, which
>> would then get padded differently. So, *all* kernels since v4.13 are
>> affected, afaict.
> 
> hmm. why would all 4.13+ be affected?
> It's just an anonymous struct inside task_struct.
> Are you saying that due to clang not adding this 'struct { };' treatment 
> to task_struct?

Yes, that's what it looked like.

> I thought such struct shouldn't change layout.
> If it is we need to fix include/linux/compiler-clang.h to do that
> anon struct as well.

We considered that, but it looked to be very dependent on the version of 
gcc used to build the kernel. But, this may be a simpler approach for 
the shorter term.

> 
>> As such, we wanted to propose this as a short term solution, but I do
>> agree that this doesn't solve the real issue.
>>
>>> Long term solution is to support 'BPF Type Format' or BTF
>>> (which is old C-Type Format) for kernel data structures,
>>> so bcc scripts wouldn't need to use kernel headers and clang.
>>> The proper offsets will be described in BTF.
>>> We were planning to use it initially to describe map key/value,
>>> but it applies for this case as well.
>>> There will be a tool that will take dwarf from vmlinux and
>>> compress it into BTF. Kernel will also be able to verify
>>> that BTF is a valid BTF.
>>
>> This is the first that I've heard about BTF. Can you share more details
>> about it, or point me to some place where it has been discussed?
>>
>> We considered having tools derive the structure offsets from debuginfo,
>> but debuginfo may not always be present on production systems. So, it
>> isn't clear if having that dependency is fine. I'm not sure how BTF will
>> be different.
> 
> It was discussed at this year plumbers:
> https://lwn.net/Articles/734453/
> 
> btw the name BTF is work in progress. We started with CTF, but
> it conflicts with all other meanings of this abbreviation.
> Likely we will call it something different at the end.
> 
> Initial goal was to describe key/map values of bpf maps to
> make debugging easier, but now we want to use this compressed
> type format for tracing as well, since installing kernel headers
> everywhere doesn't scale well while CTF can be embedded in vmlinux

Makes sense, though I'm curious on how you're planning to have this work
without the kernel headers :)

> 
> We were also thinking to improve verifier with CTF knowledge too.
> Like if CTF describes that map value is two u32, but bpf program
> is doing 8-byte access then something is wrong and either warn
> or reject such program.

Sounds good. I look forward to more details/patches on this front once 
you're ready to share more.

Thanks,
- Naveen
Tushar Dave Nov. 7, 2017, 12:16 a.m. UTC | #6
On 11/02/2017 11:58 PM, Sandipan Das wrote:
> For added security, the layout of some structures can be
> randomized by enabling CONFIG_GCC_PLUGIN_RANDSTRUCT. One
> such structure is task_struct. To build BPF programs, we
> use Clang which does not support this feature. So, if we
> attempt to read a field of a structure with a randomized
> layout within a BPF program, we do not get the expected
> value because of incorrect offsets. To observe this, it
> is not mandatory to have CONFIG_GCC_PLUGIN_RANDSTRUCT
> enabled because the structure annotations/members added
> for this purpose are enough to cause this. So, all kernel
> builds are affected.
> 
> For example, considering samples/bpf/offwaketime_kern.c,
> if we try to print the values of pid and comm inside the
> task_struct passed to waker() by adding the following
> lines of code at the appropriate place
> 
>    char fmt[] = "waker(): p->pid = %u, p->comm = %s\n";
>    bpf_trace_printk(fmt, sizeof(fmt), _(p->pid), _(p->comm));
> 
> it is seen that upon rebuilding and running this sample
> followed by inspecting /sys/kernel/debug/tracing/trace,
> the output looks like the following
> 
>                                 _-----=> irqs-off
>                                / _----=> need-resched
>                               | / _---=> hardirq/softirq
>                               || / _--=> preempt-depth
>                               ||| /     delay
>              TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>                 | |       |   ||||       |         |
>            <idle>-0     [007] d.s.  1883.443594: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [018] d.s.  1883.453588: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [007] d.s.  1883.463584: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [009] d.s.  1883.483586: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [005] d.s.  1883.493583: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [009] d.s.  1883.503583: 0x00000001: waker(): p->pid = 0, p->comm =
>            <idle>-0     [018] d.s.  1883.513578: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627660: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627704: 0x00000001: waker(): p->pid = 0, p->comm =
>   systemd-journal-3140  [003] d...  1883.627723: 0x00000001: waker(): p->pid = 0, p->comm =
> 
> To avoid this, we add new BPF helpers that read the
> correct values for some of the important task_struct
> members such as pid, tgid, comm and flags which are
> extensively used in BPF-based analysis tools such as
> bcc. Since these helpers are built with GCC, they use
> the correct offsets when referencing a member.
Just to add that we were seeing the same issue (but had no clue until 
looked at this patch , thanks). Its easy to reproduce by running bcc 
example task_switch.py where pid (prev_pid) is retrieved from struct 
task_struct and that is always zero. we tried printing other task_struct 
members such as 'comm' and see that as empty string as well.


-Tushar
> 
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ---
>   include/linux/bpf.h                       |  3 ++
>   include/uapi/linux/bpf.h                  | 13 ++++++
>   kernel/bpf/core.c                         |  3 ++
>   kernel/bpf/helpers.c                      | 75 +++++++++++++++++++++++++++++++
>   kernel/trace/bpf_trace.c                  |  6 +++
>   tools/testing/selftests/bpf/bpf_helpers.h |  9 ++++
>   6 files changed, 109 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f1af7d63d678..5993a0f5262b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -418,6 +418,9 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
>   extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
>   extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
>   extern const struct bpf_func_proto bpf_get_current_comm_proto;
> +extern const struct bpf_func_proto bpf_get_task_pid_tgid_proto;
> +extern const struct bpf_func_proto bpf_get_task_comm_proto;
> +extern const struct bpf_func_proto bpf_get_task_flags_proto;
>   extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>   extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>   extern const struct bpf_func_proto bpf_get_stackid_proto;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f90860d1f897..324508d27bd2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -338,6 +338,16 @@ union bpf_attr {
>    *     @skb: pointer to skb
>    *     Return: classid if != 0
>    *
> + * u64 bpf_get_task_pid_tgid(struct task_struct *task)
> + *     Return: task->tgid << 32 | task->pid
> + *
> + * int bpf_get_task_comm(struct task_struct *task)
> + *     Stores task->comm into buf
> + *     Return: 0 on success or negative error
> + *
> + * u32 bpf_get_task_flags(struct task_struct *task)
> + *     Return: task->flags
> + *
>    * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
>    *     Return: 0 on success or negative error
>    *
> @@ -602,6 +612,9 @@ union bpf_attr {
>   	FN(get_current_uid_gid),	\
>   	FN(get_current_comm),		\
>   	FN(get_cgroup_classid),		\
> +	FN(get_task_pid_tgid),		\
> +	FN(get_task_comm),		\
> +	FN(get_task_flags),		\
>   	FN(skb_vlan_push),		\
>   	FN(skb_vlan_pop),		\
>   	FN(skb_get_tunnel_key),		\
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 7b62df86be1d..c69c17d6514a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1438,6 +1438,9 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
>   const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
>   const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
>   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> +const struct bpf_func_proto bpf_get_task_pid_tgid_proto __weak;
> +const struct bpf_func_proto bpf_get_task_comm_proto __weak;
> +const struct bpf_func_proto bpf_get_task_flags_proto __weak;
>   const struct bpf_func_proto bpf_sock_map_update_proto __weak;
>   
>   const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 3d24e238221e..f45259dce117 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -179,3 +179,78 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
>   	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
>   	.arg2_type	= ARG_CONST_SIZE,
>   };
> +
> +BPF_CALL_1(bpf_get_task_pid_tgid, struct task_struct *, task)
> +{
> +	int ret;
> +	u32 pid, tgid;
> +
> +	ret = probe_kernel_read(&pid, &task->pid, sizeof(pid));
> +	if (unlikely(ret < 0))
> +		goto err;
> +
> +	ret = probe_kernel_read(&tgid, &task->tgid, sizeof(tgid));
> +	if (unlikely(ret < 0))
> +		goto err;
> +
> +	return (u64) tgid << 32 | pid;
> +err:
> +	return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_pid_tgid_proto = {
> +	.func		= bpf_get_task_pid_tgid,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_3(bpf_get_task_comm, struct task_struct *, task, char *, buf, u32, size)
> +{
> +	int ret;
> +	char comm[TASK_COMM_LEN];
> +
> +	ret = probe_kernel_read(comm, task->comm, sizeof(comm));
> +	if (unlikely(ret < 0))
> +		goto err_clear;
> +
> +	strncpy(buf, comm, size);
> +
> +	/* Verifier guarantees that size > 0. For task->comm exceeding
> +	 * size, guarantee that buf is %NUL-terminated. Unconditionally
> +	 * done here to save the size test.
> +	 */
> +	buf[size - 1] = 0;
> +	return 0;
> +err_clear:
> +	memset(buf, 0, size);
> +	return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_comm_proto = {
> +	.func		= bpf_get_task_comm,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +};
> +
> +BPF_CALL_1(bpf_get_task_flags, struct task_struct *, task)
> +{
> +	int ret;
> +	unsigned int flags;
> +
> +	ret = probe_kernel_read(&flags, &task->flags, sizeof(flags));
> +	if (unlikely(ret < 0))
> +		return -EINVAL;
> +
> +	return flags;
> +}
> +
> +const struct bpf_func_proto bpf_get_task_flags_proto = {
> +	.func		= bpf_get_task_flags,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_ANYTHING,
> +};
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index dc498b605d5d..a31f5cf68cbc 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -477,6 +477,12 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
>   		return &bpf_get_smp_processor_id_proto;
>   	case BPF_FUNC_get_numa_node_id:
>   		return &bpf_get_numa_node_id_proto;
> +	case BPF_FUNC_get_task_pid_tgid:
> +		return &bpf_get_task_pid_tgid_proto;
> +	case BPF_FUNC_get_task_comm:
> +		return &bpf_get_task_comm_proto;
> +	case BPF_FUNC_get_task_flags:
> +		return &bpf_get_task_flags_proto;
>   	case BPF_FUNC_perf_event_read:
>   		return &bpf_perf_event_read_proto;
>   	case BPF_FUNC_probe_write_user:
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index b2e02bdcd098..8c64df027d2c 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -1,6 +1,8 @@
>   #ifndef __BPF_HELPERS_H
>   #define __BPF_HELPERS_H
>   
> +struct task_struct;
> +
>   /* helper macro to place programs, maps, license in
>    * different sections in elf_bpf file. Section names
>    * are interpreted by elf_bpf loader
> @@ -31,6 +33,13 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
>   	(void *) BPF_FUNC_get_current_uid_gid;
>   static int (*bpf_get_current_comm)(void *buf, int buf_size) =
>   	(void *) BPF_FUNC_get_current_comm;
> +static unsigned long long (*bpf_get_task_pid_tgid)(struct task_struct *task) =
> +	(void *) BPF_FUNC_get_task_pid_tgid;
> +static int (*bpf_get_task_comm)(struct task_struct *task,
> +				void *buf, int buf_size) =
> +	(void *) BPF_FUNC_get_task_comm;
> +static unsigned int (*bpf_get_task_flags)(struct task_struct *task) =
> +	(void *) BPF_FUNC_get_task_flags;
>   static unsigned long long (*bpf_perf_event_read)(void *map,
>   						 unsigned long long flags) =
>   	(void *) BPF_FUNC_perf_event_read;
>
Alexei Starovoitov Nov. 7, 2017, 8:08 a.m. UTC | #7
On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>> I thought such struct shouldn't change layout.
>> If it is we need to fix include/linux/compiler-clang.h to do that
>> anon struct as well.
>
> We considered that, but it looked to be very dependent on the version of
> gcc used to build the kernel. But, this may be a simpler approach for
> the shorter term.
>

why it would depend on version of gcc?
We just need this, no?

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index de179993e039..4e29ab6187cb 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -15,3 +15,6 @@
   * with any version that can compile the kernel
   */
  #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), 
__COUNTER__)
+
+#define randomized_struct_fields_start struct {
+#define randomized_struct_fields_end   };

since offsets are mandated by C standard.
Naveen N. Rao Nov. 7, 2017, 8:37 a.m. UTC | #8
Alexei Starovoitov wrote:
> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>> I thought such struct shouldn't change layout.
>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>> anon struct as well.
>>
>> We considered that, but it looked to be very dependent on the version of
>> gcc used to build the kernel. But, this may be a simpler approach for
>> the shorter term.
>>
> 
> why it would depend on version of gcc?

>From what I can see, randomized_struct_fields_start is defined only for 
gcc >= 4.6. For older versions, it does not get mapped to an anonymous 
structure. We may not care for older gcc versions, but..

The other issue was that __randomize_layout maps to __designated_init 
when randstruct plugin is not enabled, which is in turn an attribute on 
gcc >= v5.1, but not otherwise.

> We just need this, no?
> 
> diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
> index de179993e039..4e29ab6187cb 100644
> --- a/include/linux/compiler-clang.h
> +++ b/include/linux/compiler-clang.h
> @@ -15,3 +15,6 @@
>    * with any version that can compile the kernel
>    */
>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), 
> __COUNTER__)
> +
> +#define randomized_struct_fields_start struct {
> +#define randomized_struct_fields_end   };
> 
> since offsets are mandated by C standard.

Yes, this is what we're testing with and is probably sufficient for our 
purposes.

- Naveen
Y Song Nov. 7, 2017, 9:14 p.m. UTC | #9
On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
<naveen.n.rao@linux.vnet.ibm.com> wrote:
> Alexei Starovoitov wrote:
>>
>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>
>>>> I thought such struct shouldn't change layout.
>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>> anon struct as well.
>>>
>>>
>>> We considered that, but it looked to be very dependent on the version of
>>> gcc used to build the kernel. But, this may be a simpler approach for
>>> the shorter term.
>>>
>>
>> why it would depend on version of gcc?
>
>
> From what I can see, randomized_struct_fields_start is defined only for gcc
>>= 4.6. For older versions, it does not get mapped to an anonymous
> structure. We may not care for older gcc versions, but..
>
> The other issue was that __randomize_layout maps to __designated_init when
> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
> v5.1, but not otherwise.
>
>> We just need this, no?
>>
>> diff --git a/include/linux/compiler-clang.h
>> b/include/linux/compiler-clang.h
>> index de179993e039..4e29ab6187cb 100644
>> --- a/include/linux/compiler-clang.h
>> +++ b/include/linux/compiler-clang.h
>> @@ -15,3 +15,6 @@
>>    * with any version that can compile the kernel
>>    */
>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>> __COUNTER__)
>> +
>> +#define randomized_struct_fields_start struct {
>> +#define randomized_struct_fields_end   };
>>
>> since offsets are mandated by C standard.
>
>
> Yes, this is what we're testing with and is probably sufficient for our
> purposes.

Just tested this with bcc. bcc actually complains. the rewriter
is not able to rewrite prev->pid where prev is "struct task_struct *prev".
I will change bcc rewriter to see whether the field value is correct or not.

Not sure my understanding is correct or not, but I am afraid that
the above approach for clang compiler change may not work.
If clang calculates the field offset based on header file, the offset
may not be the same as kernel one....

I verified that the drawf info with randomized structure config does not
match randomized structure member offset. Specifically, I tried
linux/proc_ns.h struct proc_ns_operations,
dwarf says:
  field name: offset 0
  field real_ns_name: offset 8
But if you print out the real offset at runtime, you get 40 and 16 respectively.

>
> - Naveen
>
>
Atish Patra Nov. 7, 2017, 9:31 p.m. UTC | #10
On 11/07/2017 03:14 PM, Y Song wrote:
> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>> Alexei Starovoitov wrote:
>>>
>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>
>>>>> I thought such struct shouldn't change layout.
>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>> anon struct as well.
>>>>
>>>>
>>>> We considered that, but it looked to be very dependent on the version of
>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>> the shorter term.
>>>>
>>>
>>> why it would depend on version of gcc?
>>
>>
>> From what I can see, randomized_struct_fields_start is defined only for gcc
>>> = 4.6. For older versions, it does not get mapped to an anonymous
>> structure. We may not care for older gcc versions, but..
>>
>> The other issue was that __randomize_layout maps to __designated_init when
>> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
>> v5.1, but not otherwise.
>>
>>> We just need this, no?
>>>
>>> diff --git a/include/linux/compiler-clang.h
>>> b/include/linux/compiler-clang.h
>>> index de179993e039..4e29ab6187cb 100644
>>> --- a/include/linux/compiler-clang.h
>>> +++ b/include/linux/compiler-clang.h
>>> @@ -15,3 +15,6 @@
>>>    * with any version that can compile the kernel
>>>    */
>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>> __COUNTER__)
>>> +
>>> +#define randomized_struct_fields_start struct {
>>> +#define randomized_struct_fields_end   };
>>>
>>> since offsets are mandated by C standard.
>>
>>
>> Yes, this is what we're testing with and is probably sufficient for our
>> purposes.
>
> Just tested this with bcc. bcc actually complains. the rewriter
> is not able to rewrite prev->pid where prev is "struct task_struct *prev".
> I will change bcc rewriter to see whether the field value is correct or not.
>
> Not sure my understanding is correct or not, but I am afraid that
> the above approach for clang compiler change may not work.
> If clang calculates the field offset based on header file, the offset
> may not be the same as kernel one....
>
> I verified that the drawf info with randomized structure config does not
> match randomized structure member offset. Specifically, I tried
> linux/proc_ns.h struct proc_ns_operations,
> dwarf says:
>   field name: offset 0
>   field real_ns_name: offset 8
> But if you print out the real offset at runtime, you get 40 and 16 respectively.
>
I am also trying to get it work with bcc as any python scripts that 
access task_struct gives wrong output (task_switch.py, runqlen.py). I 
recompiled my kernel (4.14-rc7 & bcc) with the patch.

I am seeing following error.
# ./task_switch.py
/virtual/main.c:17:18: warning: implicit declaration of function 
'bpf_get_task_pid_tgid' is invalid in C99
       [-Wimplicit-function-declaration]
   key.prev_pid = bpf_get_task_pid_tgid(prev);
                  ^
1 warning generated.
LLVM ERROR: Program used external function 'bpf_get_task_pid_tgid' which 
could not be resolved!

Are you also seeing the same issue or something else ?

Regards,
Atish
>>
>> - Naveen
>>
>>
>
Alexei Starovoitov Nov. 7, 2017, 9:39 p.m. UTC | #11
On 11/8/17 6:14 AM, Y Song wrote:
> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>> Alexei Starovoitov wrote:
>>>
>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>
>>>>> I thought such struct shouldn't change layout.
>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>> anon struct as well.
>>>>
>>>>
>>>> We considered that, but it looked to be very dependent on the version of
>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>> the shorter term.
>>>>
>>>
>>> why it would depend on version of gcc?
>>
>>
>> From what I can see, randomized_struct_fields_start is defined only for gcc
>>> = 4.6. For older versions, it does not get mapped to an anonymous
>> structure. We may not care for older gcc versions, but..
>>
>> The other issue was that __randomize_layout maps to __designated_init when
>> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
>> v5.1, but not otherwise.
>>
>>> We just need this, no?
>>>
>>> diff --git a/include/linux/compiler-clang.h
>>> b/include/linux/compiler-clang.h
>>> index de179993e039..4e29ab6187cb 100644
>>> --- a/include/linux/compiler-clang.h
>>> +++ b/include/linux/compiler-clang.h
>>> @@ -15,3 +15,6 @@
>>>    * with any version that can compile the kernel
>>>    */
>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>> __COUNTER__)
>>> +
>>> +#define randomized_struct_fields_start struct {
>>> +#define randomized_struct_fields_end   };
>>>
>>> since offsets are mandated by C standard.
>>
>>
>> Yes, this is what we're testing with and is probably sufficient for our
>> purposes.
>
> Just tested this with bcc. bcc actually complains. the rewriter
> is not able to rewrite prev->pid where prev is "struct task_struct *prev".
> I will change bcc rewriter to see whether the field value is correct or not.
>
> Not sure my understanding is correct or not, but I am afraid that
> the above approach for clang compiler change may not work.
> If clang calculates the field offset based on header file, the offset
> may not be the same as kernel one....

why is that?
When randomization is off both gcc and clang must generate the same
offsets, since it's C standard.

bcc rewriter issue is odd. I suspect it was broken from day one.
Meaning that bcc didn't support poking into anonymous union and structs.

> I verified that the drawf info with randomized structure config does not
> match randomized structure member offset. Specifically, I tried
> linux/proc_ns.h struct proc_ns_operations,
> dwarf says:
>   field name: offset 0
>   field real_ns_name: offset 8
> But if you print out the real offset at runtime, you get 40 and 16 respectively.

thanks for confirming. It means that gcc randomization plugin is broken
and has to be fixed with regard to adjusting debug info while
randomizing the fields.
Y Song Nov. 7, 2017, 9:45 p.m. UTC | #12
On Tue, Nov 7, 2017 at 1:31 PM, Atish Patra <atish.patra@oracle.com> wrote:
> On 11/07/2017 03:14 PM, Y Song wrote:
>>
>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>
>>> Alexei Starovoitov wrote:
>>>>
>>>>
>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>
>>>>>>
>>>>>> I thought such struct shouldn't change layout.
>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>> anon struct as well.
>>>>>
>>>>>
>>>>>
>>>>> We considered that, but it looked to be very dependent on the version
>>>>> of
>>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>>> the shorter term.
>>>>>
>>>>
>>>> why it would depend on version of gcc?
>>>
>>>
>>>
>>> From what I can see, randomized_struct_fields_start is defined only for
>>> gcc
>>>>
>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>
>>> structure. We may not care for older gcc versions, but..
>>>
>>> The other issue was that __randomize_layout maps to __designated_init
>>> when
>>> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
>>> v5.1, but not otherwise.
>>>
>>>> We just need this, no?
>>>>
>>>> diff --git a/include/linux/compiler-clang.h
>>>> b/include/linux/compiler-clang.h
>>>> index de179993e039..4e29ab6187cb 100644
>>>> --- a/include/linux/compiler-clang.h
>>>> +++ b/include/linux/compiler-clang.h
>>>> @@ -15,3 +15,6 @@
>>>>    * with any version that can compile the kernel
>>>>    */
>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>> __COUNTER__)
>>>> +
>>>> +#define randomized_struct_fields_start struct {
>>>> +#define randomized_struct_fields_end   };
>>>>
>>>> since offsets are mandated by C standard.
>>>
>>>
>>>
>>> Yes, this is what we're testing with and is probably sufficient for our
>>> purposes.
>>
>>
>> Just tested this with bcc. bcc actually complains. the rewriter
>> is not able to rewrite prev->pid where prev is "struct task_struct *prev".
>> I will change bcc rewriter to see whether the field value is correct or
>> not.

Just verified in bcc with the following hack:
--- a/examples/tracing/task_switch.py
+++ b/examples/tracing/task_switch.py
@@ -20,7 +20,8 @@ int count_sched(struct pt_regs *ctx, struct
task_struct *prev) {
   u64 zero = 0, *val;

   key.curr_pid = bpf_get_current_pid_tgid();
-  key.prev_pid = prev->pid;
+  bpf_probe_read(&key.prev_pid, 4, &prev->pid);
+  // key.prev_pid = prev->pid;

   val = stats.lookup_or_init(&key, &zero);
   (*val)++;
diff --git a/src/cc/frontends/clang/b_frontend_action.cc
b/src/cc/frontends/clang/b_frontend_action.cc
index c11d89e..df48115 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -827,6 +827,7 @@ void
BTypeConsumer::HandleTranslationUnit(ASTContext &Context) {
    */
   for (it = DC->decls_begin(); it != DC->decls_end(); it++) {
     Decl *D = *it;
+#if 0
     if (FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
       if (fe_.is_rewritable_ext_func(F)) {
         for (auto arg : F->parameters()) {
@@ -836,6 +837,7 @@ void
BTypeConsumer::HandleTranslationUnit(ASTContext &Context) {
         probe_visitor_.TraverseDecl(D);
       }
     }
+#endif

Basically, explicitly using bpf_probe_read instead of letting rewriter
to change it.
Also disable probe rewriting part in the frontend.

I confirmed that value of prev->pid is always 0 (wrong).

The linux patch I have is:
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 54dfef70a072..5d9609ea8e30 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -16,3 +16,6 @@
  * with any version that can compile the kernel
  */
 #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+
+#define randomized_struct_fields_start struct {
+#define randomized_struct_fields_end   };

Therefore, getting bpf program to access randomized structure
through either BTF or fixed dwarfinfo may be the best option.
I know dwarfinfo is too big so smaller BTF is more desirable.

>>
>> Not sure my understanding is correct or not, but I am afraid that
>> the above approach for clang compiler change may not work.
>> If clang calculates the field offset based on header file, the offset
>> may not be the same as kernel one....
>>
>> I verified that the drawf info with randomized structure config does not
>> match randomized structure member offset. Specifically, I tried
>> linux/proc_ns.h struct proc_ns_operations,
>> dwarf says:
>>   field name: offset 0
>>   field real_ns_name: offset 8
>> But if you print out the real offset at runtime, you get 40 and 16
>> respectively.
>>
> I am also trying to get it work with bcc as any python scripts that access
> task_struct gives wrong output (task_switch.py, runqlen.py). I recompiled my
> kernel (4.14-rc7 & bcc) with the patch.
>
> I am seeing following error.
> # ./task_switch.py
> /virtual/main.c:17:18: warning: implicit declaration of function
> 'bpf_get_task_pid_tgid' is invalid in C99
>       [-Wimplicit-function-declaration]
>   key.prev_pid = bpf_get_task_pid_tgid(prev);
>                  ^
> 1 warning generated.
> LLVM ERROR: Program used external function 'bpf_get_task_pid_tgid' which
> could not be resolved!
>
> Are you also seeing the same issue or something else ?

I did not apply the patch so I did not use bpf_get_task_pid_tgid helper.
When applying the above compiler-clang.h patch, I got:
[yhs@localhost tracing]$ sudo ./task_switch.py
/virtual/main.c:16:18: error: internal error: opLoc is invalid while
preparing probe rewrite
  key.prev_pid = prev->pid;
                 ^
1 error generated.
Traceback (most recent call last):
  File "./task_switch.py", line 29, in <module>
    """)
  File "/usr/lib/python2.7/site-packages/bcc/__init__.py", line 296, in __init__
    raise Exception("Failed to compile BPF module %s" % src_file)
Exception: Failed to compile BPF module

I just hacked bcc with the above patch and the issue is going away.

>
> Regards,
> Atish
>>>
>>>
>>> - Naveen
>>>
>>>
>>
>
Y Song Nov. 7, 2017, 9:47 p.m. UTC | #13
On Tue, Nov 7, 2017 at 1:39 PM, Alexei Starovoitov <ast@fb.com> wrote:
> On 11/8/17 6:14 AM, Y Song wrote:
>>
>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>
>>> Alexei Starovoitov wrote:
>>>>
>>>>
>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>
>>>>>>
>>>>>> I thought such struct shouldn't change layout.
>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>> anon struct as well.
>>>>>
>>>>>
>>>>>
>>>>> We considered that, but it looked to be very dependent on the version
>>>>> of
>>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>>> the shorter term.
>>>>>
>>>>
>>>> why it would depend on version of gcc?
>>>
>>>
>>>
>>> From what I can see, randomized_struct_fields_start is defined only for
>>> gcc
>>>>
>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>
>>> structure. We may not care for older gcc versions, but..
>>>
>>> The other issue was that __randomize_layout maps to __designated_init
>>> when
>>> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
>>> v5.1, but not otherwise.
>>>
>>>> We just need this, no?
>>>>
>>>> diff --git a/include/linux/compiler-clang.h
>>>> b/include/linux/compiler-clang.h
>>>> index de179993e039..4e29ab6187cb 100644
>>>> --- a/include/linux/compiler-clang.h
>>>> +++ b/include/linux/compiler-clang.h
>>>> @@ -15,3 +15,6 @@
>>>>    * with any version that can compile the kernel
>>>>    */
>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>> __COUNTER__)
>>>> +
>>>> +#define randomized_struct_fields_start struct {
>>>> +#define randomized_struct_fields_end   };
>>>>
>>>> since offsets are mandated by C standard.
>>>
>>>
>>>
>>> Yes, this is what we're testing with and is probably sufficient for our
>>> purposes.
>>
>>
>> Just tested this with bcc. bcc actually complains. the rewriter
>> is not able to rewrite prev->pid where prev is "struct task_struct *prev".
>> I will change bcc rewriter to see whether the field value is correct or
>> not.
>>
>> Not sure my understanding is correct or not, but I am afraid that
>> the above approach for clang compiler change may not work.
>> If clang calculates the field offset based on header file, the offset
>> may not be the same as kernel one....
>
>
> why is that?
> When randomization is off both gcc and clang must generate the same
> offsets, since it's C standard.

The patch changed compiler-clang.h, so gcc still do randomization.

>
> bcc rewriter issue is odd. I suspect it was broken from day one.
> Meaning that bcc didn't support poking into anonymous union and structs.

This seems right.

>
>> I verified that the drawf info with randomized structure config does not
>> match randomized structure member offset. Specifically, I tried
>> linux/proc_ns.h struct proc_ns_operations,
>> dwarf says:
>>   field name: offset 0
>>   field real_ns_name: offset 8
>> But if you print out the real offset at runtime, you get 40 and 16
>> respectively.
>
>
> thanks for confirming. It means that gcc randomization plugin is broken
> and has to be fixed with regard to adjusting debug info while
> randomizing the fields.
>
Alexei Starovoitov Nov. 7, 2017, 10:04 p.m. UTC | #14
On 11/8/17 6:47 AM, Y Song wrote:
> On Tue, Nov 7, 2017 at 1:39 PM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 11/8/17 6:14 AM, Y Song wrote:
>>>
>>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>>
>>>> Alexei Starovoitov wrote:
>>>>>
>>>>>
>>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>>
>>>>>>>
>>>>>>> I thought such struct shouldn't change layout.
>>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>>> anon struct as well.
>>>>>>
>>>>>>
>>>>>>
>>>>>> We considered that, but it looked to be very dependent on the version
>>>>>> of
>>>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>>>> the shorter term.
>>>>>>
>>>>>
>>>>> why it would depend on version of gcc?
>>>>
>>>>
>>>>
>>>> From what I can see, randomized_struct_fields_start is defined only for
>>>> gcc
>>>>>
>>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>>
>>>> structure. We may not care for older gcc versions, but..
>>>>
>>>> The other issue was that __randomize_layout maps to __designated_init
>>>> when
>>>> randstruct plugin is not enabled, which is in turn an attribute on gcc >=
>>>> v5.1, but not otherwise.
>>>>
>>>>> We just need this, no?
>>>>>
>>>>> diff --git a/include/linux/compiler-clang.h
>>>>> b/include/linux/compiler-clang.h
>>>>> index de179993e039..4e29ab6187cb 100644
>>>>> --- a/include/linux/compiler-clang.h
>>>>> +++ b/include/linux/compiler-clang.h
>>>>> @@ -15,3 +15,6 @@
>>>>>    * with any version that can compile the kernel
>>>>>    */
>>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>>> __COUNTER__)
>>>>> +
>>>>> +#define randomized_struct_fields_start struct {
>>>>> +#define randomized_struct_fields_end   };
>>>>>
>>>>> since offsets are mandated by C standard.
>>>>
>>>>
>>>>
>>>> Yes, this is what we're testing with and is probably sufficient for our
>>>> purposes.
>>>
>>>
>>> Just tested this with bcc. bcc actually complains. the rewriter
>>> is not able to rewrite prev->pid where prev is "struct task_struct *prev".
>>> I will change bcc rewriter to see whether the field value is correct or
>>> not.
>>>
>>> Not sure my understanding is correct or not, but I am afraid that
>>> the above approach for clang compiler change may not work.
>>> If clang calculates the field offset based on header file, the offset
>>> may not be the same as kernel one....
>>
>>
>> why is that?
>> When randomization is off both gcc and clang must generate the same
>> offsets, since it's C standard.
>
> The patch changed compiler-clang.h, so gcc still do randomization.

gcc_plugins are off by default and randomization will not be
turned on for any sane distro or datacenter that cares about
performance and stability.
So imo above compiler-clang.h patch together with bcc fix would
be enough.
Y Song Nov. 7, 2017, 10:42 p.m. UTC | #15
On Tue, Nov 7, 2017 at 2:04 PM, Alexei Starovoitov <ast@fb.com> wrote:
> On 11/8/17 6:47 AM, Y Song wrote:
>>
>> On Tue, Nov 7, 2017 at 1:39 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>> On 11/8/17 6:14 AM, Y Song wrote:
>>>>
>>>>
>>>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>>>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>>>
>>>>>
>>>>> Alexei Starovoitov wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> I thought such struct shouldn't change layout.
>>>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>>>> anon struct as well.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> We considered that, but it looked to be very dependent on the version
>>>>>>> of
>>>>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>>>>> the shorter term.
>>>>>>>
>>>>>>
>>>>>> why it would depend on version of gcc?
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> From what I can see, randomized_struct_fields_start is defined only for
>>>>> gcc
>>>>>>
>>>>>>
>>>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>>>
>>>>>
>>>>> structure. We may not care for older gcc versions, but..
>>>>>
>>>>> The other issue was that __randomize_layout maps to __designated_init
>>>>> when
>>>>> randstruct plugin is not enabled, which is in turn an attribute on gcc
>>>>> >=
>>>>> v5.1, but not otherwise.
>>>>>
>>>>>> We just need this, no?
>>>>>>
>>>>>> diff --git a/include/linux/compiler-clang.h
>>>>>> b/include/linux/compiler-clang.h
>>>>>> index de179993e039..4e29ab6187cb 100644
>>>>>> --- a/include/linux/compiler-clang.h
>>>>>> +++ b/include/linux/compiler-clang.h
>>>>>> @@ -15,3 +15,6 @@
>>>>>>    * with any version that can compile the kernel
>>>>>>    */
>>>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>>>> __COUNTER__)
>>>>>> +
>>>>>> +#define randomized_struct_fields_start struct {
>>>>>> +#define randomized_struct_fields_end   };
>>>>>>
>>>>>> since offsets are mandated by C standard.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Yes, this is what we're testing with and is probably sufficient for our
>>>>> purposes.
>>>>
>>>>
>>>>
>>>> Just tested this with bcc. bcc actually complains. the rewriter
>>>> is not able to rewrite prev->pid where prev is "struct task_struct
>>>> *prev".
>>>> I will change bcc rewriter to see whether the field value is correct or
>>>> not.
>>>>
>>>> Not sure my understanding is correct or not, but I am afraid that
>>>> the above approach for clang compiler change may not work.
>>>> If clang calculates the field offset based on header file, the offset
>>>> may not be the same as kernel one....
>>>
>>>
>>>
>>> why is that?
>>> When randomization is off both gcc and clang must generate the same
>>> offsets, since it's C standard.
>>
>>
>> The patch changed compiler-clang.h, so gcc still do randomization.
>
>
> gcc_plugins are off by default and randomization will not be
> turned on for any sane distro or datacenter that cares about
> performance and stability.
> So imo above compiler-clang.h patch together with bcc fix would
> be enough.

Agree that short time the suggested fix should be enough.
Long time, disto could become "insane" someday :-)
Atish Patra Nov. 8, 2017, 12:29 a.m. UTC | #16
On 11/07/2017 04:42 PM, Y Song wrote:
> On Tue, Nov 7, 2017 at 2:04 PM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 11/8/17 6:47 AM, Y Song wrote:
>>>
>>> On Tue, Nov 7, 2017 at 1:39 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>
>>>> On 11/8/17 6:14 AM, Y Song wrote:
>>>>>
>>>>>
>>>>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>>>>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>>>>
>>>>>>
>>>>>> Alexei Starovoitov wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I thought such struct shouldn't change layout.
>>>>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>>>>> anon struct as well.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> We considered that, but it looked to be very dependent on the version
>>>>>>>> of
>>>>>>>> gcc used to build the kernel. But, this may be a simpler approach for
>>>>>>>> the shorter term.
>>>>>>>>
>>>>>>>
>>>>>>> why it would depend on version of gcc?
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> From what I can see, randomized_struct_fields_start is defined only for
>>>>>> gcc
>>>>>>>
>>>>>>>
>>>>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>>>>
>>>>>>
>>>>>> structure. We may not care for older gcc versions, but..
>>>>>>
>>>>>> The other issue was that __randomize_layout maps to __designated_init
>>>>>> when
>>>>>> randstruct plugin is not enabled, which is in turn an attribute on gcc
>>>>>>> =
>>>>>> v5.1, but not otherwise.
>>>>>>
>>>>>>> We just need this, no?
>>>>>>>
>>>>>>> diff --git a/include/linux/compiler-clang.h
>>>>>>> b/include/linux/compiler-clang.h
>>>>>>> index de179993e039..4e29ab6187cb 100644
>>>>>>> --- a/include/linux/compiler-clang.h
>>>>>>> +++ b/include/linux/compiler-clang.h
>>>>>>> @@ -15,3 +15,6 @@
>>>>>>>    * with any version that can compile the kernel
>>>>>>>    */
>>>>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>>>>> __COUNTER__)
>>>>>>> +
>>>>>>> +#define randomized_struct_fields_start struct {
>>>>>>> +#define randomized_struct_fields_end   };
>>>>>>>
>>>>>>> since offsets are mandated by C standard.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Yes, this is what we're testing with and is probably sufficient for our
>>>>>> purposes.
>>>>>
>>>>>
>>>>>
>>>>> Just tested this with bcc. bcc actually complains. the rewriter
>>>>> is not able to rewrite prev->pid where prev is "struct task_struct
>>>>> *prev".
>>>>> I will change bcc rewriter to see whether the field value is correct or
>>>>> not.
>>>>>
>>>>> Not sure my understanding is correct or not, but I am afraid that
>>>>> the above approach for clang compiler change may not work.
>>>>> If clang calculates the field offset based on header file, the offset
>>>>> may not be the same as kernel one....
>>>>
>>>>
>>>>
>>>> why is that?
>>>> When randomization is off both gcc and clang must generate the same
>>>> offsets, since it's C standard.
>>>
>>>
>>> The patch changed compiler-clang.h, so gcc still do randomization.
>>
>>
>> gcc_plugins are off by default and randomization will not be
>> turned on for any sane distro or datacenter that cares about
>> performance and stability.
>> So imo above compiler-clang.h patch together with bcc fix would
>> be enough.
>
> Agree that short time the suggested fix should be enough.
> Long time, disto could become "insane" someday :-)
>

Yup it works with clang compiler & bcc hack. Thanks :).
I verified it with task_switch.py & runqlat.py.

Are you going to push the bcc hacks to github repo ?

Regards,
Atish
Y Song Nov. 8, 2017, 1:25 a.m. UTC | #17
On Tue, Nov 7, 2017 at 4:29 PM, Atish Patra <atish.patra@oracle.com> wrote:
> On 11/07/2017 04:42 PM, Y Song wrote:
>>
>> On Tue, Nov 7, 2017 at 2:04 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>> On 11/8/17 6:47 AM, Y Song wrote:
>>>>
>>>>
>>>> On Tue, Nov 7, 2017 at 1:39 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>>>>
>>>>>
>>>>> On 11/8/17 6:14 AM, Y Song wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On Tue, Nov 7, 2017 at 12:37 AM, Naveen N. Rao
>>>>>> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Alexei Starovoitov wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 11/7/17 12:55 AM, Naveen N. Rao wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I thought such struct shouldn't change layout.
>>>>>>>>>> If it is we need to fix include/linux/compiler-clang.h to do that
>>>>>>>>>> anon struct as well.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> We considered that, but it looked to be very dependent on the
>>>>>>>>> version
>>>>>>>>> of
>>>>>>>>> gcc used to build the kernel. But, this may be a simpler approach
>>>>>>>>> for
>>>>>>>>> the shorter term.
>>>>>>>>>
>>>>>>>>
>>>>>>>> why it would depend on version of gcc?
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> From what I can see, randomized_struct_fields_start is defined only
>>>>>>> for
>>>>>>> gcc
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> = 4.6. For older versions, it does not get mapped to an anonymous
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> structure. We may not care for older gcc versions, but..
>>>>>>>
>>>>>>> The other issue was that __randomize_layout maps to __designated_init
>>>>>>> when
>>>>>>> randstruct plugin is not enabled, which is in turn an attribute on
>>>>>>> gcc
>>>>>>>>
>>>>>>>> =
>>>>>>>
>>>>>>> v5.1, but not otherwise.
>>>>>>>
>>>>>>>> We just need this, no?
>>>>>>>>
>>>>>>>> diff --git a/include/linux/compiler-clang.h
>>>>>>>> b/include/linux/compiler-clang.h
>>>>>>>> index de179993e039..4e29ab6187cb 100644
>>>>>>>> --- a/include/linux/compiler-clang.h
>>>>>>>> +++ b/include/linux/compiler-clang.h
>>>>>>>> @@ -15,3 +15,6 @@
>>>>>>>>    * with any version that can compile the kernel
>>>>>>>>    */
>>>>>>>>   #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix),
>>>>>>>> __COUNTER__)
>>>>>>>> +
>>>>>>>> +#define randomized_struct_fields_start struct {
>>>>>>>> +#define randomized_struct_fields_end   };
>>>>>>>>
>>>>>>>> since offsets are mandated by C standard.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Yes, this is what we're testing with and is probably sufficient for
>>>>>>> our
>>>>>>> purposes.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Just tested this with bcc. bcc actually complains. the rewriter
>>>>>> is not able to rewrite prev->pid where prev is "struct task_struct
>>>>>> *prev".
>>>>>> I will change bcc rewriter to see whether the field value is correct
>>>>>> or
>>>>>> not.
>>>>>>
>>>>>> Not sure my understanding is correct or not, but I am afraid that
>>>>>> the above approach for clang compiler change may not work.
>>>>>> If clang calculates the field offset based on header file, the offset
>>>>>> may not be the same as kernel one....
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> why is that?
>>>>> When randomization is off both gcc and clang must generate the same
>>>>> offsets, since it's C standard.
>>>>
>>>>
>>>>
>>>> The patch changed compiler-clang.h, so gcc still do randomization.
>>>
>>>
>>>
>>> gcc_plugins are off by default and randomization will not be
>>> turned on for any sane distro or datacenter that cares about
>>> performance and stability.
>>> So imo above compiler-clang.h patch together with bcc fix would
>>> be enough.
>>
>>
>> Agree that short time the suggested fix should be enough.
>> Long time, disto could become "insane" someday :-)
>>
>
> Yup it works with clang compiler & bcc hack. Thanks :).
> I verified it with task_switch.py & runqlat.py.
>
> Are you going to push the bcc hacks to github repo ?

I will need to have proper implementation than a hack. Yes, once done, will
push into bcc repo. Should be done soon.

>
> Regards,
> Atish

Patch
diff mbox series

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f1af7d63d678..5993a0f5262b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -418,6 +418,9 @@  extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
+extern const struct bpf_func_proto bpf_get_task_pid_tgid_proto;
+extern const struct bpf_func_proto bpf_get_task_comm_proto;
+extern const struct bpf_func_proto bpf_get_task_flags_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f90860d1f897..324508d27bd2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -338,6 +338,16 @@  union bpf_attr {
  *     @skb: pointer to skb
  *     Return: classid if != 0
  *
+ * u64 bpf_get_task_pid_tgid(struct task_struct *task)
+ *     Return: task->tgid << 32 | task->pid
+ *
+ * int bpf_get_task_comm(struct task_struct *task)
+ *     Stores task->comm into buf
+ *     Return: 0 on success or negative error
+ *
+ * u32 bpf_get_task_flags(struct task_struct *task)
+ *     Return: task->flags
+ *
  * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
  *     Return: 0 on success or negative error
  *
@@ -602,6 +612,9 @@  union bpf_attr {
 	FN(get_current_uid_gid),	\
 	FN(get_current_comm),		\
 	FN(get_cgroup_classid),		\
+	FN(get_task_pid_tgid),		\
+	FN(get_task_comm),		\
+	FN(get_task_flags),		\
 	FN(skb_vlan_push),		\
 	FN(skb_vlan_pop),		\
 	FN(skb_get_tunnel_key),		\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..c69c17d6514a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1438,6 +1438,9 @@  const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_get_task_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_task_comm_proto __weak;
+const struct bpf_func_proto bpf_get_task_flags_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..f45259dce117 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,78 @@  const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+BPF_CALL_1(bpf_get_task_pid_tgid, struct task_struct *, task)
+{
+	int ret;
+	u32 pid, tgid;
+
+	ret = probe_kernel_read(&pid, &task->pid, sizeof(pid));
+	if (unlikely(ret < 0))
+		goto err;
+
+	ret = probe_kernel_read(&tgid, &task->tgid, sizeof(tgid));
+	if (unlikely(ret < 0))
+		goto err;
+
+	return (u64) tgid << 32 | pid;
+err:
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_task_pid_tgid_proto = {
+	.func		= bpf_get_task_pid_tgid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_get_task_comm, struct task_struct *, task, char *, buf, u32, size)
+{
+	int ret;
+	char comm[TASK_COMM_LEN];
+
+	ret = probe_kernel_read(comm, task->comm, sizeof(comm));
+	if (unlikely(ret < 0))
+		goto err_clear;
+
+	strncpy(buf, comm, size);
+
+	/* Verifier guarantees that size > 0. For task->comm exceeding
+	 * size, guarantee that buf is %NUL-terminated. Unconditionally
+	 * done here to save the size test.
+	 */
+	buf[size - 1] = 0;
+	return 0;
+err_clear:
+	memset(buf, 0, size);
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_task_comm_proto = {
+	.func		= bpf_get_task_comm,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_1(bpf_get_task_flags, struct task_struct *, task)
+{
+	int ret;
+	unsigned int flags;
+
+	ret = probe_kernel_read(&flags, &task->flags, sizeof(flags));
+	if (unlikely(ret < 0))
+		return -EINVAL;
+
+	return flags;
+}
+
+const struct bpf_func_proto bpf_get_task_flags_proto = {
+	.func		= bpf_get_task_flags,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..a31f5cf68cbc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -477,6 +477,12 @@  static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_get_numa_node_id:
 		return &bpf_get_numa_node_id_proto;
+	case BPF_FUNC_get_task_pid_tgid:
+		return &bpf_get_task_pid_tgid_proto;
+	case BPF_FUNC_get_task_comm:
+		return &bpf_get_task_comm_proto;
+	case BPF_FUNC_get_task_flags:
+		return &bpf_get_task_flags_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_probe_write_user:
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index b2e02bdcd098..8c64df027d2c 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -1,6 +1,8 @@ 
 #ifndef __BPF_HELPERS_H
 #define __BPF_HELPERS_H
 
+struct task_struct;
+
 /* helper macro to place programs, maps, license in
  * different sections in elf_bpf file. Section names
  * are interpreted by elf_bpf loader
@@ -31,6 +33,13 @@  static unsigned long long (*bpf_get_current_uid_gid)(void) =
 	(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
 	(void *) BPF_FUNC_get_current_comm;
+static unsigned long long (*bpf_get_task_pid_tgid)(struct task_struct *task) =
+	(void *) BPF_FUNC_get_task_pid_tgid;
+static int (*bpf_get_task_comm)(struct task_struct *task,
+				void *buf, int buf_size) =
+	(void *) BPF_FUNC_get_task_comm;
+static unsigned int (*bpf_get_task_flags)(struct task_struct *task) =
+	(void *) BPF_FUNC_get_task_flags;
 static unsigned long long (*bpf_perf_event_read)(void *map,
 						 unsigned long long flags) =
 	(void *) BPF_FUNC_perf_event_read;