* [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
@ 2016-10-16 16:41 William Tu
2016-10-19 0:50 ` Alexei Starovoitov
0 siblings, 1 reply; 10+ messages in thread
From: William Tu @ 2016-10-16 16:41 UTC (permalink / raw)
To: netdev
When running bpf_map_lookup on percpu elements, the bytes copied to
userspace depends on num_possible_cpus() * value_size, which could
potentially be larger than memory allocated from user, which depends
on sysconf(_SC_NPROCESSORS_CONF) to get the current cpu num. As a
result, the inconsistency might corrupt the user's stack.
The fact that sysconf(_SC_NPROCESSORS_CONF) != num_possible_cpu()
happens when cpu hotadd is enabled. For example, in Fusion when
setting vcpu.hotadd = "TRUE" or in KVM, setting
./qemu-system-x86_64 -smp 2, maxcpus=4 ...
the num_possible_cpu() will be 4 and sysconf() will be 2[1].
Currently any percpu map lookup suffers this issue, try
samples/bpf/test_maps.c or tracex3.c.
The RFC patch adds an additional 'size' param from userspace so that
kernel knows the maximum memory it should copy to the user.
[1] https://www.mail-archive.com/netdev@vger.kernel.org/msg121183.html
Signed-off-by: William Tu <u9012063@gmail.com>
---
include/uapi/linux/bpf.h | 5 ++++-
kernel/bpf/syscall.c | 5 +++--
samples/bpf/fds_example.c | 2 +-
samples/bpf/libbpf.c | 3 ++-
samples/bpf/libbpf.h | 2 +-
samples/bpf/test_maps.c | 30 +++++++++++++++---------------
tools/include/uapi/linux/bpf.h | 5 ++++-
7 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b..fa0c40b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -123,7 +123,10 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
- __u64 flags;
+ union {
+ __u64 flags;
+ __u32 size; /* number of bytes allocated in userspace */
+ };
};
struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..be211ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -264,13 +264,14 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD size
static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
+ u32 usize = attr->size;
struct bpf_map *map;
void *key, *value, *ptr;
u32 value_size;
@@ -324,7 +325,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_value;
err = -EFAULT;
- if (copy_to_user(uvalue, value, value_size) != 0)
+ if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
goto free_value;
err = 0;
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index 625e797..5b833d8 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -88,7 +88,7 @@ static int bpf_do_map(const char *file, uint32_t flags, uint32_t key,
ret, strerror(errno));
assert(ret == 0);
} else if (flags & BPF_F_KEY) {
- ret = bpf_lookup_elem(fd, &key, &value);
+ ret = bpf_lookup_elem(fd, &key, &value, sizeof(value));
printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value,
ret, strerror(errno));
assert(ret == 0);
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 9969e35..9f0a1c3 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -44,12 +44,13 @@ int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
-int bpf_lookup_elem(int fd, void *key, void *value)
+int bpf_lookup_elem(int fd, void *key, void *value, int size)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
+ .size = size,
};
return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb6..b911185 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -7,7 +7,7 @@ struct bpf_insn;
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries, int map_flags);
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
-int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_lookup_elem(int fd, void *key, void *value, int size);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
index cce2b59..a6a8fbe 100644
--- a/samples/bpf/test_maps.c
+++ b/samples/bpf/test_maps.c
@@ -47,11 +47,11 @@ static void test_hashmap_sanity(int i, void *data)
assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 && errno == EINVAL);
/* check that key=1 can be found */
- assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == 0 && value == 1234);
key = 2;
/* check that key=2 is not found */
- assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == -1 && errno == ENOENT);
/* BPF_EXIST means: update existing element */
assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
@@ -139,11 +139,11 @@ static void test_percpu_hashmap_sanity(int task, void *data)
* was run from a different cpu.
*/
value[0] = 1;
- assert(bpf_lookup_elem(map_fd, &key, value) == 0 && value[0] == 100);
+ assert(bpf_lookup_elem(map_fd, &key, value, sizeof(value)) == 0 && value[0] == 100);
key = 2;
/* check that key=2 is not found */
- assert(bpf_lookup_elem(map_fd, &key, value) == -1 && errno == ENOENT);
+ assert(bpf_lookup_elem(map_fd, &key, value, sizeof(value)) == -1 && errno == ENOENT);
/* BPF_EXIST means: update existing element */
assert(bpf_update_elem(map_fd, &key, value, BPF_EXIST) == -1 &&
@@ -170,7 +170,7 @@ static void test_percpu_hashmap_sanity(int task, void *data)
assert((expected_key_mask & next_key) == next_key);
expected_key_mask &= ~next_key;
- assert(bpf_lookup_elem(map_fd, &next_key, value) == 0);
+ assert(bpf_lookup_elem(map_fd, &next_key, value, sizeof(value)) == 0);
for (i = 0; i < nr_cpus; i++)
assert(value[i] == i + 100);
@@ -218,11 +218,11 @@ static void test_arraymap_sanity(int i, void *data)
errno == EEXIST);
/* check that key=1 can be found */
- assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == 0 && value == 1234);
key = 0;
/* check that key=0 is also found and zero initialized */
- assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == 0 && value == 0);
/* key=0 and key=1 were inserted, check that key=2 cannot be inserted
@@ -233,7 +233,7 @@ static void test_arraymap_sanity(int i, void *data)
errno == E2BIG);
/* check that key = 2 doesn't exist */
- assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == -1 && errno == ENOENT);
/* iterate over two elements */
assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
@@ -274,7 +274,7 @@ static void test_percpu_arraymap_many_keys(void)
for (key = 0; key < nr_keys; key++) {
for (i = 0; i < nr_cpus; i++)
values[i] = 0;
- assert(bpf_lookup_elem(map_fd, &key, values) == 0);
+ assert(bpf_lookup_elem(map_fd, &key, values, sizeof(values)) == 0);
for (i = 0; i < nr_cpus; i++)
assert(values[i] == i + 10);
}
@@ -307,11 +307,11 @@ static void test_percpu_arraymap_sanity(int i, void *data)
errno == EEXIST);
/* check that key=1 can be found */
- assert(bpf_lookup_elem(map_fd, &key, values) == 0 && values[0] == 100);
+ assert(bpf_lookup_elem(map_fd, &key, values, sizeof(values)) == 0 && values[0] == 100);
key = 0;
/* check that key=0 is also found and zero initialized */
- assert(bpf_lookup_elem(map_fd, &key, values) == 0 &&
+ assert(bpf_lookup_elem(map_fd, &key, values, sizeof(values)) == 0 &&
values[0] == 0 && values[nr_cpus - 1] == 0);
@@ -321,7 +321,7 @@ static void test_percpu_arraymap_sanity(int i, void *data)
errno == E2BIG);
/* check that key = 2 doesn't exist */
- assert(bpf_lookup_elem(map_fd, &key, values) == -1 && errno == ENOENT);
+ assert(bpf_lookup_elem(map_fd, &key, values, sizeof(values)) == -1 && errno == ENOENT);
/* iterate over two elements */
assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
@@ -371,9 +371,9 @@ static void test_map_large(void)
assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
key.c = 0;
- assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == 0 && value == 0);
key.a = 1;
- assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == -1 && errno == ENOENT);
close(map_fd);
}
@@ -466,7 +466,7 @@ static void test_map_parallel(void)
/* another check for all elements */
for (i = 0; i < MAP_SIZE; i++) {
key = MAP_SIZE - i - 1;
- assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+ assert(bpf_lookup_elem(map_fd, &key, &value, sizeof(value)) == 0 &&
value == key);
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9e5fc16..719fa25 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -122,7 +122,10 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
- __u64 flags;
+ union {
+ __u64 flags;
+ __u32 size; /* number of bytes allocated in userspace */
+ };
};
struct { /* anonymous struct used by BPF_PROG_LOAD command */
--
2.5.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-16 16:41 [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user William Tu
@ 2016-10-19 0:50 ` Alexei Starovoitov
2016-10-19 5:31 ` William Tu
0 siblings, 1 reply; 10+ messages in thread
From: Alexei Starovoitov @ 2016-10-19 0:50 UTC (permalink / raw)
To: William Tu; +Cc: netdev
On Sun, Oct 16, 2016 at 09:41:28AM -0700, William Tu wrote:
> When running bpf_map_lookup on percpu elements, the bytes copied to
> userspace depends on num_possible_cpus() * value_size, which could
> potentially be larger than memory allocated from user, which depends
> on sysconf(_SC_NPROCESSORS_CONF) to get the current cpu num. As a
> result, the inconsistency might corrupt the user's stack.
>
> The fact that sysconf(_SC_NPROCESSORS_CONF) != num_possible_cpu()
> happens when cpu hotadd is enabled. For example, in Fusion when
> setting vcpu.hotadd = "TRUE" or in KVM, setting
> ./qemu-system-x86_64 -smp 2, maxcpus=4 ...
> the num_possible_cpu() will be 4 and sysconf() will be 2[1].
> Currently the any percpu map lookup suffers this issue, try
> samples/bpf/test_maps.c or tracex3.c.
>
> Th RFC patch adds additional 'size' param from userspace so that
> kernel knows the maximum memory it should copy to the user.
>
> [1] https://www.mail-archive.com/netdev@vger.kernel.org/msg121183.html
>
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
> include/uapi/linux/bpf.h | 5 ++++-
> kernel/bpf/syscall.c | 5 +++--
> samples/bpf/fds_example.c | 2 +-
> samples/bpf/libbpf.c | 3 ++-
> samples/bpf/libbpf.h | 2 +-
> samples/bpf/test_maps.c | 30 +++++++++++++++---------------
> tools/include/uapi/linux/bpf.h | 5 ++++-
> 7 files changed, 30 insertions(+), 22 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f09c70b..fa0c40b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -123,7 +123,10 @@ union bpf_attr {
> __aligned_u64 value;
> __aligned_u64 next_key;
> };
> - __u64 flags;
> + union {
> + __u64 flags;
> + __u32 size; /* number of bytes allocated in userspace */
> + };
...
> - if (copy_to_user(uvalue, value, value_size) != 0)
> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
> goto free_value;
I think such approach won't actually fix anything. User space
may lose some of the values and won't have any idea what was lost.
I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
and use /sys/devices/system/cpu/possible instead.
I would argue that glibc should be fixed as well since relying on
ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-19 0:50 ` Alexei Starovoitov
@ 2016-10-19 5:31 ` William Tu
2016-10-19 10:05 ` Daniel Borkmann
2016-10-19 15:15 ` Daniel Borkmann
0 siblings, 2 replies; 10+ messages in thread
From: William Tu @ 2016-10-19 5:31 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: Linux Kernel Network Developers
> ...
>> - if (copy_to_user(uvalue, value, value_size) != 0)
>> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>> goto free_value;
>
> I think such approach won't actually fix anything. User space
> may lose some of the values and won't have any idea what was lost.
> I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
> and use /sys/devices/system/cpu/possible instead.
> I would argue that glibc should be fixed as well since relying on
> ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
>
Thanks for the feedback. I think glibc is correct. The
_SC_NPROCESSORS_CONF presents the number of processors
configured/populated and is indeed "ls
/sys/devices/system/cpu/cpu[0-9]*|wc -l". This means the actual number
of CPUs installed on your system. On the other hand, the
num_possible_cpus() includes both the installed CPUs and the empty CPU
socket/slot, in order to support CPU hotplug.
As an example, one of my dual-socket motherboards with 1 CPU installed has
# /sys/devices/system/cpu/possible
0-239
# ls /sys/devices/system/cpu/cpu[0-9]*|wc -l
12
Note that these 12 cpus could be online/offline by
# echo 1/0 > /sys/devices/system/cpu/cpuX/online
Even if it is offline, the entry is still there.
Thinking about another solution, maybe we should use
"num_present_cpus()" which means the configured/populated CPUs and the
value is the same as sysconf(_SC_NPROCESSORS_CONF). Consider:
1) cpuX is online/offline: the num_present_cpus() remains the same.
2) new cpu is hotplug into the empty socket: the num_present_cpus()
gets updates, and also the sysconf(_SC_NPROCESSORS_CONF).
+++ b/kernel/bpf/syscall.c
@@ -297,7 +297,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_size = round_up(map->value_size, 8) * num_present_cpus();
else
value_size = map->value_size;
Thanks. Regards,
William
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-19 5:31 ` William Tu
@ 2016-10-19 10:05 ` Daniel Borkmann
2016-10-20 19:21 ` William Tu
2016-10-19 15:15 ` Daniel Borkmann
1 sibling, 1 reply; 10+ messages in thread
From: Daniel Borkmann @ 2016-10-19 10:05 UTC (permalink / raw)
To: William Tu; +Cc: Alexei Starovoitov, Linux Kernel Network Developers
On 10/19/2016 07:31 AM, William Tu wrote:
>> ...
>>> - if (copy_to_user(uvalue, value, value_size) != 0)
>>> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>>> goto free_value;
>>
>> I think such approach won't actually fix anything. User space
>> may lose some of the values and won't have any idea what was lost.
>> I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
>> and use /sys/devices/system/cpu/possible instead.
>> I would argue that glibc should be fixed as well since relying on
>> ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
>
> Thanks for the feedback. I think glibc is correct. The
> _SC_NPROCESSORS_CONF presents the number of processors
> configured/populated and is indeed "ls
> /sys/devices/system/cpu/cpu[0-9]*|wc -l". This means the actual number
> of CPUs installed on your system. On the other hand, the
In glibc __get_nprocs_conf() seems to try a number of things, first it
tries equivalent of /sys/devices/system/cpu/cpu[0-9]*|wc -l, if that fails,
depending on the config, it either tries to count cpus in /proc/cpuinfo,
or returns the _SC_NPROCESSORS_ONLN value instead. If /proc/cpuinfo has
some issue, it returns just 1 worst case. _SC_NPROCESSORS_ONLN will parse
/sys/devices/system/cpu/online, if that fails it looks into /proc/stat
for cpuX entries, and if also that fails for some reason, /proc/cpuinfo
is consulted (and returning 1 if unlikely all breaks down).
> num_possible_cpus() includes both the installed CPUs and the empty CPU
> socket/slot, in order to support CPU hotplug.
Correct.
> As a example, one of my dual socket motherboard with 1 CPU installed has
> # /sys/devices/system/cpu/possible
> 0-239
> # /sys/devices/system/cpu/cpu[0-9]*|wc -l
> 12
> Note that these 12 cpus could be online/offline by
> # echo 1/0 > /sys/devices/system/cpu/cpuX/online
> Even if it is offline, the entry is still there.
>
> Thinking about another solution, maybe we should use
> "num_present_cpus()" which means the configured/populated CPUs and the
> value is the same as sysconf(_SC_NPROCESSORS_CONF). Consider:
> 1) cpuX is online/offline: the num_present_cpus() remains the same.
> 2) new cpu is hotplug into the empty socket: the num_present_cpus()
> gets updates, and also the sysconf(_SC_NPROCESSORS_CONF).
>
> +++ b/kernel/bpf/syscall.c
> @@ -297,7 +297,7 @@ static int map_lookup_elem(union bpf_attr *attr)
>
> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> - value_size = round_up(map->value_size, 8) * num_possible_cpus();
> + value_size = round_up(map->value_size, 8) * num_present_cpus();
> else
> value_size = map->value_size;
But as you say in 2) that also has a chance of being racy on CPU hotplug
compared to num_possible_cpus() which is fixed at boot time.
Documentation/cputopology.txt +106 says /sys/devices/system/cpu/possible
outputs cpu_possible_mask. That is the same as in num_possible_cpus(), so
first step would be to fix the buggy example code, imho.
What perhaps could be done in a second step to reduce overhead is an option
for bpf(2) to pass in a cpu mask similarly as for sched_{get,set}affinity()
syscalls, where user space can construct a mask via CPU_SET(3). For the
syscall time, kernel would lock hot plugging via get_online_cpus() and
put_online_cpus(), it would check whether passed CPUs are online to query
and if so then it would copy the values into the user provided buffer. I'd
think this might be useful in a number of ways anyway.
Thanks,
Daniel
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-19 5:31 ` William Tu
2016-10-19 10:05 ` Daniel Borkmann
@ 2016-10-19 15:15 ` Daniel Borkmann
2016-10-20 16:04 ` Daniel Borkmann
1 sibling, 1 reply; 10+ messages in thread
From: Daniel Borkmann @ 2016-10-19 15:15 UTC (permalink / raw)
To: William Tu; +Cc: Alexei Starovoitov, Linux Kernel Network Developers
On 10/19/2016 07:31 AM, William Tu wrote:
>> ...
>>> - if (copy_to_user(uvalue, value, value_size) != 0)
>>> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>>> goto free_value;
>>
>> I think such approach won't actually fix anything. User space
>> may lose some of the values and won't have any idea what was lost.
>> I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
>> and use /sys/devices/system/cpu/possible instead.
>> I would argue that glibc should be fixed as well since relying on
>> ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
>
> Thanks for the feedback. I think glibc is correct. The
> _SC_NPROCESSORS_CONF presents the number of processors
> configured/populated and is indeed "ls
> /sys/devices/system/cpu/cpu[0-9]*|wc -l". This means the actual number
> of CPUs installed on your system. On the other hand, the
In glibc __get_nprocs_conf() seems to try a number of things, first it
tries equivalent of /sys/devices/system/cpu/cpu[0-9]*|wc -l, if that fails,
depending on the config, it either tries to count cpus in /proc/cpuinfo,
or returns the _SC_NPROCESSORS_ONLN value instead. If /proc/cpuinfo has
some issue, it returns just 1 worst case. _SC_NPROCESSORS_ONLN will parse
/sys/devices/system/cpu/online, if that fails it looks into /proc/stat
for cpuX entries, and if also that fails for some reason, /proc/cpuinfo
is consulted (and returning 1 if unlikely all breaks down).
> num_possible_cpus() includes both the installed CPUs and the empty CPU
> socket/slot, in order to support CPU hotplug.
Correct.
> As a example, one of my dual socket motherboard with 1 CPU installed has
> # /sys/devices/system/cpu/possible
> 0-239
> # /sys/devices/system/cpu/cpu[0-9]*|wc -l
> 12
> Note that these 12 cpus could be online/offline by
> # echo 1/0 > /sys/devices/system/cpu/cpuX/online
> Even if it is offline, the entry is still there.
>
> Thinking about another solution, maybe we should use
> "num_present_cpus()" which means the configured/populated CPUs and the
> value is the same as sysconf(_SC_NPROCESSORS_CONF). Consider:
> 1) cpuX is online/offline: the num_present_cpus() remains the same.
> 2) new cpu is hotplug into the empty socket: the num_present_cpus()
> gets updates, and also the sysconf(_SC_NPROCESSORS_CONF).
>
> +++ b/kernel/bpf/syscall.c
> @@ -297,7 +297,7 @@ static int map_lookup_elem(union bpf_attr *attr)
>
> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> - value_size = round_up(map->value_size, 8) * num_possible_cpus();
> + value_size = round_up(map->value_size, 8) * num_present_cpus();
> else
> value_size = map->value_size;
But as you say in 2) that also has a chance of being racy on CPU hotplug
compared to num_possible_cpus() which is fixed at boot time.
Documentation/cputopology.txt +106 says /sys/devices/system/cpu/possible
outputs cpu_possible_mask. That is the same as in num_possible_cpus(), so
first step would be to fix the buggy example code, imho.
What perhaps could be done in a second step to reduce overhead is an option
for bpf(2) to pass in a cpu mask similarly as for sched_{get,set}affinity()
syscalls, where user space can construct a mask via CPU_SET(3). For the
syscall time, kernel would lock hot plugging via get_online_cpus() and
put_online_cpus(), it would check whether passed CPUs are online to query
and if so then it would copy the values into the user provided buffer. I'd
think this might be useful in a number of ways anyway.
Thanks,
Daniel
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-19 15:15 ` Daniel Borkmann
@ 2016-10-20 16:04 ` Daniel Borkmann
2016-10-20 16:58 ` Alexei Starovoitov
0 siblings, 1 reply; 10+ messages in thread
From: Daniel Borkmann @ 2016-10-20 16:04 UTC (permalink / raw)
To: William Tu; +Cc: Alexei Starovoitov, Linux Kernel Network Developers
On 10/19/2016 05:15 PM, Daniel Borkmann wrote:
> On 10/19/2016 07:31 AM, William Tu wrote:
>>> ...
>>>> - if (copy_to_user(uvalue, value, value_size) != 0)
>>>> + if (copy_to_user(uvalue, value, min_t(u32, usize, value_size)) != 0)
>>>> goto free_value;
>>>
>>> I think such approach won't actually fix anything. User space
>>> may lose some of the values and won't have any idea what was lost.
>>> I think we need to fix sample code to avoid using sysconf(_SC_NPROCESSORS_CONF)
>>> and use /sys/devices/system/cpu/possible instead.
>>> I would argue that glibc should be fixed as well since relying on
>>> ls -d /sys/devices/system/cpu/cpu[0-9]*|wc -l turned out to be incorrect.
>>
>> Thanks for the feedback. I think glibc is correct. The
>> _SC_NPROCESSORS_CONF presents the number of processors
>> configured/populated and is indeed "ls
>> /sys/devices/system/cpu/cpu[0-9]*|wc -l". This means the actual number
>> of CPUs installed on your system. On the other hand, the
>
> In glibc __get_nprocs_conf() seems to try a number of things, first it
> tries equivalent of /sys/devices/system/cpu/cpu[0-9]*|wc -l, if that fails,
> depending on the config, it either tries to count cpus in /proc/cpuinfo,
> or returns the _SC_NPROCESSORS_ONLN value instead. If /proc/cpuinfo has
> some issue, it returns just 1 worst case. _SC_NPROCESSORS_ONLN will parse
> /sys/devices/system/cpu/online, if that fails it looks into /proc/stat
> for cpuX entries, and if also that fails for some reason, /proc/cpuinfo
> is consulted (and returning 1 if unlikely all breaks down).
>
>> num_possible_cpus() includes both the installed CPUs and the empty CPU
>> socket/slot, in order to support CPU hotplug.
>
> Correct.
>
>> As a example, one of my dual socket motherboard with 1 CPU installed has
>> # /sys/devices/system/cpu/possible
>> 0-239
>> # /sys/devices/system/cpu/cpu[0-9]*|wc -l
>> 12
>> Note that these 12 cpus could be online/offline by
>> # echo 1/0 > /sys/devices/system/cpu/cpuX/online
>> Even if it is offline, the entry is still there.
>>
>> Thinking about another solution, maybe we should use
>> "num_present_cpus()" which means the configured/populated CPUs and the
>> value is the same as sysconf(_SC_NPROCESSORS_CONF). Consider:
>> 1) cpuX is online/offline: the num_present_cpus() remains the same.
>> 2) new cpu is hotplug into the empty socket: the num_present_cpus()
>> gets updates, and also the sysconf(_SC_NPROCESSORS_CONF).
>>
>> +++ b/kernel/bpf/syscall.c
>> @@ -297,7 +297,7 @@ static int map_lookup_elem(union bpf_attr *attr)
>>
>> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
>> map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
>> - value_size = round_up(map->value_size, 8) * num_possible_cpus();
>> + value_size = round_up(map->value_size, 8) * num_present_cpus();
>> else
>> value_size = map->value_size;
>
> But as you say in 2) that also has a chance of being racy on CPU hotplug
> compared to num_possible_cpus() which is fixed at boot time.
>
> Documentation/cputopology.txt +106 says /sys/devices/system/cpu/possible
> outputs cpu_possible_mask. That is the same as in num_possible_cpus(), so
> first step would be to fix the buggy example code, imho.
>
> What perhaps could be done in a second step to reduce overhead is an option
> for bpf(2) to pass in a cpu mask similarly as for sched_{get,set}affinity()
> syscalls, where user space can construct a mask via CPU_SET(3). For the
> syscall time, kernel would lock hot plugging via get_online_cpus() and
> put_online_cpus(), it would check whether passed CPUs are online to query
> and if so then it would copy the values into the user provided buffer. I'd
> think this might be useful in a number of ways anyway.
Maybe something like this as mentioned first step to fix the examples. Does
this work for you, William?
tools/testing/selftests/bpf/test_maps.c | 33 ++++++++++++++++++++++++++++++---
1 file changed, 30 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index ee384f0..d4832e8 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -25,6 +25,33 @@
static int map_flags;
+static unsigned int num_possible_cpus(void)
+{
+ static const char *fcpu = "/sys/devices/system/cpu/possible";
+ unsigned int val, possible_cpus = 0;
+ char buff[128];
+ FILE *fp;
+
+ fp = fopen(fcpu, "r");
+ if (!fp) {
+ printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno));
+ exit(1);
+ }
+
+ while (fgets(buff, sizeof(buff), fp)) {
+ if (sscanf(buff, "%*u-%u", &val) == 1)
+ possible_cpus = val;
+ }
+
+ fclose(fp);
+ if (!possible_cpus) {
+ printf("Failed to retrieve # possible CPUs!\n");
+ exit(1);
+ }
+
+ return possible_cpus;
+}
+
static void test_hashmap(int task, void *data)
{
long long key, next_key, value;
@@ -110,7 +137,7 @@ static void test_hashmap(int task, void *data)
static void test_hashmap_percpu(int task, void *data)
{
- unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ unsigned int nr_cpus = num_possible_cpus();
long long value[nr_cpus];
long long key, next_key;
int expected_key_mask = 0;
@@ -258,7 +285,7 @@ static void test_arraymap(int task, void *data)
static void test_arraymap_percpu(int task, void *data)
{
- unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ unsigned int nr_cpus = num_possible_cpus();
int key, next_key, fd, i;
long values[nr_cpus];
@@ -313,7 +340,7 @@ static void test_arraymap_percpu(int task, void *data)
static void test_arraymap_percpu_many_keys(void)
{
- unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ unsigned int nr_cpus = num_possible_cpus();
unsigned int nr_keys = 20000;
long values[nr_cpus];
int key, fd, i;
--
1.9.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-20 16:04 ` Daniel Borkmann
@ 2016-10-20 16:58 ` Alexei Starovoitov
2016-10-20 18:41 ` William Tu
0 siblings, 1 reply; 10+ messages in thread
From: Alexei Starovoitov @ 2016-10-20 16:58 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: William Tu, Linux Kernel Network Developers
On Thu, Oct 20, 2016 at 06:04:38PM +0200, Daniel Borkmann wrote:
>
> diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
> index ee384f0..d4832e8 100644
> --- a/tools/testing/selftests/bpf/test_maps.c
> +++ b/tools/testing/selftests/bpf/test_maps.c
> @@ -25,6 +25,33 @@
>
> static int map_flags;
>
> +static unsigned int num_possible_cpus(void)
> +{
> + static const char *fcpu = "/sys/devices/system/cpu/possible";
> + unsigned int val, possible_cpus = 0;
> + char buff[128];
> + FILE *fp;
> +
> + fp = fopen(fcpu, "r");
> + if (!fp) {
> + printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno));
> + exit(1);
> + }
> +
> + while (fgets(buff, sizeof(buff), fp)) {
> + if (sscanf(buff, "%*u-%u", &val) == 1)
> + possible_cpus = val;
> + }
looks great to me.
Could you move it into bpf_sys.h or somehow make it common in libbpf
and reuse it in samples/bpf/ ?
Since quite a few samples need this fix as well.
Thanks!
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-20 16:58 ` Alexei Starovoitov
@ 2016-10-20 18:41 ` William Tu
2016-10-20 19:39 ` Daniel Borkmann
0 siblings, 1 reply; 10+ messages in thread
From: William Tu @ 2016-10-20 18:41 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: Daniel Borkmann, Linux Kernel Network Developers
On Thu, Oct 20, 2016 at 9:58 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Oct 20, 2016 at 06:04:38PM +0200, Daniel Borkmann wrote:
>>
>> diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
>> index ee384f0..d4832e8 100644
>> --- a/tools/testing/selftests/bpf/test_maps.c
>> +++ b/tools/testing/selftests/bpf/test_maps.c
>> @@ -25,6 +25,33 @@
>>
>> static int map_flags;
>>
>> +static unsigned int num_possible_cpus(void)
>> +{
>> + static const char *fcpu = "/sys/devices/system/cpu/possible";
>> + unsigned int val, possible_cpus = 0;
>> + char buff[128];
>> + FILE *fp;
>> +
>> + fp = fopen(fcpu, "r");
>> + if (!fp) {
>> + printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno));
>> + exit(1);
>> + }
>> +
>> + while (fgets(buff, sizeof(buff), fp)) {
>> + if (sscanf(buff, "%*u-%u", &val) == 1)
>> + possible_cpus = val;
>> + }
>
> looks great to me.
> Could you move it into bpf_sys.h or somehow make it common in libbpf
> and reuse it in samples/bpf/ ?
> Since quite a few samples need this fix as well.
> Thanks!
>
Looks good to me. I tested it and it works fine.
Thanks!
William
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-19 10:05 ` Daniel Borkmann
@ 2016-10-20 19:21 ` William Tu
0 siblings, 0 replies; 10+ messages in thread
From: William Tu @ 2016-10-20 19:21 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: Alexei Starovoitov, Linux Kernel Network Developers
> Documentation/cputopology.txt +106 says /sys/devices/system/cpu/possible
> outputs cpu_possible_mask. That is the same as in num_possible_cpus(), so
> first step would be to fix the buggy example code, imho.
>
> What perhaps could be done in a second step to reduce overhead is an option
> for bpf(2) to pass in a cpu mask similarly as for sched_{get,set}affinity()
> syscalls, where user space can construct a mask via CPU_SET(3). For the
> syscall time, kernel would lock hot plugging via get_online_cpus() and
> put_online_cpus(), it would check whether passed CPUs are online to query
> and if so then it would copy the values into the user provided buffer. I'd
> think this might be useful in a number of ways anyway.
>
I like this idea. So in this case, the only the data at the cpu
specified by user in the CPU_SET is copied to userspace, potentially
have better performance than always copying the data *
num_possible_cpus() bytes.
Regards,
William
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user.
2016-10-20 18:41 ` William Tu
@ 2016-10-20 19:39 ` Daniel Borkmann
0 siblings, 0 replies; 10+ messages in thread
From: Daniel Borkmann @ 2016-10-20 19:39 UTC (permalink / raw)
To: William Tu, Alexei Starovoitov; +Cc: Linux Kernel Network Developers
On 10/20/2016 08:41 PM, William Tu wrote:
> On Thu, Oct 20, 2016 at 9:58 AM, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>> On Thu, Oct 20, 2016 at 06:04:38PM +0200, Daniel Borkmann wrote:
>>>
>>> diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
>>> index ee384f0..d4832e8 100644
>>> --- a/tools/testing/selftests/bpf/test_maps.c
>>> +++ b/tools/testing/selftests/bpf/test_maps.c
>>> @@ -25,6 +25,33 @@
>>>
>>> static int map_flags;
>>>
>>> +static unsigned int num_possible_cpus(void)
>>> +{
>>> + static const char *fcpu = "/sys/devices/system/cpu/possible";
>>> + unsigned int val, possible_cpus = 0;
>>> + char buff[128];
>>> + FILE *fp;
>>> +
>>> + fp = fopen(fcpu, "r");
>>> + if (!fp) {
>>> + printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno));
>>> + exit(1);
>>> + }
>>> +
>>> + while (fgets(buff, sizeof(buff), fp)) {
>>> + if (sscanf(buff, "%*u-%u", &val) == 1)
>>> + possible_cpus = val;
>>> + }
>>
>> looks great to me.
>> Could you move it into bpf_sys.h or somehow make it common in libbpf
>> and reuse it in samples/bpf/ ?
>> Since quite a few samples need this fix as well.
Ahh, true.
>> Thanks!
>
> Looks good to me. I tested it and it works fine.
Okay, thanks. I'll fix that up, mid-term we should try and move most of
that over to kernel selftests/bpf, and reuse tools/lib/bpf/.
> Thanks!
> William
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2016-10-20 19:39 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-16 16:41 [RFC PATCH net-next] bpf: fix potential percpu map overcopy to user William Tu
2016-10-19 0:50 ` Alexei Starovoitov
2016-10-19 5:31 ` William Tu
2016-10-19 10:05 ` Daniel Borkmann
2016-10-20 19:21 ` William Tu
2016-10-19 15:15 ` Daniel Borkmann
2016-10-20 16:04 ` Daniel Borkmann
2016-10-20 16:58 ` Alexei Starovoitov
2016-10-20 18:41 ` William Tu
2016-10-20 19:39 ` Daniel Borkmann
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.