Re: [PATCH v4 bpf-next 5/9] bpf: add batch ops to all htab bpf map

From: Yonghong Song <yhs@fb.com>
To: Brian Vazquez <brianvv@google.com>
Cc: Brian Vazquez <brianvv.kernel@gmail.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"David S . Miller" <davem@davemloft.net>,
	Andrii Nakryiko <andrii.nakryiko@gmail.com>,
	"Stanislav Fomichev" <sdf@google.com>,
	Petar Penkov <ppenkov@google.com>,
	"Willem de Bruijn" <willemb@google.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"bpf@vger.kernel.org" <bpf@vger.kernel.org>
Subject: Re: [PATCH v4 bpf-next 5/9] bpf: add batch ops to all htab bpf map
Date: Wed, 15 Jan 2020 01:03:34 +0000	[thread overview]
Message-ID: <98e5e51d-25da-ee4b-8a21-d4cfb5909c08@fb.com> (raw)
In-Reply-To: <CAMzD94SDk=y5=hpcYOUYnKnqpQbFCyNu-FiXatJPisancvr7gw@mail.gmail.com>

On 1/14/20 3:49 PM, Brian Vazquez wrote:
> Hi Yonghong, thanks for reviewing it!
> 
> On Tue, Jan 14, 2020 at 2:56 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 1/14/20 8:46 AM, Brian Vazquez wrote:
>>> From: Yonghong Song <yhs@fb.com>
>>>
>>> htab can't use generic batch support due some problematic behaviours
>>> inherent to the data structre, i.e. while iterating the bpf map  a
>>> concurrent program might delete the next entry that batch was about to
>>> use, in that case there's no easy solution to retrieve the next entry,
>>> the issue has been discussed multiple times (see [1] and [2]).
>>>
>>> The only way hmap can be traversed without the problem previously
>>> exposed is by making sure that the map is traversing entire buckets.
>>> This commit implements those strict requirements for hmap, the
>>> implementation follows the same interaction that generic support with
>>> some exceptions:
>>>
>>>    - If keys/values buffer are not big enough to traverse a bucket,
>>>      ENOSPC will be returned.
>>>    - out_batch contains the value of the next bucket in the iteration, not
>>>      the next key, but this is transparent for the user since the user
>>>      should never use out_batch for other than bpf batch syscalls.
>>>
>>> This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new
>>> command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete
>>> batch ops it is possible to use the generic implementations.
>>>
>>> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/
>>> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/
>>>
>>> Signed-off-by: Yonghong Song <yhs@fb.com>
>>> Signed-off-by: Brian Vazquez <brianvv@google.com>
>>> ---
>>>    include/linux/bpf.h      |   3 +
>>>    include/uapi/linux/bpf.h |   1 +
>>>    kernel/bpf/hashtab.c     | 258 +++++++++++++++++++++++++++++++++++++++
>>>    kernel/bpf/syscall.c     |   9 +-
>>>    4 files changed, 270 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 05466ad6cf1c5..3517e32149a4f 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -46,6 +46,9 @@ struct bpf_map_ops {
>>>        void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
>>>        int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
>>>                                union bpf_attr __user *uattr);
>>> +     int (*map_lookup_and_delete_batch)(struct bpf_map *map,
>>> +                                        const union bpf_attr *attr,
>>> +                                        union bpf_attr __user *uattr);
>>>        int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
>>>                                union bpf_attr __user *uattr);
>>>        int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index e8df9ca680e0c..9536729a03d57 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -108,6 +108,7 @@ enum bpf_cmd {
>>>        BPF_MAP_FREEZE,
>>>        BPF_BTF_GET_NEXT_ID,
>>>        BPF_MAP_LOOKUP_BATCH,
>>> +     BPF_MAP_LOOKUP_AND_DELETE_BATCH,
>>>        BPF_MAP_UPDATE_BATCH,
>>>        BPF_MAP_DELETE_BATCH,
>>>    };
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 22066a62c8c97..d9888acfd632b 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -17,6 +17,16 @@
>>>        (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
>>>         BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
>>>
>>> +#define BATCH_OPS(_name)                     \
>>> +     .map_lookup_batch =                     \
>>> +     _name##_map_lookup_batch,               \
>>> +     .map_lookup_and_delete_batch =          \
>>> +     _name##_map_lookup_and_delete_batch,    \
>>> +     .map_update_batch =                     \
>>> +     generic_map_update_batch,               \
>>> +     .map_delete_batch =                     \
>>> +     generic_map_delete_batch
>>> +
>>>    struct bucket {
>>>        struct hlist_nulls_head head;
>>>        raw_spinlock_t lock;
>>> @@ -1232,6 +1242,250 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
>>>        rcu_read_unlock();
>>>    }
>>>
>>> +static int
>>> +__htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>> +                                const union bpf_attr *attr,
>>> +                                union bpf_attr __user *uattr,
>>> +                                bool do_delete, bool is_lru_map,
>>> +                                bool is_percpu)
>>> +{
>>> +     struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>>> +     u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
>>> +     void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
>>> +     void __user *uvalues = u64_to_user_ptr(attr->batch.values);
>>> +     void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
>>> +     void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
>>> +     u32 batch, max_count, size, bucket_size;
>>> +     u64 elem_map_flags, map_flags;
>>> +     struct hlist_nulls_head *head;
>>> +     struct hlist_nulls_node *n;
>>> +     unsigned long flags;
>>> +     struct htab_elem *l;
>>> +     struct bucket *b;
>>> +     int ret = 0;
>>> +
>>> +     elem_map_flags = attr->batch.elem_flags;
>>> +     if ((elem_map_flags & ~BPF_F_LOCK) ||
>>> +         ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
>>> +             return -EINVAL;
>>> +
>>> +     map_flags = attr->batch.flags;
>>> +     if (map_flags)
>>> +             return -EINVAL;
>>> +
>>> +     max_count = attr->batch.count;
>>> +     if (!max_count)
>>> +             return 0;
>>> +
>>> +     batch = 0;
>>> +     if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
>>> +             return -EFAULT;
>>> +
>>> +     if (batch >= htab->n_buckets)
>>> +             return -ENOENT;
>>> +
>>> +     key_size = htab->map.key_size;
>>> +     roundup_key_size = round_up(htab->map.key_size, 8);
>>> +     value_size = htab->map.value_size;
>>> +     size = round_up(value_size, 8);
>>> +     if (is_percpu)
>>> +             value_size = size * num_possible_cpus();
>>> +     total = 0;
>>> +     bucket_size = 1;
>>
>> Have you checked typical hash table linklist length?
> While testing with hash tables ranging from 10 to 1000 entries I saw
> linked lists of upto 5 entries.
>> Maybe initial value bucket_size = 2 is able to cover most common cases?
> I think 4-5 is still a reasonable number, what do you think?

5 should be okay. You can add some comments to explain why "5".

>>
>>> +
>>> +alloc:
>>> +     /* We cannot do copy_from_user or copy_to_user inside
>>> +      * the rcu_read_lock. Allocate enough space here.
>>> +      */
>>> +     keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
>>> +     values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
>>> +     if (!keys || !values) {
>>> +             ret = -ENOMEM;
>>> +             goto out;
>>
>> In this case, we won't copy batch and total to user buffer. Maybe we
>> should do that?
> Yes, I think last line should be: goto after_loop;
>>
>>
>>> +     }
>>> +
>>> +again:
>>> +     preempt_disable();
>>> +     this_cpu_inc(bpf_prog_active);
>>> +     rcu_read_lock();
>>> +again_nocopy:
>>> +     dst_key = keys;
>>> +     dst_val = values;
>>> +     b = &htab->buckets[batch];
>>> +     head = &b->head;
>>> +     raw_spin_lock_irqsave(&b->lock, flags);
>>> +
>>> +     bucket_cnt = 0;
>>> +     hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
>>> +             bucket_cnt++;
>>> +
>>> +     if (bucket_cnt > (max_count - total)) {
>>> +             if (total == 0)
>>> +                     ret = -ENOSPC;
>>> +             raw_spin_unlock_irqrestore(&b->lock, flags);
>>> +             rcu_read_unlock();
>>> +             this_cpu_dec(bpf_prog_active);
>>> +             preempt_enable();
>>> +             goto after_loop;
>>> +     }
>>> +
>>> +     if (bucket_cnt > bucket_size) {
>>> +             bucket_size = bucket_cnt;
>>> +             raw_spin_unlock_irqrestore(&b->lock, flags);
>>> +             rcu_read_unlock();
>>> +             this_cpu_dec(bpf_prog_active);
>>> +             preempt_enable();
>>> +             kvfree(keys);
>>> +             kvfree(values);
>>> +             goto alloc;
>>> +     }
>>> +
>>> +     hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
>>> +             memcpy(dst_key, l->key, key_size);
>>> +
>>> +             if (is_percpu) {
>>> +                     int off = 0, cpu;
>>> +                     void __percpu *pptr;
>>> +
>>> +                     pptr = htab_elem_get_ptr(l, map->key_size);
>>> +                     for_each_possible_cpu(cpu) {
>>> +                             bpf_long_memcpy(dst_val + off,
>>> +                                             per_cpu_ptr(pptr, cpu), size);
>>> +                             off += size;
>>> +                     }
>>> +             } else {
>>> +                     value = l->key + roundup_key_size;
>>> +                     if (elem_map_flags & BPF_F_LOCK)
>>> +                             copy_map_value_locked(map, dst_val, value,
>>> +                                                   true);
>>> +                     else
>>> +                             copy_map_value(map, dst_val, value);
>>> +                     check_and_init_map_lock(map, dst_val);
>>> +             }
>>> +             if (do_delete) {
>>> +                     hlist_nulls_del_rcu(&l->hash_node);
>>> +                     if (is_lru_map)
>>> +                             bpf_lru_push_free(&htab->lru, &l->lru_node);
>>> +                     else
>>> +                             free_htab_elem(htab, l);
>>> +             }
>>> +             dst_key += key_size;
>>> +             dst_val += value_size;
>>> +     }
>>> +
>>> +     raw_spin_unlock_irqrestore(&b->lock, flags);
>>> +     /* If we are not copying data, we can go to next bucket and avoid
>>> +      * unlocking the rcu.
>>> +      */
>>> +     if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
>>> +             batch++;
>>> +             goto again_nocopy;
>>> +     }
>>> +
>>> +     rcu_read_unlock();
>>> +     this_cpu_dec(bpf_prog_active);
>>> +     preempt_enable();
>>> +     if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
>>> +         key_size * bucket_cnt) ||
>>> +         copy_to_user(uvalues + total * value_size, values,
>>> +         value_size * bucket_cnt))) {
>>> +             ret = -EFAULT;
>>> +             goto after_loop;
>>> +     }
>>> +
>>> +     total += bucket_cnt;
>>> +     batch++;
>>> +     if (batch >= htab->n_buckets) {
>>> +             ret = -ENOENT;
>>> +             goto after_loop;
>>> +     }
>>> +     goto again;
>>> +
>>> +after_loop:
>>> +     if (ret && (ret != -ENOENT && ret != -EFAULT))
>>> +             goto out;
>>
>> We won't have many error codes reaching here, -ENOENT, -EFAULT, -ENOSPC,
>> and possibly -ENOMEM.
>> Maybe just
>>          if (ret == -EFAULT)
>>                  goto out;
>>
> Yes I think that make senses, I only need to add
> 
> if (put_user(0, &uattr->batch.count))
>          return -EFAULT;
> 
> before traversing the map to make sure that if there is an error,
> batch.count doesn't miss report entries since that variable is used as
> input/output. Does this make sense?

This does make sense. You can put the above checking right before
the "alloc" label. Everything after that will go through copying
"count".

> 
>>> +
>>> +     /* copy # of entries and next batch */
>>> +     ubatch = u64_to_user_ptr(attr->batch.out_batch);
>>> +     if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
>>> +         put_user(total, &uattr->batch.count))
>>> +             ret = -EFAULT;
>>> +
>>> +out:
>>> +     kvfree(keys);
>>> +     kvfree(values);
>>> +     return ret;
>>> +}
>>> +
>> [...]