Re: [PATCH v6 bpf-next 1/5] bpf: Add bloom filter map implementation

From: Martin KaFai Lau <kafai@fb.com>
To: Joanne Koong <joannekoong@fb.com>
Cc: <bpf@vger.kernel.org>, <andrii@kernel.org>, <ast@kernel.org>,
	<daniel@iogearbox.net>, <Kernel-team@fb.com>
Subject: Re: [PATCH v6 bpf-next 1/5] bpf: Add bloom filter map implementation
Date: Thu, 28 Oct 2021 14:14:24 -0700	[thread overview]
Message-ID: <20211028211424.m5y4kafaulvgke54@kafai-mbp.dhcp.thefacebook.com> (raw)
In-Reply-To: <20211027234504.30744-2-joannekoong@fb.com>

On Wed, Oct 27, 2021 at 04:45:00PM -0700, Joanne Koong wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 31421c74ba08..50105e0b8fcc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -169,6 +169,7 @@ struct bpf_map {
The earlier context is copied here:

	struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
        void *security;
#endif

>  	u32 value_size;
>  	u32 max_entries;
>  	u32 map_flags;
> +	u64 map_extra; /* any per-map-type extra fields */
There is a 4 byte hole before the new 'u64 map_extra'.  Try to move
it before map_flags

Later in this struct. This existing comment needs to be updated also:
	/* 22 bytes hole */

>  	int spin_lock_off; /* >=0 valid offset, <0 error */
>  	int timer_off; /* >=0 valid offset, <0 error */
>  	u32 id;
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 9c81724e4b98..c4424ac2fa02 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
>  #endif
>  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
>  
>  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
>  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c10820037883..8bead4aa3ad0 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -906,6 +906,7 @@ enum bpf_map_type {
>  	BPF_MAP_TYPE_RINGBUF,
>  	BPF_MAP_TYPE_INODE_STORAGE,
>  	BPF_MAP_TYPE_TASK_STORAGE,
> +	BPF_MAP_TYPE_BLOOM_FILTER,
>  };
>  
>  /* Note that tracing related programs such as
> @@ -1274,6 +1275,13 @@ union bpf_attr {
>  						   * struct stored as the
>  						   * map value
>  						   */
> +		/* Any per-map-type extra fields
> +		 *
> +		 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
> +		 * number of hash functions (if 0, the bloom filter will default
> +		 * to using 5 hash functions).
> +		 */
> +		__u64	map_extra;
>  	};
>  
>  	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> @@ -5638,6 +5646,7 @@ struct bpf_map_info {
>  	__u32 btf_id;
>  	__u32 btf_key_type_id;
>  	__u32 btf_value_type_id;
There is also a 4 byte hole here.  A "__u32 :32" is needed.
You can find details in 36f9814a494a ("bpf: fix uapi hole for 32 bit compat applications")

> +	__u64 map_extra;
>  } __attribute__((aligned(8)));

[ ... ]

> +static int peek_elem(struct bpf_map *map, void *value)
These generic map-ops names could be confusing in tracing and
in perf-report.  There was a 'bloom_filter_map_' prefix in the earlier version.
I could have missed something in the earlier discussion threads.
What was the reason in dropping the prefix?

> +{
> +	struct bpf_bloom_filter *bloom =
> +		container_of(map, struct bpf_bloom_filter, map);
> +	u32 i, h;
> +
> +	for (i = 0; i < bloom->nr_hash_funcs; i++) {
> +		h = hash(bloom, value, map->value_size, i);
> +		if (!test_bit(h, bloom->bitset))
> +			return -ENOENT;
> +	}
> +
> +	return 0;
> +}
> +
> +static int push_elem(struct bpf_map *map, void *value, u64 flags)
> +{
> +	struct bpf_bloom_filter *bloom =
> +		container_of(map, struct bpf_bloom_filter, map);
> +	u32 i, h;
> +
> +	if (flags != BPF_ANY)
> +		return -EINVAL;
> +
> +	for (i = 0; i < bloom->nr_hash_funcs; i++) {
> +		h = hash(bloom, value, map->value_size, i);
> +		set_bit(h, bloom->bitset);
> +	}
> +
> +	return 0;
> +}
> +
> +static int pop_elem(struct bpf_map *map, void *value)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static struct bpf_map *map_alloc(union bpf_attr *attr)
> +{
> +	u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
> +	int numa_node = bpf_map_attr_numa_node(attr);
> +	struct bpf_bloom_filter *bloom;
> +
> +	if (!bpf_capable())
> +		return ERR_PTR(-EPERM);
> +
> +	if (attr->key_size != 0 || attr->value_size == 0 ||
> +	    attr->max_entries == 0 ||
> +	    attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
> +	    !bpf_map_flags_access_ok(attr->map_flags) ||
> +	    (attr->map_extra & ~0xF))
> +		return ERR_PTR(-EINVAL);
> +
> +	/* The lower 4 bits of map_extra specify the number of hash functions */
> +	nr_hash_funcs = attr->map_extra & 0xF;
nit. "& 0xF" is unnecessary.  It has just been tested immediately above.

> +	if (nr_hash_funcs == 0)
> +		/* Default to using 5 hash functions if unspecified */
> +		nr_hash_funcs = 5;
> +
> +	/* For the bloom filter, the optimal bit array size that minimizes the
> +	 * false positive probability is n * k / ln(2) where n is the number of
> +	 * expected entries in the bloom filter and k is the number of hash
> +	 * functions. We use 7 / 5 to approximate 1 / ln(2).
> +	 *
> +	 * We round this up to the nearest power of two to enable more efficient
> +	 * hashing using bitmasks. The bitmask will be the bit array size - 1.
> +	 *
> +	 * If this overflows a u32, the bit array size will have 2^32 (4
> +	 * GB) bits.
> +	 */
> +	if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
> +	    check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
> +	    nr_bits > (1UL << 31)) {
> +		/* The bit array size is 2^32 bits but to avoid overflowing the
> +		 * u32, we use U32_MAX, which will round up to the equivalent
> +		 * number of bytes
> +		 */
> +		bitset_bytes = BITS_TO_BYTES(U32_MAX);
> +		bitset_mask = U32_MAX;
> +	} else {
> +		if (nr_bits <= BITS_PER_LONG)
> +			nr_bits = BITS_PER_LONG;
> +		else
> +			nr_bits = roundup_pow_of_two(nr_bits);
> +		bitset_bytes = BITS_TO_BYTES(nr_bits);
> +		bitset_mask = nr_bits - 1;
> +	}
> +
> +	bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
> +	bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
> +
> +	if (!bloom)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bpf_map_init_from_attr(&bloom->map, attr);
> +
> +	bloom->nr_hash_funcs = nr_hash_funcs;
> +	bloom->bitset_mask = bitset_mask;
> +
> +	/* Check whether the value size is u32-aligned */
> +	if ((attr->value_size & (sizeof(u32) - 1)) == 0)
> +		bloom->aligned_u32_count =
> +			attr->value_size / sizeof(u32);
> +
> +	if (!(attr->map_flags & BPF_F_ZERO_SEED))
> +		bloom->hash_seed = get_random_int();
> +
> +	return &bloom->map;
> +}