Re: [PATCH nft 2/3] src: Add support for concatenated set ranges

From: Stefano Brivio <sbrivio@redhat.com>
To: Phil Sutter <phil@nwl.cc>
Cc: "Pablo Neira Ayuso" <pablo@netfilter.org>,
	netfilter-devel@vger.kernel.org,
	"Florian Westphal" <fw@strlen.de>,
	"Kadlecsik József" <kadlec@blackhole.kfki.hu>,
	"Eric Garver" <eric@garver.life>
Subject: Re: [PATCH nft 2/3] src: Add support for concatenated set ranges
Date: Wed, 20 Nov 2019 12:49:54 +0100	[thread overview]
Message-ID: <20191120124954.0740a1a5@redhat.com> (raw)
In-Reply-To: <20191119221238.GF8016@orbyte.nwl.cc>

Hi,

On Tue, 19 Nov 2019 23:12:38 +0100
Phil Sutter <phil@nwl.cc> wrote:

> Hi,
> 
> On Tue, Nov 19, 2019 at 02:07:11AM +0100, Stefano Brivio wrote:
> [...]
> > diff --git a/src/netlink.c b/src/netlink.c
> > index 7306e358..b8bfd199 100644
> > --- a/src/netlink.c
> > +++ b/src/netlink.c  
> [...]
> > @@ -199,10 +210,39 @@ static void netlink_gen_concat_data(const struct expr *expr,
> >  		memset(data, 0, sizeof(data));
> >  		offset = 0;
> >  		list_for_each_entry(i, &expr->expressions, list) {
> > -			assert(i->etype == EXPR_VALUE);
> > -			mpz_export_data(data + offset, i->value, i->byteorder,
> > -					div_round_up(i->len, BITS_PER_BYTE));
> > -			offset += netlink_padded_len(i->len) / BITS_PER_BYTE;
> > +			if (i->etype == EXPR_RANGE) {
> > +				const struct expr *e;
> > +
> > +				if (is_range_end)
> > +					e = i->right;
> > +				else
> > +					e = i->left;
> > +
> > +				offset += netlink_export_pad(data + offset,
> > +							     e->value, e);
> > +			} else if (i->etype == EXPR_PREFIX) {
> > +				if (is_range_end) {
> > +					mpz_t v;
> > +
> > +					mpz_init_bitmask(v, i->len -
> > +							    i->prefix_len);
> > +					mpz_add(v, i->prefix->value, v);
> > +					offset += netlink_export_pad(data +
> > +								     offset,
> > +								     v, i);  
> 
> Given the right-alignment, maybe introduce __netlink_gen_concat_data()
> to contain the loop body?

While at it, I would also drop the if (1) that makes this function not
so pretty. It was introduced by 53fc2c7a7998 ("netlink: move data
related functions to netlink.c") where data was turned from a allocated
buffer to VLA, but I don't see a reason why we can't do:

	unsigned int len = expr->len / BITS_PER_BYTE, offset = 0;
	unsigned char data[len];

So, the whole thing would look like (together with the other change
you suggest):

--
static int netlink_gen_concat_data_expr(const struct expr *i,
					unsigned char *data)
{
	if (i->etype == EXPR_RANGE) {
		const struct expr *e;

		if (i->flags & EXPR_F_INTERVAL_END)
			e = i->right;
		else
			e = i->left;

		return netlink_export_pad(data, e->value, e);
	}

	if (i->etype == EXPR_PREFIX) {
		if (i->flags & EXPR_F_INTERVAL_END) {
			int count;
			mpz_t v;

			mpz_init_bitmask(v, i->len - i->prefix_len);
			mpz_add(v, i->prefix->value, v);
			count = netlink_export_pad(data, v, i);
			mpz_clear(v);
			return count;
		}

		return netlink_export_pad(data, i->prefix->value, i);
	}

	assert(i->etype == EXPR_VALUE);

	return netlink_export_pad(data, i->value, i);
}

static void netlink_gen_concat_data(const struct expr *expr,
				    struct nft_data_linearize *nld)
{
	unsigned int len = expr->len / BITS_PER_BYTE, offset = 0;
	unsigned char data[len];
	const struct expr *i;

	memset(data, 0, len);

	list_for_each_entry(i, &expr->expressions, list)
		offset += netlink_gen_concat_data_expr(i, data + offset);

	memcpy(nld->value, data, len);
	nld->len = len;
}
--

Is that better?

> 
> > +					mpz_clear(v);
> > +					continue;
> > +				}
> > +
> > +				offset += netlink_export_pad(data + offset,
> > +							     i->prefix->value,
> > +							     i);
> > +			} else {
> > +				assert(i->etype == EXPR_VALUE);
> > +
> > +				offset += netlink_export_pad(data + offset,
> > +							     i->value, i);
> > +			}
> >  		}
> >  
> >  		memcpy(nld->value, data, len);
> > @@ -247,13 +287,14 @@ static void netlink_gen_verdict(const struct expr *expr,
> >  	}
> >  }
> >  
> > -void netlink_gen_data(const struct expr *expr, struct nft_data_linearize *data)
> > +void netlink_gen_data(const struct expr *expr, struct nft_data_linearize *data,
> > +		      int end)  
> 
> s/end/is_range_end/ for consistency?

Oops, yes, it had 'is_range_end' in the prototype but not here.
Probably dropping this anyway.

> >  {
> >  	switch (expr->etype) {
> >  	case EXPR_VALUE:
> >  		return netlink_gen_constant_data(expr, data);
> >  	case EXPR_CONCAT:
> > -		return netlink_gen_concat_data(expr, data);
> > +		return netlink_gen_concat_data(expr, data, end);
> >  	case EXPR_VERDICT:
> >  		return netlink_gen_verdict(expr, data);
> >  	default:
> > @@ -712,8 +753,14 @@ void alloc_setelem_cache(const struct expr *set, struct nftnl_set *nls)
> >  	const struct expr *expr;
> >  
> >  	list_for_each_entry(expr, &set->expressions, list) {
> > -		nlse = alloc_nftnl_setelem(set, expr);
> > +		nlse = alloc_nftnl_setelem(set, expr, 0);
> >  		nftnl_set_elem_add(nls, nlse);
> > +
> > +		if (set->set_flags & NFT_SET_SUBKEY) {
> > +			nlse = alloc_nftnl_setelem(set, expr, 1);
> > +			nftnl_set_elem_add(nls, nlse);
> > +		}
> > +  
> 
> Can't we drop 'const' from expr declaration and temporarily set
> EXPR_F_INTERVAL_END to carry the is_interval_end bit or does that mess
> up set element creation?

No, it actually works nicely, I simply didn't think of that.

> What I don't like about your code is how it adds an expression-type
> specific parameter to netlink_gen_data which is supposed to be
> type-agnostic. Avoiding this would also shrink this patch quite a bit.

Indeed, I also didn't like that, thanks for the tip. I'm changing this
in v2.

> [...]
> > diff --git a/src/parser_bison.y b/src/parser_bison.y
> > index 3f283256..2b718971 100644
> > --- a/src/parser_bison.y
> > +++ b/src/parser_bison.y  
> [...]
> > @@ -3941,7 +3940,24 @@ basic_rhs_expr		:	inclusive_or_rhs_expr
> >  			;
> >  
> >  concat_rhs_expr		:	basic_rhs_expr
> > -			|	concat_rhs_expr	DOT	basic_rhs_expr
> > +			|	multiton_rhs_expr
> > +			|	concat_rhs_expr		DOT	multiton_rhs_expr
> > +			{
> > +				if ($$->etype != EXPR_CONCAT) {
> > +					$$ = concat_expr_alloc(&@$);
> > +					compound_expr_add($$, $1);
> > +				} else {
> > +					struct location rhs[] = {
> > +						[1]	= @2,
> > +						[2]	= @3,
> > +					};
> > +					location_update(&$3->location, rhs, 2);
> > +					$$ = $1;
> > +					$$->location = @$;
> > +				}
> > +				compound_expr_add($$, $3);
> > +			}
> > +			|	concat_rhs_expr		DOT	basic_rhs_expr  
> 
> So this is the fifth copy of the same piece of code. :(
> 
> Isn't there a better way to solve that?

I couldn't think of any.

> If not, we could at least introduce a function compound_expr_alloc_or_add().

Right, added. The only ugly aspect is that we also need to update the
location of $3 here, and I don't think we should export
location_update() from parser_bison.y, so this function needs to be in
parser_byson.y itself, I guess.

> [...]
> > diff --git a/src/rule.c b/src/rule.c
> > index 4abc13c9..377781b1 100644
> > --- a/src/rule.c
> > +++ b/src/rule.c  
> [...]
> > @@ -1618,15 +1620,15 @@ static int do_command_insert(struct netlink_ctx *ctx, struct cmd *cmd)
> >  
> >  static int do_delete_setelems(struct netlink_ctx *ctx, struct cmd *cmd)
> >  {
> > -	struct handle *h = &cmd->handle;
> >  	struct expr *expr = cmd->expr;
> > +	struct handle *h = &cmd->handle;  
> 
> Unrelated change?

Oops.

> >  	struct table *table;
> >  	struct set *set;
> >  
> >  	table = table_lookup(h, &ctx->nft->cache);
> >  	set = set_lookup(table, h->set.name);
> >  
> > -	if (set->flags & NFT_SET_INTERVAL &&
> > +	if (set->flags & NFT_SET_INTERVAL && !(set->flags & NFT_SET_SUBKEY) &&  
> 
> Maybe introduce set_is_non_concat_range() or something? Maybe even a
> macro, it's just about:
> 
> | return set->flags & (NFT_SET_INTERVAL | NFT_SET_SUBKEY) == NFT_SET_INTERVAL;

I'm adding that as static inline together with the other set_is_*()
functions in rule.h, it looks consistent.

> [...]
> > diff --git a/src/segtree.c b/src/segtree.c
> > index 9f1eecc0..e49576bc 100644
> > --- a/src/segtree.c
> > +++ b/src/segtree.c  
> [...]
> > @@ -823,6 +828,9 @@ static int expr_value_cmp(const void *p1, const void *p2)
> >  	struct expr *e2 = *(void * const *)p2;
> >  	int ret;
> >  
> > +	if (expr_value(e1)->etype == EXPR_CONCAT)
> > +		return -1;
> > +  
> 
> Funny how misleading expr_value()'s name is. ;)
> 
> [...]
> > +/* Given start and end elements of a range, check if it can be represented as
> > + * a single netmask, and if so, how long, by returning a zero or positive value.
> > + */
> > +static int range_mask_len(mpz_t start, mpz_t end, unsigned int len)
> > +{
> > +	unsigned int step = 0, i;
> > +	mpz_t base, tmp;
> > +	int masks = 0;
> > +
> > +	mpz_init_set_ui(base, mpz_get_ui(start));
> > +
> > +	while (mpz_cmp(base, end) <= 0) {
> > +		step = 0;
> > +		while (!mpz_tstbit(base, step)) {
> > +			mpz_init_set_ui(tmp, mpz_get_ui(base));
> > +			for (i = 0; i <= step; i++)
> > +				mpz_setbit(tmp, i);
> > +			if (mpz_cmp(tmp, end) > 0) {
> > +				mpz_clear(tmp);
> > +				break;
> > +			}
> > +			mpz_clear(tmp);
> > +
> > +			step++;
> > +
> > +			if (step >= len)
> > +				goto out;
> > +		}
> > +
> > +		if (masks++)
> > +			goto out;
> > +
> > +		mpz_add_ui(base, base, 1 << step);
> > +	}
> > +
> > +out:
> > +	mpz_clear(base);
> > +
> > +	if (masks > 1)
> > +		return -1;
> > +	return len - step;
> > +}  
> 
> I don't understand this algorithm.

It basically does the same thing as the function you wrote below, but
instead of shifting 'start' and 'end', 'step' is increased, and set
onto them, so that at every iteration we have the resulting mask
available in 'base'.

That's not actually needed here, though. It's a left-over from a
previous idea of generating composing netmasks in nftables rather than
in the set. I'll switch to the "obvious" implementation.

> Intuitively, I would just:
> 
> | int tmp = len;
> | while (start != end && !(start & 1) && (end & 1) && tmp) {
> |	start >>= 1;
> |	end >>= 1;
> |	tmp--;
> | }
> | return (tmp && start == end) ? len - tmp : -1;
> 
> Is that slow when dealing with gmp?

I don't think so, it also avoid copies and allocations, while shifting
and setting bits have comparable complexities. I'd go with the gmp
version of this.

-- 
Stefano