* [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations
@ 2020-10-30 20:27 Cezar Sa Espinola
  2020-11-02 20:56 ` Julian Anastasov
From: Cezar Sa Espinola @ 2020-10-30 20:27 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: Cezar Sa Espinola, Wensong Zhang, Simon Horman,
	Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	David S. Miller, Jakub Kicinski, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER

A common operation for userspace applications managing ipvs is to dump
all services and all destinations and then sort out what needs to be
done. Previously this could only be accomplished by issuing 1 netlink
IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
commands. For a dynamic system with a very large number of services
this can cause a noticeable performance impact.

This patch introduces a new way of dumping all services and destinations
with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
command will send each service as an IPVS_CMD_NEW_SERVICE message
immediately followed by its destinations as multiple IPVS_CMD_NEW_DEST
messages. It's also possible to dump a single service and its
destinations by sending an IPVS_CMD_ATTR_SERVICE argument to the dump
command.

Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>
---

To ensure that this patch improves performance, I decided to also patch
ipvsadm in order to run some benchmarks comparing 'ipvsadm -Sn' with the
unpatched version. The ipvsadm patch is available on github at [1] for
now, but I intend to submit it if this RFC goes forward.

The benchmark results look promising; detailed results and some scripts
to reproduce them are available in another github repository [2]. The
summary of the benchmarks is:

svcs  | dsts | run time compared to unpatched
----- | ---- | ------------------------------
 1000 |    4 | -60.63%
 2000 |    2 | -71.10%
 8000 |    2 | -52.83%
16000 |    1 | -54.13%
  100 |  100 |  -4.76%

[1] - https://github.com/cezarsa/ipvsadm/compare/master...dump-svc-ds
[2] - https://github.com/cezarsa/ipvsadm-validate#benchmark-results
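
For context, the new dump can be issued from user space roughly as
below (a sketch against libnl-3 and the patched ip_vs.h; illustrative
only, not part of this patch, error handling omitted):

#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/ip_vs.h>

static int dump_all(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, IPVS_GENL_NAME);

	/* one dump request replaces 1 GET_SERVICE dump followed by
	 * N GET_DEST dumps */
	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0,
		    NLM_F_DUMP, IPVS_CMD_GET_SERVICE_DEST,
		    IPVS_GENL_VERSION);
	nl_send_auto(sk, msg);
	nlmsg_free(msg);

	/* the multipart reply interleaves IPVS_CMD_NEW_SERVICE and
	 * IPVS_CMD_NEW_DEST messages; a real client installs a
	 * callback via nl_socket_modify_cb() to collect them */
	nl_recvmsgs_default(sk);

	nl_socket_free(sk);
	return 0;
}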

 include/uapi/linux/ip_vs.h     |   2 +
 net/netfilter/ipvs/ip_vs_ctl.c | 109 +++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 4102ddcb4e14..353548cb7b81 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -331,6 +331,8 @@ enum {
 	IPVS_CMD_ZERO,			/* zero all counters and stats */
 	IPVS_CMD_FLUSH,			/* flush services and dests */
 
+	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
+
 	__IPVS_CMD_MAX,
 };
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e279ded4e306..09a7dd823dc0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3396,6 +3396,109 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 	return skb->len;
 }
 
+struct dump_services_dests_ctx {
+	struct ip_vs_service	*last_svc;
+	int			idx_svc;
+	int			idx_dest;
+	int			start_svc;
+	int			start_dest;
+};
+
+static int ip_vs_genl_dump_service_destinations(struct sk_buff *skb,
+						struct netlink_callback *cb,
+						struct ip_vs_service *svc,
+						struct dump_services_dests_ctx *ctx)
+{
+	struct ip_vs_dest *dest;
+
+	if (++ctx->idx_svc < ctx->start_svc)
+		return 0;
+
+	if (ctx->idx_svc == ctx->start_svc && ctx->last_svc != svc)
+		return 0;
+
+	if (ctx->idx_svc > ctx->start_svc) {
+		if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+			ctx->idx_svc--;
+			return -EMSGSIZE;
+		}
+		ctx->last_svc = svc;
+		ctx->start_dest = 0;
+	}
+
+	ctx->idx_dest = 0;
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++ctx->idx_dest <= ctx->start_dest)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			ctx->idx_dest--;
+			return -EMSGSIZE;
+		}
+	}
+
+	return 0;
+}
+
+static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
+						 struct netlink_callback *cb)
+{
+	/* Besides usual index based counters, saving a pointer to the last
+	 * dumped service is useful to ensure we only dump destinations that
+	 * belong to it, even when services are removed while the dump is still
+	 * running causing indexes to shift.
+	 */
+	struct dump_services_dests_ctx ctx = {
+		.idx_svc = 0,
+		.idx_dest = 0,
+		.start_svc = cb->args[0],
+		.start_dest = cb->args[1],
+		.last_svc = (struct ip_vs_service *)(cb->args[2]),
+	};
+	struct net *net = sock_net(skb->sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc = NULL;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+	int i;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX,
+				   ip_vs_cmd_policy, cb->extack) == 0) {
+		svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
+
+		if (!IS_ERR_OR_NULL(svc)) {
+			ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx);
+			goto nla_put_failure;
+		}
+	}
+
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+			if (svc->ipvs != ipvs)
+				continue;
+			if (ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx) < 0)
+				goto nla_put_failure;
+		}
+	}
+
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], s_list) {
+			if (svc->ipvs != ipvs)
+				continue;
+			if (ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx) < 0)
+				goto nla_put_failure;
+		}
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+	cb->args[0] = ctx.idx_svc;
+	cb->args[1] = ctx.idx_dest;
+	cb->args[2] = (long)ctx.last_svc;
+
+	return skb->len;
+}
+
 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 				 struct nlattr *nla, bool full_entry)
 {
@@ -3991,6 +4094,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
 		.flags	= GENL_ADMIN_PERM,
 		.doit	= ip_vs_genl_set_cmd,
 	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_services_destinations,
+	},
 };
 
 static struct genl_family ip_vs_genl_family __ro_after_init = {
-- 
2.25.1



* Re: [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations
  2020-10-30 20:27 [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations Cezar Sa Espinola
@ 2020-11-02 20:56 ` Julian Anastasov
  2020-11-03 16:36   ` Cezar Sá Espinola
From: Julian Anastasov @ 2020-11-02 20:56 UTC (permalink / raw)
  To: Cezar Sa Espinola
  Cc: Wensong Zhang, Simon Horman, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER


	Hello,

On Fri, 30 Oct 2020, Cezar Sa Espinola wrote:

> A common operation for userspace applications managing ipvs is to dump
> all services and all destinations and then sort out what needs to be
> done. Previously this could only be accomplished by issuing 1 netlink
> IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
> commands. For a dynamic system with a very large number of services
> this can cause a noticeable performance impact.
> 
> This patch introduces a new way of dumping all services and destinations
> with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
> command will send each service as an IPVS_CMD_NEW_SERVICE message
> immediately followed by its destinations as multiple IPVS_CMD_NEW_DEST
> messages. It's also possible to dump a single service and its
> destinations by sending an IPVS_CMD_ATTR_SERVICE argument to the dump
> command.
> 
> Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>
> ---
> 
> To ensure that this patch improves performance, I decided to also patch
> ipvsadm in order to run some benchmarks comparing 'ipvsadm -Sn' with the
> unpatched version. The ipvsadm patch is available on github at [1] for
> now, but I intend to submit it if this RFC goes forward.
> 
> The benchmark results look promising; detailed results and some scripts
> to reproduce them are available in another github repository [2]. The
> summary of the benchmarks is:
> 
> svcs  | dsts | run time compared to unpatched
> ----- | ---- | ------------------------------
>  1000 |    4 | -60.63%
>  2000 |    2 | -71.10%
>  8000 |    2 | -52.83%
> 16000 |    1 | -54.13%
>   100 |  100 |  -4.76%
> 
> [1] - https://github.com/cezarsa/ipvsadm/compare/master...dump-svc-ds
> [2] - https://github.com/cezarsa/ipvsadm-validate#benchmark-results
> 
>  include/uapi/linux/ip_vs.h     |   2 +
>  net/netfilter/ipvs/ip_vs_ctl.c | 109 +++++++++++++++++++++++++++++++++
>  2 files changed, 111 insertions(+)
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 4102ddcb4e14..353548cb7b81 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -331,6 +331,8 @@ enum {
>  	IPVS_CMD_ZERO,			/* zero all counters and stats */
>  	IPVS_CMD_FLUSH,			/* flush services and dests */
>  
> +	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
> +
>  	__IPVS_CMD_MAX,
>  };
>  
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index e279ded4e306..09a7dd823dc0 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -3396,6 +3396,109 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
>  	return skb->len;
>  }
>  
> +struct dump_services_dests_ctx {
> +	struct ip_vs_service	*last_svc;
> +	int			idx_svc;
> +	int			idx_dest;
> +	int			start_svc;
> +	int			start_dest;
> +};
> +
> +static int ip_vs_genl_dump_service_destinations(struct sk_buff *skb,
> +						struct netlink_callback *cb,
> +						struct ip_vs_service *svc,
> +						struct dump_services_dests_ctx *ctx)
> +{
> +	struct ip_vs_dest *dest;
> +
> +	if (++ctx->idx_svc < ctx->start_svc)
> +		return 0;
> +
> +	if (ctx->idx_svc == ctx->start_svc && ctx->last_svc != svc)
> +		return 0;
> +
> +	if (ctx->idx_svc > ctx->start_svc) {
> +		if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
> +			ctx->idx_svc--;
> +			return -EMSGSIZE;
> +		}
> +		ctx->last_svc = svc;
> +		ctx->start_dest = 0;
> +	}
> +
> +	ctx->idx_dest = 0;
> +	list_for_each_entry(dest, &svc->destinations, n_list) {
> +		if (++ctx->idx_dest <= ctx->start_dest)
> +			continue;
> +		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
> +			ctx->idx_dest--;

	At this point idx_svc has already been incremented and we
stop in the middle of the dest list, so we need ctx->idx_svc-- too.

	And now what happens if all dests cannot fit in a packet?
Should we start the next packet with the same svc? And then
should user space merge the dests when multiple packets
start with the same service?

	The main points are:

- the virtual services are in a hash table; their order is
not important, user space can sort them

- order of dests in a service is important for the schedulers

- every packet should contain info for svc, so that we can
properly add dests to the right svc

> +			return -EMSGSIZE;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
> +						 struct netlink_callback *cb)
> +{
> +	/* Besides usual index based counters, saving a pointer to the last
> +	 * dumped service is useful to ensure we only dump destinations that
> +	 * belong to it, even when services are removed while the dump is still
> +	 * running causing indexes to shift.
> +	 */
> +	struct dump_services_dests_ctx ctx = {
> +		.idx_svc = 0,
> +		.idx_dest = 0,
> +		.start_svc = cb->args[0],
> +		.start_dest = cb->args[1],
> +		.last_svc = (struct ip_vs_service *)(cb->args[2]),
> +	};
> +	struct net *net = sock_net(skb->sk);
> +	struct netns_ipvs *ipvs = net_ipvs(net);
> +	struct ip_vs_service *svc = NULL;
> +	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
> +	int i;
> +
> +	mutex_lock(&__ip_vs_mutex);
> +
> +	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX,
> +				   ip_vs_cmd_policy, cb->extack) == 0) {
> +		svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
> +
> +		if (!IS_ERR_OR_NULL(svc)) {
> +			ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx);
> +			goto nla_put_failure;
> +		}
> +	}
> +
> +	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
> +			if (svc->ipvs != ipvs)
> +				continue;
> +			if (ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx) < 0)
> +				goto nla_put_failure;
> +		}
> +	}
> +
> +	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], s_list) {
> +			if (svc->ipvs != ipvs)
> +				continue;
> +			if (ip_vs_genl_dump_service_destinations(skb, cb, svc, &ctx) < 0)
> +				goto nla_put_failure;
> +		}
> +	}
> +
> +nla_put_failure:
> +	mutex_unlock(&__ip_vs_mutex);
> +	cb->args[0] = ctx.idx_svc;
> +	cb->args[1] = ctx.idx_dest;
> +	cb->args[2] = (long)ctx.last_svc;

	last_svc is used outside of the __ip_vs_mutex region,
so it is not safe. We could take a reference count, but that
is bad if user space blocks.

	But even if we use just indexes it should be ok.
If multiple agents are used in parallel it is not our
problem. What can happen is that we send duplicates
or skip entries (both svcs and dests). It is impossible
to keep any kind of reference to current entries, or even
keys to look them up, if another agent can remove them.

> +
> +	return skb->len;
> +}
> +
>  static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
>  				 struct nlattr *nla, bool full_entry)
>  {
> @@ -3991,6 +4094,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
>  		.flags	= GENL_ADMIN_PERM,
>  		.doit	= ip_vs_genl_set_cmd,
>  	},
> +	{
> +		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
> +		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
> +		.flags	= GENL_ADMIN_PERM,
> +		.dumpit	= ip_vs_genl_dump_services_destinations,
> +	},
>  };
>  
>  static struct genl_family ip_vs_genl_family __ro_after_init = {
> -- 

Regards

--
Julian Anastasov <ja@ssi.bg>



* Re: [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-02 20:56 ` Julian Anastasov
@ 2020-11-03 16:36   ` Cezar Sá Espinola
  2020-11-03 19:18     ` Julian Anastasov
From: Cezar Sá Espinola @ 2020-11-03 16:36 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: Wensong Zhang, Simon Horman, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER

Hi,

> > +     if (ctx->idx_svc == ctx->start_svc && ctx->last_svc != svc)
> > +             return 0;
> > +
> > +     if (ctx->idx_svc > ctx->start_svc) {
> > +             if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
> > +                     ctx->idx_svc--;
> > +                     return -EMSGSIZE;
> > +             }
> > +             ctx->last_svc = svc;
> > +             ctx->start_dest = 0;
> > +     }
> > +
> > +     ctx->idx_dest = 0;
> > +     list_for_each_entry(dest, &svc->destinations, n_list) {
> > +             if (++ctx->idx_dest <= ctx->start_dest)
> > +                     continue;
> > +             if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
> > +                     ctx->idx_dest--;
>
>         At this point idx_svc has already been incremented and we
> stop in the middle of the dest list, so we need ctx->idx_svc-- too.
>
>         And now what happens if all dests cannot fit in a packet?
> Should we start the next packet with the same svc? And then
> should user space merge the dests when multiple packets
> start with the same service?

My (maybe not so great) idea was to avoid repeating the svc on each
packet. It's possible for a packet to start with destinations, and
user space must consider them as belonging to the last svc received on
the previous packet. The comparison "ctx->last_svc != svc" was
intended to ensure that a packet only starts with destinations if the
current service is the same as the svc we sent on the previous packet.

>
>         The main points are:
>
> - the virtual services are in a hash table; their order is
> not important, user space can sort them
>
> - order of dests in a service is important for the schedulers
>
> - every packet should contain info for svc, so that we can
> properly add dests to the right svc

Thanks, I will rework the patch with these points in mind. It does
sound safer to ensure every packet starts with service information.

> > +nla_put_failure:
> > +     mutex_unlock(&__ip_vs_mutex);
> > +     cb->args[0] = ctx.idx_svc;
> > +     cb->args[1] = ctx.idx_dest;
> > +     cb->args[2] = (long)ctx.last_svc;
>
>         last_svc is used outside of the __ip_vs_mutex region,
> so it is not safe. We could take a reference count, but that
> is bad if user space blocks.

I thought it would be relatively safe to store a pointer to the last
svc since I would only use it for pointer comparison and never
dereference it. But in retrospect it does look unsafe and fragile,
and could probably lead to errors, especially if services are modified
during a dump, causing the stored pointer to point to a different
service.

>         But even if we use just indexes it should be ok.
> If multiple agents are used in parallel it is not our
> problem. What can happen is that we send duplicates
> or skip entries (both svcs and dests). It is impossible
> to keep any kind of reference to current entries, or even
> keys to look them up, if another agent can remove them.

Got it. I noticed this behavior while writing this patch and even
created a few crude validation scripts running parallel agents and
checking the diff in [1].

[1] - https://github.com/cezarsa/ipvsadm-validate/blob/37ebd39785b1e835c6d4b5c58aaca7be60d5e194/test.sh#L86-L87

Thanks a lot for the review,
--
Cezar Sá Espinola


* Re: [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-03 16:36   ` Cezar Sá Espinola
@ 2020-11-03 19:18     ` Julian Anastasov
  2020-11-06 15:40       ` [PATCH RFC v2] " Cezar Sa Espinola
From: Julian Anastasov @ 2020-11-03 19:18 UTC (permalink / raw)
  To: Cezar Sá Espinola
  Cc: Wensong Zhang, Simon Horman, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER



	Hello,

On Tue, 3 Nov 2020, Cezar Sá Espinola wrote:

> >         And now what happens if all dests cannot fit in a packet?
> > Should we start the next packet with the same svc? And then
> > should user space merge the dests when multiple packets
> > start with the same service?
> 
> My (maybe not so great) idea was to avoid repeating the svc on each
> packet. It's possible for a packet to start with destinations, and
> user space must consider them as belonging to the last svc received on
> the previous packet. The comparison "ctx->last_svc != svc" was
> intended to ensure that a packet only starts with destinations if the
> current service is the same as the svc we sent on the previous packet.

	You can also consider the idea of having 3 coordinates
for the start svc: idx_svc_tab (0 or 1), idx_svc_row
(0..IP_VS_SVC_TAB_SIZE-1) and idx_svc as the index in the row's
chain. On a new packet this will indicate the htable and its row,
and we only have to skip svcs in this row to find our starting svc.
I think this will still fit in the netlink_callback's args area. If
not, we can always kmalloc our context and store it in args[0]. In a
single table, this should speed up the start svc lookup 128 times on
average (we have 256 rows). In a setup with 1024 svcs (an average of
4 in each of the 256 rows) we would skip only 0..3 entries instead
of 512 on average.
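
	For illustration, such a resume point could be packed into
the netlink_callback args roughly like this (a sketch only; the
field names are hypothetical, not from the patch):

	struct dump_pos {
		int tab;	/* 0: ip_vs_svc_table, 1: fwm table */
		int row;	/* 0..IP_VS_SVC_TAB_SIZE-1 */
		int idx_svc;	/* svc index within the row's chain */
		int idx_dest;	/* dest index within the svc */
	};

	/* cb->args[] is a small per-dump scratch area of longs */
	static void save_pos(struct netlink_callback *cb,
			     const struct dump_pos *p)
	{
		cb->args[0] = p->tab;
		cb->args[1] = p->row;
		cb->args[2] = p->idx_svc;
		cb->args[3] = p->idx_dest;
	}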

> >         last_svc is used outside of the __ip_vs_mutex region,
> > so it is not safe. We could take a reference count, but that
> > is bad if user space blocks.
> 
> I thought it would be relatively safe to store a pointer to the last
> svc since I would only use it for pointer comparison and never
> dereference it. But in retrospect it does look unsafe and fragile,
> and could probably lead to errors, especially if services are modified
> during a dump, causing the stored pointer to point to a different
> service.

	Yes, nobody is using such pointers. We should create
packets that correctly identify the svc for the dests. The drawback
is that user space may need more work for merging. We can always
create a sorted array of pointers to svcs, so that we can binary
search with bsearch() for the svc from every received packet. Then we
will know if this is a new svc or an old one (with dests in
multiple packets). Should we also check for dest duplicates in
the svc? The question is how safe we should play. In
user space the most work we can do is to avoid duplicates
and to put dests into their correct svc.
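
	As an example, user space could keep the services seen so
far in a sorted array and locate the svc for every received packet
with bsearch(); a sketch only, the record layout below is
hypothetical:

	#include <stdlib.h>
	#include <string.h>

	struct svc_rec {	/* one merged virtual service */
		unsigned short af, protocol, port;
		unsigned int   fwmark;
		unsigned char  addr[16];
		/* dests accumulated from one or more packets... */
	};

	static int svc_cmp(const void *pa, const void *pb)
	{
		const struct svc_rec *a = pa, *b = pb;

		if (a->af != b->af)
			return a->af < b->af ? -1 : 1;
		if (a->fwmark != b->fwmark)
			return a->fwmark < b->fwmark ? -1 : 1;
		if (a->protocol != b->protocol)
			return a->protocol < b->protocol ? -1 : 1;
		if (a->port != b->port)
			return a->port < b->port ? -1 : 1;
		return memcmp(a->addr, b->addr, sizeof(a->addr));
	}

	/* svcs must be kept sorted with qsort(svcs, n, sizeof(*svcs),
	 * svc_cmp); a NULL result means a new svc, otherwise the
	 * packet's dests are appended to the existing record */
	static struct svc_rec *find_svc(struct svc_rec *svcs, size_t n,
					const struct svc_rec *key)
	{
		return bsearch(key, svcs, n, sizeof(*svcs), svc_cmp);
	}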

> >         But even if we use just indexes it should be ok.
> > If multiple agents are used in parallel it is not our
> > problem. What can happen is that we send duplicates
> > or skip entries (both svcs and dests). It is impossible
> > to keep any kind of reference to current entries, or even
> > keys to look them up, if another agent can remove them.
> 
> Got it. I noticed this behavior while writing this patch and even
> created a few crude validation scripts running parallel agents and
> checking the diff in [1].

	Ok, make sure your tests cover cases with multiple
dests, so that a single service occupies multiple packets;
I'm not sure whether 100 dests fit in one packet or not.

Regards

--
Julian Anastasov <ja@ssi.bg>


* [PATCH RFC v2] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-03 19:18     ` Julian Anastasov
@ 2020-11-06 15:40       ` Cezar Sa Espinola
  2020-11-09 21:29         ` Julian Anastasov
From: Cezar Sa Espinola @ 2020-11-06 15:40 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: Cezar Sa Espinola, Wensong Zhang, Simon Horman,
	Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	David S. Miller, Jakub Kicinski, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER

A common operation for userspace applications managing ipvs is to dump all
services and all destinations and then sort out what needs to be done.
Previously this could only be accomplished by issuing 1 netlink
IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
commands. For a dynamic system with a very large number of services
this can cause a noticeable performance impact.

This patch introduces a new way of dumping all services and destinations
with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
command will send the services as IPVS_CMD_NEW_SERVICE messages each
containing a nested array of destinations with the new IPVS_SVC_ATTR_DESTS
and IPVS_DESTS_ATTR_DEST attributes. Services may be repeated if their
destinations do not fit in a single packet; user space is responsible
for merging the destinations from each repeated service.

It's also possible to dump a single service and its destinations by sending
an IPVS_CMD_ATTR_SERVICE argument to the dump command.

Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>
---
Changes in v2:
- Send destinations nested in a service.

This patch should be safer now as every packet starts with a service
message. Also Julian's idea of adding more coordinates for tab and row
showed a nice performance improvement. I think the same idea could also
be applied to the existing ip_vs_genl_dump_services; I can send a patch
for it later.

I also verified that each packet fits somewhere around 50
destinations on my system, so I was able to confirm that it works as
expected even when services are repeated to carry the additional
destinations.
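
For reference, a consumer could walk the nested destinations of each
service message roughly as below (a sketch against libnl-3 and the
updated ip_vs.h, not part of this patch; error handling omitted):

#include <netlink/attr.h>
#include <netlink/genl/genl.h>
#include <linux/ip_vs.h>

/* invoked for every IPVS_CMD_NEW_SERVICE message of the dump */
static int on_service(struct nl_msg *msg, void *arg)
{
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
	struct nlattr *svc[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *dest;
	int rem;

	if (genlmsg_parse(nlmsg_hdr(msg), 0, attrs,
			  IPVS_CMD_ATTR_MAX, NULL) < 0 ||
	    !attrs[IPVS_CMD_ATTR_SERVICE])
		return NL_SKIP;

	nla_parse_nested(svc, IPVS_SVC_ATTR_MAX,
			 attrs[IPVS_CMD_ATTR_SERVICE], NULL);

	/* merge with an already-seen service carrying the same key:
	 * services repeat when their dests span multiple packets */

	if (svc[IPVS_SVC_ATTR_DESTS]) {
		nla_for_each_nested(dest, svc[IPVS_SVC_ATTR_DESTS], rem) {
			if (nla_type(dest) == IPVS_DESTS_ATTR_DEST)
				;	/* parse IPVS_DEST_ATTR_* here */
		}
	}
	return NL_OK;
}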

As before, a patched ipvsadm supporting the message format in this
commit is available at [1], and the benchmarks at [2] compare the run
time of "ipvsadm -Sn" using the new command vs the old one. Here is
the benchmark summary:

tcp svcs  | fw svcs | dsts | run time
--------- | ------- | ---- | --------
     1000 |       0 |    4 | -62.12%
     2000 |       0 |    2 | -72.97%
     8000 |       0 |    2 | -73.47%
    16000 |       0 |    1 | -81.79%
      100 |       0 |  100 |  -9.09%
     8000 |    8000 |    1 | -81.10%

[1] - https://github.com/cezarsa/ipvsadm/compare/master...dump-svc-ds
[2] - https://github.com/cezarsa/ipvsadm-validate#benchmark-results
 
 include/uapi/linux/ip_vs.h     |  17 ++-
 net/netfilter/ipvs/ip_vs_ctl.c | 188 ++++++++++++++++++++++++++++++---
 2 files changed, 189 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 4102ddcb4e14..ce9bfa03b61b 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -331,6 +331,8 @@ enum {
 	IPVS_CMD_ZERO,			/* zero all counters and stats */
 	IPVS_CMD_FLUSH,			/* flush services and dests */
 
+	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
+
 	__IPVS_CMD_MAX,
 };
 
@@ -374,15 +376,28 @@ enum {
 
 	IPVS_SVC_ATTR_STATS64,		/* nested attribute for service stats */
 
+	IPVS_SVC_ATTR_DESTS,		/* nested destinations */
+
 	__IPVS_SVC_ATTR_MAX,
 };
 
 #define IPVS_SVC_ATTR_MAX (__IPVS_SVC_ATTR_MAX - 1)
 
+enum {
+	IPVS_DESTS_ATTR_UNSPEC = 0,
+
+	IPVS_DESTS_ATTR_DEST,	/* nested destination */
+
+	__IPVS_DESTS_ATTR_MAX,
+};
+
+#define IPVS_DESTS_ATTR_MAX (__IPVS_DESTS_ATTR_MAX - 1)
+
 /*
  * Attributes used to describe a destination (real server)
  *
- * Used inside nested attribute IPVS_CMD_ATTR_DEST
+ * Used inside nested attribute IPVS_CMD_ATTR_DEST and
+ * IPVS_DESTS_ATTR_DEST
  */
 enum {
 	IPVS_DEST_ATTR_UNSPEC = 0,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e279ded4e306..9f40627f74ef 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2981,6 +2981,7 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+	[IPVS_SVC_ATTR_DESTS]		= { .type = NLA_NESTED },
 };
 
 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
@@ -3070,31 +3071,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
 	return -EMSGSIZE;
 }
 
-static int ip_vs_genl_fill_service(struct sk_buff *skb,
-				   struct ip_vs_service *svc)
+static int ip_vs_genl_put_service_attrs(struct sk_buff *skb,
+					struct ip_vs_service *svc)
 {
 	struct ip_vs_scheduler *sched;
 	struct ip_vs_pe *pe;
-	struct nlattr *nl_service;
 	struct ip_vs_flags flags = { .flags = svc->flags,
 				     .mask = ~0 };
 	struct ip_vs_kstats kstats;
 	char *sched_name;
 
-	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
-	if (!nl_service)
-		return -EMSGSIZE;
-
 	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	if (svc->fwmark) {
 		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
-			goto nla_put_failure;
+			return -EMSGSIZE;
 	} else {
 		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
 		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
 		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
-			goto nla_put_failure;
+			return -EMSGSIZE;
 	}
 
 	sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -3105,11 +3101,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
 	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
 	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	ip_vs_copy_stats(&kstats, &svc->stats);
 	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct nlattr *nl_service;
+
+	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_put_service_attrs(skb, svc))
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nl_service);
@@ -3286,12 +3297,13 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
 	return ret ? ERR_PTR(ret) : svc;
 }
 
-static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, int container_type,
+				struct ip_vs_dest *dest)
 {
 	struct nlattr *nl_dest;
 	struct ip_vs_kstats kstats;
 
-	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
+	nl_dest = nla_nest_start_noflag(skb, container_type);
 	if (!nl_dest)
 		return -EMSGSIZE;
 
@@ -3344,7 +3356,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (!hdr)
 		return -EMSGSIZE;
 
-	if (ip_vs_genl_fill_dest(skb, dest) < 0)
+	if (ip_vs_genl_fill_dest(skb, IPVS_CMD_ATTR_DEST, dest) < 0)
 		goto nla_put_failure;
 
 	genlmsg_end(skb, hdr);
@@ -3396,6 +3408,146 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 	return skb->len;
 }
 
+struct dump_services_dests_ctx {
+	int	idx_svc;
+	int	idx_dest;
+	int	start_svc;
+	int	start_dest;
+};
+
+static int ip_vs_genl_dump_service_dests(struct sk_buff *skb,
+					 struct netlink_callback *cb,
+					 struct netns_ipvs *ipvs,
+					 struct ip_vs_service *svc,
+					 struct dump_services_dests_ctx *ctx)
+{
+	void *hdr;
+	struct nlattr *nl_service;
+	struct nlattr *nl_dests;
+	struct ip_vs_dest *dest;
+	int ret = 0;
+
+	if (svc->ipvs != ipvs)
+		return 0;
+
+	if (++ctx->idx_svc <= ctx->start_svc)
+		return 0;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			  cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+			  NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
+	if (!hdr)
+		goto out_err;
+
+	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		goto nla_put_failure;
+
+	if (ip_vs_genl_put_service_attrs(skb, svc))
+		goto nla_nested_failure;
+
+	nl_dests = nla_nest_start_noflag(skb, IPVS_SVC_ATTR_DESTS);
+	if (!nl_dests)
+		goto nla_nested_failure;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++ctx->idx_dest <= ctx->start_dest)
+			continue;
+		if (ip_vs_genl_fill_dest(skb, IPVS_DESTS_ATTR_DEST, dest) < 0) {
+			ctx->idx_svc--;
+			ctx->idx_dest--;
+			ret = -EMSGSIZE;
+			goto nla_nested_end;
+		}
+	}
+	ctx->idx_dest = 0;
+	ctx->start_dest = 0;
+
+nla_nested_end:
+	nla_nest_end(skb, nl_dests);
+	nla_nest_end(skb, nl_service);
+	genlmsg_end(skb, hdr);
+	return ret;
+
+nla_nested_failure:
+	nla_nest_cancel(skb, nl_service);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+
+out_err:
+	ctx->idx_svc--;
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
+						 struct netlink_callback *cb)
+{
+	struct dump_services_dests_ctx ctx = {
+		.idx_svc = 0,
+		.start_svc = cb->args[0],
+		.idx_dest = 0,
+		.start_dest = cb->args[1],
+	};
+	struct net *net = sock_net(skb->sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc = NULL;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+	int tab = cb->args[2];
+	int row = cb->args[3];
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs,
+				   IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy,
+				   cb->extack) == 0) {
+		if (attrs[IPVS_CMD_ATTR_SERVICE]) {
+			svc = ip_vs_genl_find_service(ipvs,
+						      attrs[IPVS_CMD_ATTR_SERVICE]);
+			if (IS_ERR_OR_NULL(svc))
+				goto out_err;
+			ip_vs_genl_dump_service_dests(skb, cb, ipvs, svc, &ctx);
+			goto nla_put_failure;
+		}
+	}
+
+	for (; tab == 0 && row < IP_VS_SVC_TAB_SIZE; row++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[row], s_list) {
+			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
+							  svc, &ctx))
+				goto nla_put_failure;
+		}
+		ctx.idx_svc = 0;
+		ctx.start_svc = 0;
+	}
+
+	if (tab == 0) {
+		row = 0;
+		tab++;
+	}
+
+	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[row], f_list) {
+			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
+							  svc, &ctx))
+				goto nla_put_failure;
+		}
+		ctx.idx_svc = 0;
+		ctx.start_svc = 0;
+	}
+
+nla_put_failure:
+	cb->args[0] = ctx.idx_svc;
+	cb->args[1] = ctx.idx_dest;
+	cb->args[2] = tab;
+	cb->args[3] = row;
+
+out_err:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 				 struct nlattr *nla, bool full_entry)
 {
@@ -3991,6 +4143,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
 		.flags	= GENL_ADMIN_PERM,
 		.doit	= ip_vs_genl_set_cmd,
 	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_services_destinations,
+	},
 };
 
 static struct genl_family ip_vs_genl_family __ro_after_init = {
-- 
2.25.1



* Re: [PATCH RFC v2] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-06 15:40       ` [PATCH RFC v2] " Cezar Sa Espinola
@ 2020-11-09 21:29         ` Julian Anastasov
  2020-11-10 14:45           ` [PATCH RFC v3] " Cezar Sa Espinola
From: Julian Anastasov @ 2020-11-09 21:29 UTC (permalink / raw)
  To: Cezar Sa Espinola
  Cc: Wensong Zhang, Simon Horman, Pablo Neira Ayuso, open list:IPVS,
	open list:IPVS, open list, open list:NETFILTER,
	open list:NETFILTER


	Hello,

On Fri, 6 Nov 2020, Cezar Sa Espinola wrote:

> A common operation for userspace applications managing ipvs is to dump all
> services and all destinations and then sort out what needs to be done.
> Previously this could only be accomplished by issuing 1 netlink
> IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
> commands. For a dynamic system with a very large number of services
> this can cause a noticeable performance impact.
> 
> This patch introduces a new way of dumping all services and destinations
> with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
> command will send the services as IPVS_CMD_NEW_SERVICE messages each
> containing a nested array of destinations with the new IPVS_SVC_ATTR_DESTS
> and IPVS_DESTS_ATTR_DEST attributes. Services may be repeated if their
> destinations do not fit in a single packet, user space should be
> responsible for merging the destinations from each repeated service.
> 
> It's also possible to dump a single service and its destinations by sending
> an IPVS_CMD_ATTR_SERVICE argument to the dump command.
> 
> Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>
> ---
> Changes in v2:
> - Send destinations nested in a service.
> 
> This patch should be safer now as every packet starts with a service
> message. Also Julian's idea of adding more coordinates for tab and row
> showed a nice performance improvement. I think the same idea could also
> be applied to the existing ip_vs_genl_dump_services; I can send a patch
> for it later.
> 
> I also verified that each packet fits somewhere around 50
> destinations on my system, so I was able to confirm that it works as
> expected even when services are repeated to carry the additional
> destinations.
> 
> As before, a patched ipvsadm supporting the message format in this
> commit is available at [1], and the benchmarks at [2] compare the run
> time of "ipvsadm -Sn" using the new command vs the old one. Here is
> the benchmark summary:
> 
> tcp svcs  | fw svcs | dsts | run time
> --------- | ------- | ---- | --------
>      1000 |       0 |    4 | -62.12%
>      2000 |       0 |    2 | -72.97%
>      8000 |       0 |    2 | -73.47%
>     16000 |       0 |    1 | -81.79%
>       100 |       0 |  100 |  -9.09%
>      8000 |    8000 |    1 | -81.10%
> 
> [1] - https://github.com/cezarsa/ipvsadm/compare/master...dump-svc-ds
> [2] - https://github.com/cezarsa/ipvsadm-validate#benchmark-results
>  
>  include/uapi/linux/ip_vs.h     |  17 ++-
>  net/netfilter/ipvs/ip_vs_ctl.c | 188 ++++++++++++++++++++++++++++++---
>  2 files changed, 189 insertions(+), 16 deletions(-)
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 4102ddcb4e14..ce9bfa03b61b 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -331,6 +331,8 @@ enum {
>  	IPVS_CMD_ZERO,			/* zero all counters and stats */
>  	IPVS_CMD_FLUSH,			/* flush services and dests */
>  
> +	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
> +
>  	__IPVS_CMD_MAX,
>  };
>  
> @@ -374,15 +376,28 @@ enum {
>  
>  	IPVS_SVC_ATTR_STATS64,		/* nested attribute for service stats */
>  
> +	IPVS_SVC_ATTR_DESTS,		/* nested destinations */
> +
>  	__IPVS_SVC_ATTR_MAX,
>  };
>  
>  #define IPVS_SVC_ATTR_MAX (__IPVS_SVC_ATTR_MAX - 1)
>  
> +enum {
> +	IPVS_DESTS_ATTR_UNSPEC = 0,
> +
> +	IPVS_DESTS_ATTR_DEST,	/* nested destination */
> +
> +	__IPVS_DESTS_ATTR_MAX,
> +};
> +
> +#define IPVS_DESTS_ATTR_MAX (__IPVS_DESTS_ATTR_MAX - 1)
> +
>  /*
>   * Attributes used to describe a destination (real server)
>   *
> - * Used inside nested attribute IPVS_CMD_ATTR_DEST
> + * Used inside nested attribute IPVS_CMD_ATTR_DEST and
> + * IPVS_DESTS_ATTR_DEST
>   */
>  enum {
>  	IPVS_DEST_ATTR_UNSPEC = 0,
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index e279ded4e306..9f40627f74ef 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -2981,6 +2981,7 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
>  	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
>  	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
>  	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
> +	[IPVS_SVC_ATTR_DESTS]		= { .type = NLA_NESTED },
>  };
>  
>  /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
> @@ -3070,31 +3071,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
>  	return -EMSGSIZE;
>  }
>  
> -static int ip_vs_genl_fill_service(struct sk_buff *skb,
> -				   struct ip_vs_service *svc)
> +static int ip_vs_genl_put_service_attrs(struct sk_buff *skb,
> +					struct ip_vs_service *svc)
>  {
>  	struct ip_vs_scheduler *sched;
>  	struct ip_vs_pe *pe;
> -	struct nlattr *nl_service;
>  	struct ip_vs_flags flags = { .flags = svc->flags,
>  				     .mask = ~0 };
>  	struct ip_vs_kstats kstats;
>  	char *sched_name;
>  
> -	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> -	if (!nl_service)
> -		return -EMSGSIZE;
> -
>  	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	if (svc->fwmark) {
>  		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
> -			goto nla_put_failure;
> +			return -EMSGSIZE;
>  	} else {
>  		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
>  		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
>  		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
> -			goto nla_put_failure;
> +			return -EMSGSIZE;
>  	}
>  
>  	sched = rcu_dereference_protected(svc->scheduler, 1);
> @@ -3105,11 +3101,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
>  	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
>  	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
>  	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	ip_vs_copy_stats(&kstats, &svc->stats);
>  	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
> +		return -EMSGSIZE;
> +
> +	return 0;
> +}
> +
> +static int ip_vs_genl_fill_service(struct sk_buff *skb,
> +				   struct ip_vs_service *svc)
> +{
> +	struct nlattr *nl_service;
> +
> +	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> +	if (!nl_service)
> +		return -EMSGSIZE;
> +
> +	if (ip_vs_genl_put_service_attrs(skb, svc))
>  		goto nla_put_failure;
>  
>  	nla_nest_end(skb, nl_service);
> @@ -3286,12 +3297,13 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
>  	return ret ? ERR_PTR(ret) : svc;
>  }
>  
> -static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> +static int ip_vs_genl_fill_dest(struct sk_buff *skb, int container_type,
> +				struct ip_vs_dest *dest)
>  {
>  	struct nlattr *nl_dest;
>  	struct ip_vs_kstats kstats;
>  
> -	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
> +	nl_dest = nla_nest_start_noflag(skb, container_type);
>  	if (!nl_dest)
>  		return -EMSGSIZE;
>  
> @@ -3344,7 +3356,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
>  	if (!hdr)
>  		return -EMSGSIZE;
>  
> -	if (ip_vs_genl_fill_dest(skb, dest) < 0)
> +	if (ip_vs_genl_fill_dest(skb, IPVS_CMD_ATTR_DEST, dest) < 0)
>  		goto nla_put_failure;
>  
>  	genlmsg_end(skb, hdr);
> @@ -3396,6 +3408,146 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
>  	return skb->len;
>  }
>  
> +struct dump_services_dests_ctx {
> +	int	idx_svc;
> +	int	idx_dest;
> +	int	start_svc;
> +	int	start_dest;
> +};
> +
> +static int ip_vs_genl_dump_service_dests(struct sk_buff *skb,
> +					 struct netlink_callback *cb,
> +					 struct netns_ipvs *ipvs,
> +					 struct ip_vs_service *svc,
> +					 struct dump_services_dests_ctx *ctx)
> +{
> +	void *hdr;
> +	struct nlattr *nl_service;
> +	struct nlattr *nl_dests;
> +	struct ip_vs_dest *dest;
> +	int ret = 0;
> +
> +	if (svc->ipvs != ipvs)
> +		return 0;
> +
> +	if (++ctx->idx_svc <= ctx->start_svc)
> +		return 0;
> +
> +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
> +			  cb->nlh->nlmsg_seq, &ip_vs_genl_family,
> +			  NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
> +	if (!hdr)
> +		goto out_err;
> +
> +	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> +	if (!nl_service)
> +		goto nla_put_failure;
> +
> +	if (ip_vs_genl_put_service_attrs(skb, svc))
> +		goto nla_nested_failure;
> +
> +	nl_dests = nla_nest_start_noflag(skb, IPVS_SVC_ATTR_DESTS);
> +	if (!nl_dests)
> +		goto nla_nested_failure;
> +
> +	list_for_each_entry(dest, &svc->destinations, n_list) {
> +		if (++ctx->idx_dest <= ctx->start_dest)
> +			continue;
> +		if (ip_vs_genl_fill_dest(skb, IPVS_DESTS_ATTR_DEST, dest) < 0) {
> +			ctx->idx_svc--;
> +			ctx->idx_dest--;
> +			ret = -EMSGSIZE;
> +			goto nla_nested_end;
> +		}
> +	}
> +	ctx->idx_dest = 0;
> +	ctx->start_dest = 0;
> +
> +nla_nested_end:
> +	nla_nest_end(skb, nl_dests);
> +	nla_nest_end(skb, nl_service);
> +	genlmsg_end(skb, hdr);
> +	return ret;
> +
> +nla_nested_failure:
> +	nla_nest_cancel(skb, nl_service);
> +
> +nla_put_failure:
> +	genlmsg_cancel(skb, hdr);
> +
> +out_err:
> +	ctx->idx_svc--;
> +	return -EMSGSIZE;
> +}
> +
> +static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
> +						 struct netlink_callback *cb)
> +{
> +	struct dump_services_dests_ctx ctx = {
> +		.idx_svc = 0,
> +		.start_svc = cb->args[0],
> +		.idx_dest = 0,
> +		.start_dest = cb->args[1],
> +	};
> +	struct net *net = sock_net(skb->sk);
> +	struct netns_ipvs *ipvs = net_ipvs(net);
> +	struct ip_vs_service *svc = NULL;

	NULL not needed

> +	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
> +	int tab = cb->args[2];
> +	int row = cb->args[3];
> +
> +	mutex_lock(&__ip_vs_mutex);
> +
> +	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs,
> +				   IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy,
> +				   cb->extack) == 0) {
> +		if (attrs[IPVS_CMD_ATTR_SERVICE]) {
> +			svc = ip_vs_genl_find_service(ipvs,
> +						      attrs[IPVS_CMD_ATTR_SERVICE]);
> +			if (IS_ERR_OR_NULL(svc))
> +				goto out_err;
> +			ip_vs_genl_dump_service_dests(skb, cb, ipvs, svc, &ctx);
> +			goto nla_put_failure;
> +		}
> +	}
> +

	To make it more readable and to avoid lookups when at EOF,
we can start with the tab checks:

	if (tab >= 2)
		goto nla_put_failure;	# or done
	if (tab >= 1)
		goto tab_1;

	for (; row < IP_VS_SVC_TAB_SIZE; row++) {

> +	for (; tab == 0 && row < IP_VS_SVC_TAB_SIZE; row++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_table[row], s_list) {
> +			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
> +							  svc, &ctx))
> +				goto nla_put_failure;
> +		}
> +		ctx.idx_svc = 0;
> +		ctx.start_svc = 0;

	If we were in the middle of the dests for the previous packet
but now the svc and its dests are deleted, we have to reset the
dest position too, otherwise we will skip dests in the next row:

		ctx.idx_dest = 0;
		ctx.start_dest = 0;

	But any kind of modification will show wrong results,
so it does not matter much.

> +	}
> +
> +	if (tab == 0) {
> +		row = 0;
> +		tab++;
> +	}
> +

	row = 0;
	tab++;

tab_1:

> +	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[row], f_list) {
> +			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
> +							  svc, &ctx))
> +				goto nla_put_failure;
> +		}
> +		ctx.idx_svc = 0;
> +		ctx.start_svc = 0;

		ctx.idx_dest = 0;
		ctx.start_dest = 0;

> +	}

	row = 0;	# Not needed
	tab++;		# tab = 2 to indicate EOF

> +
> +nla_put_failure:
> +	cb->args[0] = ctx.idx_svc;
> +	cb->args[1] = ctx.idx_dest;
> +	cb->args[2] = tab;
> +	cb->args[3] = row;
> +
> +out_err:
> +	mutex_unlock(&__ip_vs_mutex);
> +
> +	return skb->len;
> +}
> +
>  static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
>  				 struct nlattr *nla, bool full_entry)
>  {
> @@ -3991,6 +4143,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
>  		.flags	= GENL_ADMIN_PERM,
>  		.doit	= ip_vs_genl_set_cmd,
>  	},
> +	{
> +		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
> +		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
> +		.flags	= GENL_ADMIN_PERM,
> +		.dumpit	= ip_vs_genl_dump_services_destinations,
> +	},
>  };
>  
>  static struct genl_family ip_vs_genl_family __ro_after_init = {
> -- 
> 2.25.1

Regards

--
Julian Anastasov <ja@ssi.bg>



* [PATCH RFC v3] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-09 21:29         ` Julian Anastasov
@ 2020-11-10 14:45           ` Cezar Sa Espinola
  2020-11-15 12:50             ` Julian Anastasov
From: Cezar Sa Espinola @ 2020-11-10 14:45 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: Cezar Sa Espinola, Wensong Zhang, Simon Horman,
	Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	David S. Miller, Jakub Kicinski, open list:IPVS, open list:IPVS,
	open list, open list:NETFILTER, open list:NETFILTER

A common operation for userspace applications managing ipvs is to dump all
services and all destinations and then sort out what needs to be done.
Previously this could only be accomplished by issuing 1 netlink
IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
commands. For a dynamic system with a very large number of services
this can cause a noticeable performance impact.

This patch introduces a new way of dumping all services and destinations
with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
command will send the services as IPVS_CMD_NEW_SERVICE messages each
containing a nested array of destinations with the new IPVS_SVC_ATTR_DESTS
and IPVS_DESTS_ATTR_DEST attributes. Services may be repeated if their
destinations do not fit in a single packet; user space is responsible
for merging the destinations from each repeated service.

It's also possible to dump a single service and its destinations by sending
an IPVS_CMD_ATTR_SERVICE argument to the dump command.

Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>
---
Changes in v2:
- Send destinations nested in a service.
Changes in v3:
- Avoid tab lookups at EOF and other requested changes.

 include/uapi/linux/ip_vs.h     |  17 ++-
 net/netfilter/ipvs/ip_vs_ctl.c | 200 ++++++++++++++++++++++++++++++---
 2 files changed, 201 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 4102ddcb4e14..ce9bfa03b61b 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -331,6 +331,8 @@ enum {
 	IPVS_CMD_ZERO,			/* zero all counters and stats */
 	IPVS_CMD_FLUSH,			/* flush services and dests */
 
+	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
+
 	__IPVS_CMD_MAX,
 };
 
@@ -374,15 +376,28 @@ enum {
 
 	IPVS_SVC_ATTR_STATS64,		/* nested attribute for service stats */
 
+	IPVS_SVC_ATTR_DESTS,		/* nested destinations */
+
 	__IPVS_SVC_ATTR_MAX,
 };
 
 #define IPVS_SVC_ATTR_MAX (__IPVS_SVC_ATTR_MAX - 1)
 
+enum {
+	IPVS_DESTS_ATTR_UNSPEC = 0,
+
+	IPVS_DESTS_ATTR_DEST,	/* nested destination */
+
+	__IPVS_DESTS_ATTR_MAX,
+};
+
+#define IPVS_DESTS_ATTR_MAX (__IPVS_DESTS_ATTR_MAX - 1)
+
 /*
  * Attributes used to describe a destination (real server)
  *
- * Used inside nested attribute IPVS_CMD_ATTR_DEST
+ * Used inside nested attribute IPVS_CMD_ATTR_DEST and
+ * IPVS_DESTS_ATTR_DEST
  */
 enum {
 	IPVS_DEST_ATTR_UNSPEC = 0,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e279ded4e306..65456eaaf11f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2981,6 +2981,7 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+	[IPVS_SVC_ATTR_DESTS]		= { .type = NLA_NESTED },
 };
 
 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
@@ -3070,31 +3071,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
 	return -EMSGSIZE;
 }
 
-static int ip_vs_genl_fill_service(struct sk_buff *skb,
-				   struct ip_vs_service *svc)
+static int ip_vs_genl_put_service_attrs(struct sk_buff *skb,
+					struct ip_vs_service *svc)
 {
 	struct ip_vs_scheduler *sched;
 	struct ip_vs_pe *pe;
-	struct nlattr *nl_service;
 	struct ip_vs_flags flags = { .flags = svc->flags,
 				     .mask = ~0 };
 	struct ip_vs_kstats kstats;
 	char *sched_name;
 
-	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
-	if (!nl_service)
-		return -EMSGSIZE;
-
 	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	if (svc->fwmark) {
 		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
-			goto nla_put_failure;
+			return -EMSGSIZE;
 	} else {
 		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
 		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
 		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
-			goto nla_put_failure;
+			return -EMSGSIZE;
 	}
 
 	sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -3105,11 +3101,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
 	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
 	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	ip_vs_copy_stats(&kstats, &svc->stats);
 	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
-		goto nla_put_failure;
+		return -EMSGSIZE;
 	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct nlattr *nl_service;
+
+	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_put_service_attrs(skb, svc))
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nl_service);
@@ -3286,12 +3297,13 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
 	return ret ? ERR_PTR(ret) : svc;
 }
 
-static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, int container_type,
+				struct ip_vs_dest *dest)
 {
 	struct nlattr *nl_dest;
 	struct ip_vs_kstats kstats;
 
-	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
+	nl_dest = nla_nest_start_noflag(skb, container_type);
 	if (!nl_dest)
 		return -EMSGSIZE;
 
@@ -3344,7 +3356,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (!hdr)
 		return -EMSGSIZE;
 
-	if (ip_vs_genl_fill_dest(skb, dest) < 0)
+	if (ip_vs_genl_fill_dest(skb, IPVS_CMD_ATTR_DEST, dest) < 0)
 		goto nla_put_failure;
 
 	genlmsg_end(skb, hdr);
@@ -3396,6 +3408,158 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 	return skb->len;
 }
 
+struct dump_services_dests_ctx {
+	int	idx_svc;
+	int	idx_dest;
+	int	start_svc;
+	int	start_dest;
+};
+
+static int ip_vs_genl_dump_service_dests(struct sk_buff *skb,
+					 struct netlink_callback *cb,
+					 struct netns_ipvs *ipvs,
+					 struct ip_vs_service *svc,
+					 struct dump_services_dests_ctx *ctx)
+{
+	void *hdr;
+	struct nlattr *nl_service;
+	struct nlattr *nl_dests;
+	struct ip_vs_dest *dest;
+	int ret = 0;
+
+	if (svc->ipvs != ipvs)
+		return 0;
+
+	if (++ctx->idx_svc <= ctx->start_svc)
+		return 0;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			  cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+			  NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
+	if (!hdr)
+		goto out_err;
+
+	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		goto nla_put_failure;
+
+	if (ip_vs_genl_put_service_attrs(skb, svc))
+		goto nla_nested_failure;
+
+	nl_dests = nla_nest_start_noflag(skb, IPVS_SVC_ATTR_DESTS);
+	if (!nl_dests)
+		goto nla_nested_failure;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++ctx->idx_dest <= ctx->start_dest)
+			continue;
+		if (ip_vs_genl_fill_dest(skb, IPVS_DESTS_ATTR_DEST, dest) < 0) {
+			ctx->idx_svc--;
+			ctx->idx_dest--;
+			ret = -EMSGSIZE;
+			goto nla_nested_end;
+		}
+	}
+	ctx->idx_dest = 0;
+	ctx->start_dest = 0;
+
+nla_nested_end:
+	nla_nest_end(skb, nl_dests);
+	nla_nest_end(skb, nl_service);
+	genlmsg_end(skb, hdr);
+	return ret;
+
+nla_nested_failure:
+	nla_nest_cancel(skb, nl_service);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+
+out_err:
+	ctx->idx_svc--;
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
+						 struct netlink_callback *cb)
+{
+	struct dump_services_dests_ctx ctx = {
+		.idx_svc = 0,
+		.start_svc = cb->args[0],
+		.idx_dest = 0,
+		.start_dest = cb->args[1],
+	};
+	struct net *net = sock_net(skb->sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+	int tab = cb->args[2];
+	int row = cb->args[3];
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs,
+				   IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy,
+				   cb->extack) == 0) {
+		if (attrs[IPVS_CMD_ATTR_SERVICE]) {
+			svc = ip_vs_genl_find_service(ipvs,
+						      attrs[IPVS_CMD_ATTR_SERVICE]);
+			if (IS_ERR_OR_NULL(svc))
+				goto out_err;
+			ip_vs_genl_dump_service_dests(skb, cb, ipvs, svc, &ctx);
+			goto nla_put_failure;
+		}
+	}
+
+	if (tab >= 2)
+		goto nla_put_failure;
+
+	if (tab >= 1)
+		goto tab_1;
+
+	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_table[row], s_list) {
+			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
+							  svc, &ctx))
+				goto nla_put_failure;
+		}
+		ctx.idx_svc = 0;
+		ctx.start_svc = 0;
+		ctx.idx_dest = 0;
+		ctx.start_dest = 0;
+	}
+
+	row = 0;
+	tab++;
+
+tab_1:
+	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
+		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[row], f_list) {
+			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
+							  svc, &ctx))
+				goto nla_put_failure;
+		}
+		ctx.idx_svc = 0;
+		ctx.start_svc = 0;
+		ctx.idx_dest = 0;
+		ctx.start_dest = 0;
+	}
+
+	row = 0;
+	tab++;
+
+nla_put_failure:
+	cb->args[0] = ctx.idx_svc;
+	cb->args[1] = ctx.idx_dest;
+	cb->args[2] = tab;
+	cb->args[3] = row;
+
+out_err:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
 				 struct nlattr *nla, bool full_entry)
 {
@@ -3991,6 +4155,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
 		.flags	= GENL_ADMIN_PERM,
 		.doit	= ip_vs_genl_set_cmd,
 	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_services_destinations,
+	},
 };
 
 static struct genl_family ip_vs_genl_family __ro_after_init = {
-- 
2.25.1
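
	As a rough illustration of the intended userspace flow (not part
of the patch itself), below is a minimal libnl-3 sketch that issues the
new IPVS_CMD_GET_SERVICE_DEST dump and walks each service's nested
destination list. merge_dest() is a hypothetical application helper and
error handling is abbreviated; because a service whose destinations
overflow one packet is re-sent in a later message with only the
remaining destinations (the kernel resumes via cb->args), the helper
is expected to merge by service identity.

#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/ip_vs.h>

/* Hypothetical application helper: merge one destination into the
 * entry identified by the service attributes. */
static void merge_dest(struct nlattr **svc, struct nlattr *dest)
{
}

static int dump_cb(struct nl_msg *msg, void *arg)
{
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
	struct nlattr *svc[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *dest;
	int rem;

	if (genlmsg_parse(nlmsg_hdr(msg), 0, attrs,
			  IPVS_CMD_ATTR_MAX, NULL) < 0)
		return NL_SKIP;
	if (!attrs[IPVS_CMD_ATTR_SERVICE])
		return NL_SKIP;
	if (nla_parse_nested(svc, IPVS_SVC_ATTR_MAX,
			     attrs[IPVS_CMD_ATTR_SERVICE], NULL) < 0)
		return NL_SKIP;
	if (!svc[IPVS_SVC_ATTR_DESTS])
		return NL_OK;

	/* The same service may appear again in a later message,
	 * carrying the destinations that did not fit here. */
	nla_for_each_nested(dest, svc[IPVS_SVC_ATTR_DESTS], rem)
		merge_dest(svc, dest);
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, IPVS_GENL_NAME);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, dump_cb, NULL);
	genl_send_simple(sk, family, IPVS_CMD_GET_SERVICE_DEST,
			 IPVS_GENL_VERSION, NLM_F_DUMP);
	nl_recvmsgs_default(sk);
	nl_socket_free(sk);
	return 0;
}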



* Re: [PATCH RFC v3] ipvs: add genetlink cmd to dump all services and destinations
  2020-11-10 14:45           ` [PATCH RFC v3] " Cezar Sa Espinola
@ 2020-11-15 12:50             ` Julian Anastasov
  0 siblings, 0 replies; 8+ messages in thread
From: Julian Anastasov @ 2020-11-15 12:50 UTC (permalink / raw)
  To: Cezar Sa Espinola
  Cc: Wensong Zhang, Simon Horman, Pablo Neira Ayuso, open list:IPVS,
	open list:IPVS, open list, open list:NETFILTER,
	open list:NETFILTER


	Hello,

On Tue, 10 Nov 2020, Cezar Sa Espinola wrote:

> A common operation for userspace applications managing ipvs is to dump all
> services and all destinations and then sort out what needs to be done.
> Previously this could only be accomplished by issuing 1 netlink
> IPVS_CMD_GET_SERVICE dump command followed by N IPVS_CMD_GET_DEST dump
> commands. For a dynamic system with a very large number of services this
> could cause a performance impact.
> 
> This patch introduces a new way of dumping all services and destinations
> with the new IPVS_CMD_GET_SERVICE_DEST command. A dump call for this
> command will send the services as IPVS_CMD_NEW_SERVICE messages, each
> containing a nested array of destinations with the new IPVS_SVC_ATTR_DESTS
> and IPVS_DESTS_ATTR_DEST attributes. Services may be repeated if their
> destinations do not fit in a single packet; user space should be
> responsible for merging the destinations from each repeated service.
> It's also possible to dump a single service and its destinations by sending
> a IPVS_CMD_ATTR_SERVICE argument to the dump command.
> 
> Signed-off-by: Cezar Sa Espinola <cezarsa@gmail.com>

	Thanks! Overall, I don't see problems with the logic
in the kernel patch or in the ipvsadm patch. Below you will
see only comments about styling. In the above patch
description you can include some benchmark info specifying
the range of speedup observed for different numbers of
services/dests, so that it can go into the commit log.

> ---
> Changes in v2:
> - Send destinations nested in a service.
> Changes in v3:
> - Avoid tab lookups at EOF and other requested changes.
> 
>  include/uapi/linux/ip_vs.h     |  17 ++-
>  net/netfilter/ipvs/ip_vs_ctl.c | 200 ++++++++++++++++++++++++++++++---
>  2 files changed, 201 insertions(+), 16 deletions(-)
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index 4102ddcb4e14..ce9bfa03b61b 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -331,6 +331,8 @@ enum {
>  	IPVS_CMD_ZERO,			/* zero all counters and stats */
>  	IPVS_CMD_FLUSH,			/* flush services and dests */
>  
> +	IPVS_CMD_GET_SERVICE_DEST,	/* get service and destination info */
> +
>  	__IPVS_CMD_MAX,
>  };
>  
> @@ -374,15 +376,28 @@ enum {
>  
>  	IPVS_SVC_ATTR_STATS64,		/* nested attribute for service stats */
>  
> +	IPVS_SVC_ATTR_DESTS,		/* nested destinations */
> +
>  	__IPVS_SVC_ATTR_MAX,
>  };
>  
>  #define IPVS_SVC_ATTR_MAX (__IPVS_SVC_ATTR_MAX - 1)
>  
> +enum {
> +	IPVS_DESTS_ATTR_UNSPEC = 0,
> +
> +	IPVS_DESTS_ATTR_DEST,	/* nested destination */
> +
> +	__IPVS_DESTS_ATTR_MAX,
> +};
> +
> +#define IPVS_DESTS_ATTR_MAX (__IPVS_DESTS_ATTR_MAX - 1)
> +
>  /*
>   * Attributes used to describe a destination (real server)
>   *
> - * Used inside nested attribute IPVS_CMD_ATTR_DEST
> + * Used inside nested attribute IPVS_CMD_ATTR_DEST and
> + * IPVS_DESTS_ATTR_DEST
>   */
>  enum {
>  	IPVS_DEST_ATTR_UNSPEC = 0,
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index e279ded4e306..65456eaaf11f 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -2981,6 +2981,7 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
>  	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
>  	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
>  	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
> +	[IPVS_SVC_ATTR_DESTS]		= { .type = NLA_NESTED },
>  };
>  
>  /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
> @@ -3070,31 +3071,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
>  	return -EMSGSIZE;
>  }
>  
> -static int ip_vs_genl_fill_service(struct sk_buff *skb,
> -				   struct ip_vs_service *svc)
> +static int ip_vs_genl_put_service_attrs(struct sk_buff *skb,
> +					struct ip_vs_service *svc)
>  {
>  	struct ip_vs_scheduler *sched;
>  	struct ip_vs_pe *pe;
> -	struct nlattr *nl_service;
>  	struct ip_vs_flags flags = { .flags = svc->flags,
>  				     .mask = ~0 };
>  	struct ip_vs_kstats kstats;
>  	char *sched_name;
>  
> -	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> -	if (!nl_service)
> -		return -EMSGSIZE;
> -
>  	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	if (svc->fwmark) {
>  		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
> -			goto nla_put_failure;
> +			return -EMSGSIZE;
>  	} else {
>  		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
>  		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
>  		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
> -			goto nla_put_failure;
> +			return -EMSGSIZE;
>  	}
>  
>  	sched = rcu_dereference_protected(svc->scheduler, 1);
> @@ -3105,11 +3101,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
>  	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
>  	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
>  	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	ip_vs_copy_stats(&kstats, &svc->stats);
>  	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
> -		goto nla_put_failure;
> +		return -EMSGSIZE;
>  	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
> +		return -EMSGSIZE;
> +
> +	return 0;
> +}
> +
> +static int ip_vs_genl_fill_service(struct sk_buff *skb,
> +				   struct ip_vs_service *svc)
> +{
> +	struct nlattr *nl_service;
> +
> +	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> +	if (!nl_service)
> +		return -EMSGSIZE;
> +
> +	if (ip_vs_genl_put_service_attrs(skb, svc))
>  		goto nla_put_failure;
>  
>  	nla_nest_end(skb, nl_service);
> @@ -3286,12 +3297,13 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
>  	return ret ? ERR_PTR(ret) : svc;
>  }
>  
> -static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
> +static int ip_vs_genl_fill_dest(struct sk_buff *skb, int container_type,
> +				struct ip_vs_dest *dest)
>  {
>  	struct nlattr *nl_dest;
>  	struct ip_vs_kstats kstats;
>  
> -	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
> +	nl_dest = nla_nest_start_noflag(skb, container_type);
>  	if (!nl_dest)
>  		return -EMSGSIZE;
>  
> @@ -3344,7 +3356,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
>  	if (!hdr)
>  		return -EMSGSIZE;
>  
> -	if (ip_vs_genl_fill_dest(skb, dest) < 0)
> +	if (ip_vs_genl_fill_dest(skb, IPVS_CMD_ATTR_DEST, dest) < 0)
>  		goto nla_put_failure;
>  
>  	genlmsg_end(skb, hdr);
> @@ -3396,6 +3408,158 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
>  	return skb->len;
>  }
>  
> +struct dump_services_dests_ctx {
> +	int	idx_svc;
> +	int	idx_dest;
> +	int	start_svc;
> +	int	start_dest;
> +};
> +
> +static int ip_vs_genl_dump_service_dests(struct sk_buff *skb,
> +					 struct netlink_callback *cb,
> +					 struct netns_ipvs *ipvs,
> +					 struct ip_vs_service *svc,
> +					 struct dump_services_dests_ctx *ctx)
> +{
> +	void *hdr;
> +	struct nlattr *nl_service;
> +	struct nlattr *nl_dests;
> +	struct ip_vs_dest *dest;
> +	int ret = 0;
> +
> +	if (svc->ipvs != ipvs)
> +		return 0;
> +
> +	if (++ctx->idx_svc <= ctx->start_svc)
> +		return 0;
> +
> +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
> +			  cb->nlh->nlmsg_seq, &ip_vs_genl_family,
> +			  NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
> +	if (!hdr)
> +		goto out_err;
> +
> +	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
> +	if (!nl_service)
> +		goto nla_put_failure;
> +
> +	if (ip_vs_genl_put_service_attrs(skb, svc))
> +		goto nla_nested_failure;
> +
> +	nl_dests = nla_nest_start_noflag(skb, IPVS_SVC_ATTR_DESTS);
> +	if (!nl_dests)
> +		goto nla_nested_failure;
> +
> +	list_for_each_entry(dest, &svc->destinations, n_list) {
> +		if (++ctx->idx_dest <= ctx->start_dest)
> +			continue;
> +		if (ip_vs_genl_fill_dest(skb, IPVS_DESTS_ATTR_DEST, dest) < 0) {
> +			ctx->idx_svc--;
> +			ctx->idx_dest--;
> +			ret = -EMSGSIZE;
> +			goto nla_nested_end;
> +		}
> +	}
> +	ctx->idx_dest = 0;
> +	ctx->start_dest = 0;
> +
> +nla_nested_end:
> +	nla_nest_end(skb, nl_dests);
> +	nla_nest_end(skb, nl_service);
> +	genlmsg_end(skb, hdr);
> +	return ret;
> +
> +nla_nested_failure:
> +	nla_nest_cancel(skb, nl_service);
> +
> +nla_put_failure:
> +	genlmsg_cancel(skb, hdr);
> +
> +out_err:
> +	ctx->idx_svc--;
> +	return -EMSGSIZE;
> +}
> +
> +static int ip_vs_genl_dump_services_destinations(struct sk_buff *skb,
> +						 struct netlink_callback *cb)
> +{
> +	struct dump_services_dests_ctx ctx = {
> +		.idx_svc = 0,
> +		.start_svc = cb->args[0],
> +		.idx_dest = 0,
> +		.start_dest = cb->args[1],
> +	};
> +	struct net *net = sock_net(skb->sk);
> +	struct netns_ipvs *ipvs = net_ipvs(net);
> +	struct ip_vs_service *svc;
> +	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
> +	int tab = cb->args[2];
> +	int row = cb->args[3];
> +
> +	mutex_lock(&__ip_vs_mutex);
> +
> +	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs,
> +				   IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy,
> +				   cb->extack) == 0) {
> +		if (attrs[IPVS_CMD_ATTR_SERVICE]) {
> +			svc = ip_vs_genl_find_service(ipvs,
> +						      attrs[IPVS_CMD_ATTR_SERVICE]);
> +			if (IS_ERR_OR_NULL(svc))
> +				goto out_err;
> +			ip_vs_genl_dump_service_dests(skb, cb, ipvs, svc, &ctx);
> +			goto nla_put_failure;

	Maybe we should use a different name for the above label;
we are not dealing with nla in this function.
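
	One possible shape (with hypothetical label names, keeping the
same flow) for the tail of ip_vs_genl_dump_services_destinations():

save_state:				/* was: nla_put_failure */
	cb->args[0] = ctx.idx_svc;
	cb->args[1] = ctx.idx_dest;
	cb->args[2] = tab;
	cb->args[3] = row;

out_unlock:				/* was: out_err */
	mutex_unlock(&__ip_vs_mutex);
	return skb->len;
}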

> +		}
> +	}
> +
> +	if (tab >= 2)
> +		goto nla_put_failure;
> +
> +	if (tab >= 1)
> +		goto tab_1;
> +
> +	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_table[row], s_list) {
> +			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
> +							  svc, &ctx))
> +				goto nla_put_failure;
> +		}
> +		ctx.idx_svc = 0;
> +		ctx.start_svc = 0;
> +		ctx.idx_dest = 0;
> +		ctx.start_dest = 0;
> +	}
> +
> +	row = 0;
> +	tab++;
> +
> +tab_1:
> +	for (; row < IP_VS_SVC_TAB_SIZE; row++) {
> +		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[row], f_list) {
> +			if (ip_vs_genl_dump_service_dests(skb, cb, ipvs,
> +							  svc, &ctx))
> +				goto nla_put_failure;
> +		}
> +		ctx.idx_svc = 0;
> +		ctx.start_svc = 0;
> +		ctx.idx_dest = 0;
> +		ctx.start_dest = 0;
> +	}
> +
> +	row = 0;
> +	tab++;
> +
> +nla_put_failure:
> +	cb->args[0] = ctx.idx_svc;
> +	cb->args[1] = ctx.idx_dest;
> +	cb->args[2] = tab;
> +	cb->args[3] = row;
> +
> +out_err:
> +	mutex_unlock(&__ip_vs_mutex);
> +
> +	return skb->len;
> +}
> +
>  static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
>  				 struct nlattr *nla, bool full_entry)
>  {
> @@ -3991,6 +4155,12 @@ static const struct genl_small_ops ip_vs_genl_ops[] = {
>  		.flags	= GENL_ADMIN_PERM,
>  		.doit	= ip_vs_genl_set_cmd,
>  	},
> +	{
> +		.cmd	= IPVS_CMD_GET_SERVICE_DEST,
> +		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
> +		.flags	= GENL_ADMIN_PERM,
> +		.dumpit	= ip_vs_genl_dump_services_destinations,
> +	},
>  };
>  
>  static struct genl_family ip_vs_genl_family __ro_after_init = {
> -- 
> 2.25.1

	Some comments for the ipvsadm patch; a short sketch of
several of these fixes follows the list below. Please post it
separately next time here on the list with its own subject, so
that we can comment on it inline.

- ipvs_get_services_dests(): the #ifdef can come before the
	declarations; try to use long-to-short lines (reverse xmas
	tree order) for variables in declarations
- print_service_entry(): no need to check d before free(d);
	free() handles NULL itself, just like kfree() in the kernel.
- ipvs_services_dests_parse_cb(): we should stop if realloc()
	fails; sadly, the existing code does not check the realloc()
	result, but new code should do it.
- ipvs_get_services_dests(): the kernel avoids assignments in an
	'if' condition, and we do the same for new code. You have to
	split such code into assignment + condition.
- there are extra parentheses in code such as sizeof(*(get->index));
	sizeof(*get->index) is fine instead, same for
	sizeof(get->index[0]). There are extra parens also in
	&(get->dests), etc.
- as the new code runs only for LIBIPVS_USE_NL, check that it is
	wrapped in the proper #ifdef in libipvs/libipvs.c. Make sure
	ipvsadm compiles without LIBIPVS_USE_NL.
- the extern keyword should not be used in .h files anymore.
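
	For illustration, a minimal sketch of a few of these points
(the struct and function names here are invented for the example,
they are not from the ipvsadm patch):

#include <stdlib.h>

struct dest_entry {
	unsigned int	addr;
	unsigned short	port;
};

struct dests_buf {
	struct dest_entry	*dests;
	size_t			count;
};

static int grow_dests(struct dests_buf *buf, size_t new_count)
{
	struct dest_entry *tmp;

	/* Assignment split out of the 'if' condition, realloc()
	 * result checked before use, sizeof without extra parens.
	 */
	tmp = realloc(buf->dests, new_count * sizeof(buf->dests[0]));
	if (!tmp) {
		/* free(NULL) is a no-op, no need to check first */
		free(buf->dests);
		buf->dests = NULL;
		buf->count = 0;
		return -1;
	}
	buf->dests = tmp;
	buf->count = new_count;
	return 0;
}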

	Some of the above styling issues are also reported by
linux# scripts/checkpatch.pl --strict /tmp/ipvsadm.patch

	As we try to apply to ipvsadm the same styling rules
that are used for networking code in the kernel, you should be
able to fix all such places with help from checkpatch.pl. You
probably know about this file:

Documentation/process/coding-style.rst

Regards

--
Julian Anastasov <ja@ssi.bg>



end of thread, other threads:[~2020-11-15 12:50 UTC | newest]

Thread overview: 8+ messages
-- links below jump to the message on this page --
2020-10-30 20:27 [PATCH RFC] ipvs: add genetlink cmd to dump all services and destinations Cezar Sa Espinola
2020-11-02 20:56 ` Julian Anastasov
2020-11-03 16:36   ` Cezar Sá Espinola
2020-11-03 19:18     ` Julian Anastasov
2020-11-06 15:40       ` [PATCH RFC v2] " Cezar Sa Espinola
2020-11-09 21:29         ` Julian Anastasov
2020-11-10 14:45           ` [PATCH RFC v3] " Cezar Sa Espinola
2020-11-15 12:50             ` Julian Anastasov
