* [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: mathieu.poirier @ 2014-02-01 22:11 UTC
  To: pablo, kaber, kadlec; +Cc: netfilter-devel, netfilter, fw, mathieu.poirier

From: Mathieu Poirier <mathieu.poirier@linaro.org>

Add packet and byte quota support.  Once a quota has been
reached, a notification is sent to user space that includes
the name of the accounting object along with the current byte
and packet count.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/netfilter/nfnetlink_acct.h      |  11 ++-
 include/uapi/linux/netfilter/nfnetlink.h      |   2 +
 include/uapi/linux/netfilter/nfnetlink_acct.h |   1 +
 include/uapi/linux/netfilter/xt_nfacct.h      |  15 ++++
 net/netfilter/Kconfig                         |   3 +-
 net/netfilter/nfnetlink_acct.c                |  35 ++++++---
 net/netfilter/xt_nfacct.c                     | 108 +++++++++++++++++++++++---
 7 files changed, 152 insertions(+), 23 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h
index b2e85e5..490b83e 100644
--- a/include/linux/netfilter/nfnetlink_acct.h
+++ b/include/linux/netfilter/nfnetlink_acct.h
@@ -3,11 +3,18 @@
 
 #include <uapi/linux/netfilter/nfnetlink_acct.h>
 
-
-struct nf_acct;
+struct nf_acct {
+	atomic64_t		pkts;
+	atomic64_t		bytes;
+	struct list_head	head;
+	atomic_t		refcnt;
+	char			name[NFACCT_NAME_MAX];
+	struct rcu_head		rcu_head;
+};
 
 struct nf_acct *nfnl_acct_find_get(const char *filter_name);
 void nfnl_acct_put(struct nf_acct *acct);
 void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct);
+void nfnl_quota_event(struct nf_acct *cur);
 
 #endif /* _NFNL_ACCT_H */
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 596ddd4..354a7e5 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -20,6 +20,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
 	NFNLGRP_NFTABLES,
 #define NFNLGRP_NFTABLES                NFNLGRP_NFTABLES
+	NFNLGRP_ACCT_QUOTA,
+#define NFNLGRP_ACCT_QUOTA		NFNLGRP_ACCT_QUOTA
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
diff --git a/include/uapi/linux/netfilter/nfnetlink_acct.h b/include/uapi/linux/netfilter/nfnetlink_acct.h
index c7b6269..ae8ea0a 100644
--- a/include/uapi/linux/netfilter/nfnetlink_acct.h
+++ b/include/uapi/linux/netfilter/nfnetlink_acct.h
@@ -19,6 +19,7 @@ enum nfnl_acct_type {
 	NFACCT_PKTS,
 	NFACCT_BYTES,
 	NFACCT_USE,
+	NFACCT_QUOTA,
 	__NFACCT_MAX
 };
 #define NFACCT_MAX (__NFACCT_MAX - 1)
diff --git a/include/uapi/linux/netfilter/xt_nfacct.h b/include/uapi/linux/netfilter/xt_nfacct.h
index 3e19c8a..23f536c 100644
--- a/include/uapi/linux/netfilter/xt_nfacct.h
+++ b/include/uapi/linux/netfilter/xt_nfacct.h
@@ -3,11 +3,26 @@
 
 #include <linux/netfilter/nfnetlink_acct.h>
 
+enum xt_quota_flags {
+	XT_NFACCT_QUOTA_PKTS	= 1 << 0,
+	XT_NFACCT_QUOTA		= 1 << 1,
+};
+
 struct nf_acct;
+struct nf_acct_quota;
 
 struct xt_nfacct_match_info {
 	char		name[NFACCT_NAME_MAX];
 	struct nf_acct	*nfacct;
 };
 
+struct xt_nfacct_match_info_v1 {
+	char		name[NFACCT_NAME_MAX];
+	struct nf_acct	*nfacct;
+
+	__u32 flags;
+	__aligned_u64 quota;
+	/* used internally by kernel */
+	struct nf_acct_quota	*priv __attribute__((aligned(8)));
+};
 #endif /* _XT_NFACCT_MATCH_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 748f304..ce184951 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1108,7 +1108,8 @@ config NETFILTER_XT_MATCH_NFACCT
 	select NETFILTER_NETLINK_ACCT
 	help
 	  This option allows you to use the extended accounting through
-	  nfnetlink_acct.
+	  nfnetlink_acct.  It is also possible to add quotas at the
+	  packet and byte level.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index c7b6d46..d825f60 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -29,15 +29,6 @@ MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
 
 static LIST_HEAD(nfnl_acct_list);
 
-struct nf_acct {
-	atomic64_t		pkts;
-	atomic64_t		bytes;
-	struct list_head	head;
-	atomic_t		refcnt;
-	char			name[NFACCT_NAME_MAX];
-	struct rcu_head		rcu_head;
-};
-
 static int
 nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
@@ -92,7 +83,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	return 0;
 }
 
-static int
+int
 nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		   int event, struct nf_acct *acct)
 {
@@ -134,6 +125,7 @@ nla_put_failure:
 	nlmsg_cancel(skb, nlh);
 	return -1;
 }
+EXPORT_SYMBOL_GPL(nfnl_acct_fill_info);
 
 static int
 nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
@@ -336,6 +328,29 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_update);
 
+void
+nfnl_quota_event(struct nf_acct *cur)
+{
+	int ret;
+	struct sk_buff *skb;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	ret = nfnl_acct_fill_info(skb, 0, 0, NFACCT_QUOTA,
+				 NFNL_MSG_ACCT_NEW, cur);
+
+	if (ret <= 0) {
+		kfree_skb(skb);
+		return;
+	}
+
+	netlink_broadcast(init_net.nfnl, skb, 0,
+			 NFNLGRP_ACCT_QUOTA, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(nfnl_quota_event);
+
 static int __init nfnl_acct_init(void)
 {
 	int ret;
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index b3be0ef..c557620 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -8,9 +8,12 @@
  */
 #include <linux/module.h>
 #include <linux/skbuff.h>
+#include <linux/spinlock.h>
 
+#include <net/netlink.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nfnetlink_acct.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/xt_nfacct.h>
 
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
@@ -19,6 +22,11 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_nfacct");
 MODULE_ALIAS("ip6t_nfacct");
 
+struct nf_acct_quota {
+	spinlock_t lock;
+	bool quota_reached; /* true if quota has been reached. */
+};
+
 static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_nfacct_match_info *info = par->targinfo;
@@ -28,6 +36,36 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
+static bool
+nfacct_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	u64 val;
+	const struct xt_nfacct_match_info_v1 *info = par->matchinfo;
+	bool ret = true;
+
+	nfnl_acct_update(skb, info->nfacct);
+
+	if (info->flags & XT_NFACCT_QUOTA) {
+		spin_lock_bh(&info->priv->lock);
+		val = (info->flags & XT_NFACCT_QUOTA_PKTS) ?
+				atomic64_read(&info->nfacct->pkts) :
+				atomic64_read(&info->nfacct->bytes);
+		if (val <= info->quota) {
+			ret = !ret;
+			info->priv->quota_reached = false;
+		}
+
+		if (val >= info->quota && !info->priv->quota_reached) {
+			info->priv->quota_reached = true;
+			nfnl_quota_event(info->nfacct);
+		}
+
+		spin_unlock_bh(&info->priv->lock);
+	}
+
+	return ret;
+}
+
 static int
 nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 {
@@ -44,6 +82,34 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 	return 0;
 }
 
+static int
+nfacct_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_nfacct_match_info_v1 *info = par->matchinfo;
+	struct nf_acct *nfacct;
+
+	nfacct = nfnl_acct_find_get(info->name);
+	if (nfacct == NULL) {
+		pr_info("xt_nfacct: invalid accounting object `%s'\n",
+		       info->name);
+		return -ENOENT;
+	}
+
+	info->nfacct = nfacct;
+
+	if ((info->flags & XT_NFACCT_QUOTA) && !info->priv) {
+		info->priv =  kmalloc(sizeof(struct nf_acct_quota), GFP_KERNEL);
+		if (info->priv == NULL) {
+			nfnl_acct_put(info->nfacct);
+			return -ENOMEM;
+		}
+		spin_lock_init(&info->priv->lock);
+		info->priv->quota_reached = false;
+	}
+
+	return 0;
+}
+
 static void
 nfacct_mt_destroy(const struct xt_mtdtor_param *par)
 {
@@ -52,24 +118,46 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par)
 	nfnl_acct_put(info->nfacct);
 }
 
-static struct xt_match nfacct_mt_reg __read_mostly = {
-	.name       = "nfacct",
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = nfacct_mt_checkentry,
-	.match      = nfacct_mt,
-	.destroy    = nfacct_mt_destroy,
-	.matchsize  = sizeof(struct xt_nfacct_match_info),
-	.me         = THIS_MODULE,
+static void
+nfacct_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+	const struct xt_nfacct_match_info_v1 *info = par->matchinfo;
+
+	nfnl_acct_put(info->nfacct);
+	kfree(info->priv);
+}
+
+static struct xt_match nfacct_mt_reg[] __read_mostly = {
+	{
+		.name       = "nfacct",
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = nfacct_mt_checkentry,
+		.match      = nfacct_mt,
+		.destroy    = nfacct_mt_destroy,
+		.matchsize  = sizeof(struct xt_nfacct_match_info),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "nfacct",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = nfacct_mt_checkentry_v1,
+		.match      = nfacct_mt_v1,
+		.destroy    = nfacct_mt_destroy_v1,
+		.matchsize  = sizeof(struct xt_nfacct_match_info_v1),
+		.me         = THIS_MODULE,
+	},
+
 };
 
 static int __init nfacct_mt_init(void)
 {
-	return xt_register_match(&nfacct_mt_reg);
+	return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
 }
 
 static void __exit nfacct_mt_exit(void)
 {
-	xt_unregister_match(&nfacct_mt_reg);
+	xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
 }
 
 module_init(nfacct_mt_init);
-- 
1.8.3.2



* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Florian Westphal @ 2014-02-01 22:57 UTC
  To: mathieu.poirier; +Cc: pablo, kaber, kadlec, netfilter-devel, netfilter, fw

mathieu.poirier@linaro.org <mathieu.poirier@linaro.org> wrote:
> +struct xt_nfacct_match_info_v1 {
> +	char		name[NFACCT_NAME_MAX];
> +	struct nf_acct	*nfacct;
> +
> +	__u32 flags;
> +	__aligned_u64 quota;
> +	/* used internally by kernel */
> +	struct nf_acct_quota	*priv __attribute__((aligned(8)));
> +};

I think the *nfacct pointer is also kernel internal, so it should also
get aligned (it might also be possible to stash it in the private
struct).

> +	if (info->flags & XT_NFACCT_QUOTA) {
> +		spin_lock_bh(&info->priv->lock);
> +		val = (info->flags & XT_NFACCT_QUOTA_PKTS) ?
> +				atomic64_read(&info->nfacct->pkts) :
> +				atomic64_read(&info->nfacct->bytes);
> +		if (val <= info->quota) {
> +			ret = !ret;
> +			info->priv->quota_reached = false;

Why quota_reached = false
assignment?

[ How is this toggled (other than transition to 'true' below)? ]

> +		if (val >= info->quota && !info->priv->quota_reached) {
> +			info->priv->quota_reached = true;
> +			nfnl_quota_event(info->nfacct);
> +		}
> +
> +		spin_unlock_bh(&info->priv->lock);
> +	}

Hm.  Not sure why the lock has to be held during all tests.  What about:

if (info->flags & XT_NFACCT_QUOTA) {
		val = (info->flags & XT_NFACCT_QUOTA_PKTS) ?
				atomic64_read(&info->nfacct->pkts) :
				atomic64_read(&info->nfacct->bytes);
		/* normal case: quota not reached */
		if (val <= info->quota)
			return !ret;

		/* other case: quota reached AND event sent */
		if (info->priv->quota_reached)
		       return ret;

		spin_lock_bh(&info->priv->lock);

		if (!info->priv->quota_reached) {
			info->priv->quota_reached = true;
			nfnl_quota_event(info->nfacct);
		}

		spin_unlock_bh(&info->priv->lock);

[ could also cmpxchg instead of spinlock ]
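
[ A minimal sketch of that cmpxchg variant, assuming the private
  struct carries an atomic_t "notified" flag (the flag name is
  illustrative; this is what Mathieu adopts later downthread):

	if (val >= info->quota &&
	    atomic_cmpxchg(&info->priv->notified, 0, 1) == 0)
		nfnl_quota_event(info->nfacct);

  only the first CPU that sees the quota exceeded wins the cmpxchg,
  so the event goes out once without taking the spinlock ]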

Other than that this looks good to me.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Mathieu Poirier @ 2014-02-02 17:58 UTC
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, Patrick McHardy, kadlec, netfilter-devel, netfilter

On 1 February 2014 15:57, Florian Westphal <fw@strlen.de> wrote:
> mathieu.poirier@linaro.org <mathieu.poirier@linaro.org> wrote:
>> +struct xt_nfacct_match_info_v1 {
>> +     char            name[NFACCT_NAME_MAX];
>> +     struct nf_acct  *nfacct;
>> +
>> +     __u32 flags;
>> +     __aligned_u64 quota;
>> +     /* used internally by kernel */
>> +     struct nf_acct_quota    *priv __attribute__((aligned(8)));
>> +};
>
> I think *nfacct pointer is also kernel internal, so it should also get
> aligned (might also be possible to stash it in *private struct).

I suspect Pablo left it that way for a good reason, and as such I'm
reluctant to make the modification myself.

>
>> +     if (info->flags & XT_NFACCT_QUOTA) {
>> +             spin_lock_bh(&info->priv->lock);
>> +             val = (info->flags & XT_NFACCT_QUOTA_PKTS) ?
>> +                             atomic64_read(&info->nfacct->pkts) :
>> +                             atomic64_read(&info->nfacct->bytes);
>> +             if (val <= info->quota) {
>> +                     ret = !ret;
>> +                     info->priv->quota_reached = false;
>
> Why quota_reached = false
> assignment?

An object that has reached its quota can always be reset from
userspace with:

$ nfacct get object_name reset

As such we need to reset the flag so that a broadcast isn't sent.

>
> [ How is this toggled (other than transition to 'true' below)? ]
>
>> +             if (val >= info->quota && !info->priv->quota_reached) {
>> +                     info->priv->quota_reached = true;
>> +                     nfnl_quota_event(info->nfacct);
>> +             }
>> +
>> +             spin_unlock_bh(&info->priv->lock);
>> +     }
>
> Hm.  Not sure why the lock has to be held during all tests.  What about:

Here "info" is an object seen and shared by all processors.  If a
process looses the CPU between the two atomic64_read, other CPUs in
the system can grab "info" and go through the same code, leading to an
erroneous count of byte and packets.

To me the real problem is with the incrementation of "pkts" and
"bytes" in function "nfnl_acct_update" - there is simply no way to
prevent a process from loosing the CPU between the two incrementation.
 I've been assured this can't happen on the INPUT and FORWARD path but
still waiting an answer on my scenario for the OUTPUT path.  If it
wasn't for that I think there is a way to remove the utilisation of
the spinlock.

>
> if (info->flags & XT_NFACCT_QUOTA) {
>                 val = (info->flags & XT_NFACCT_QUOTA_PKTS) ?
>                                 atomic64_read(&info->nfacct->pkts) :
>                                 atomic64_read(&info->nfacct->bytes);
>                 /* normal case: quota not reached */
>                 if (val <= info->quota)
>                         return !ret;
>
>                 /* other case: quota reached AND event sent */
>                 if (info->priv->quota_reached)
>                        return ret;
>
>                 spin_lock_bh(&info->priv->lock);
>
>                 if (!info->priv->quota_reached) {
>                         info->priv->quota_reached = true;
>                         nfnl_quota_event(info->nfacct);
>                 }
>
>                 spin_unlock_bh(&info->priv->lock);
>
> [ could also cmpxchg instead of spinlock ]
>
> Other than that this looks good to me.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Florian Westphal @ 2014-02-02 20:40 UTC
  To: Mathieu Poirier
  Cc: Florian Westphal, Pablo Neira Ayuso, Patrick McHardy, kadlec,
	netfilter-devel

Mathieu Poirier <mathieu.poirier@linaro.org> wrote:
[ removed netfilter@ from cc ]

> On 1 February 2014 15:57, Florian Westphal <fw@strlen.de> wrote:
> > mathieu.poirier@linaro.org <mathieu.poirier@linaro.org> wrote:
> >> +struct xt_nfacct_match_info_v1 {
> >> +     char            name[NFACCT_NAME_MAX];
> >> +     struct nf_acct  *nfacct;
> >> +
> >> +     __u32 flags;
> >> +     __aligned_u64 quota;
> >> +     /* used internally by kernel */
> >> +     struct nf_acct_quota    *priv __attribute__((aligned(8)));
> >> +};
> >
> > I think *nfacct pointer is also kernel internal, so it should also get
> > aligned (might also be possible to stash it in *private struct).
> 
> I suspect Pablo didn't change it already for a good reason and as such
> reluctant to do the modification myself.

Looks like an oversight to me.  I haven't checked, but my guess would
be that -m nfacct won't work with 32-bit userspace on 64-bit machines.

It would be better to avoid such problems with match revision 2.

> >> +             if (val <= info->quota) {
> >> +                     ret = !ret;
> >> +                     info->priv->quota_reached = false;
> >
> > Why quota_reached = false
> > assignment?
> 
> An object that has reached its quota can always be reset from
> userspace with:
> 
> $ nfacct get object_name reset

I see.  Makes sense, thanks.
 
> As such we need to reset the flag so that a broadcast isn't sent.
> > [ How is this toggled (other than transition to 'true' below)? ]
> >
> >> +             if (val >= info->quota && !info->priv->quota_reached) {
> >> +                     info->priv->quota_reached = true;
> >> +                     nfnl_quota_event(info->nfacct);
> >> +             }
> >> +
> >> +             spin_unlock_bh(&info->priv->lock);
> >> +     }
> >
> > Hm.  Not sure why the lock has to be held during all tests.  What about:
> 
> Here "info" is an object seen and shared by all processors.  If a
> process looses the CPU between the two atomic64_read, other CPUs in
> the system can grab "info" and go through the same code, leading to an
> erroneous count of byte and packets.
> To me the real problem is with the incrementation of "pkts" and
> "bytes" in function "nfnl_acct_update" - there is simply no way to
> prevent a process from loosing the CPU between the two incrementation.

Not following.  Write side is done via atomic ops, so we don't lose any
information.  Also, note that netfilter hooks run with rcu_read_lock().

It's possible that a concurrent reader sees pkts already incremented
but the "old" byte count.  Is that what you're referring to? Is it a
concern?

When you read either the packet or byte count you only know that this
_was_ the statistic count at some point in the (very recent) past anyway.

To me it looks like you only need to make sure that you send out a
broadcast notification message once, when one of the counters has
recently exceeded the given threshold.

Or is there more to this functionality?


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Mathieu Poirier @ 2014-02-03 23:40 UTC
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On 2 February 2014 13:40, Florian Westphal <fw@strlen.de> wrote:
> Mathieu Poirier <mathieu.poirier@linaro.org> wrote:
> [ removed netfilter@ from cc ]
>
>> On 1 February 2014 15:57, Florian Westphal <fw@strlen.de> wrote:
>> > mathieu.poirier@linaro.org <mathieu.poirier@linaro.org> wrote:
>> >> +struct xt_nfacct_match_info_v1 {
>> >> +     char            name[NFACCT_NAME_MAX];
>> >> +     struct nf_acct  *nfacct;
>> >> +
>> >> +     __u32 flags;
>> >> +     __aligned_u64 quota;
>> >> +     /* used internally by kernel */
>> >> +     struct nf_acct_quota    *priv __attribute__((aligned(8)));
>> >> +};
>> >
>> > I think *nfacct pointer is also kernel internal, so it should also get
>> > aligned (might also be possible to stash it in *private struct).
>>
>> I suspect Pablo didn't change it already for a good reason and as such
>> reluctant to do the modification myself.
>
> Looks like an oversight to me.  I haven't checked but my guess would
> be that -m nfacct won't work with 32bit userspace on 64 bit machines.

Ok, I'll do the modification.

>
> Would be better to avoid such problems with match revision 2.
>
>> >> +             if (val <= info->quota) {
>> >> +                     ret = !ret;
>> >> +                     info->priv->quota_reached = false;
>> >
>> > Why quota_reached = false
>> > assignment?
>>
>> An object that has reached its quota can always be reset from
>> userspace with:
>>
>> $ nfacct get object_name reset
>
> I see.  Makes sense,  thanks.
>
>> As such we need to reset the flag so that a broadcast isn't sent.
>> > [ How is this toggled (other than transition to 'true' below)? ]
>> >
>> >> +             if (val >= info->quota && !info->priv->quota_reached) {
>> >> +                     info->priv->quota_reached = true;
>> >> +                     nfnl_quota_event(info->nfacct);
>> >> +             }
>> >> +
>> >> +             spin_unlock_bh(&info->priv->lock);
>> >> +     }
>> >
>> > Hm.  Not sure why the lock has to be held during all tests.  What about:
>>
>> Here "info" is an object seen and shared by all processors.  If a
>> process looses the CPU between the two atomic64_read, other CPUs in
>> the system can grab "info" and go through the same code, leading to an
>> erroneous count of byte and packets.
>> To me the real problem is with the incrementation of "pkts" and
>> "bytes" in function "nfnl_acct_update" - there is simply no way to
>> prevent a process from loosing the CPU between the two incrementation.
>
> Not following.  Write side is done via atomic ops, so we don't lose any
> information.  Also, note that netfilter hooks run with rcu_read_lock().
>
> Its possible that concurrent reader sees pkts already incremented but
> the "old" byte count.  Is that what you're referring to? Is it a concern?
>
> When you read either the packet or byte count you only know that this
> _was_ the statistic count at some point in the (very recent) past anyway.
>
> To me it only looks like you need to make sure that you try to send out
> a broadcast notification message once when one of the counters has
> recently exceeded the given threshold.
>
> Or is there more to this functionality?

I just did a small experiment on a 5-CPU vexpress highlighting what I
see as a problem with "nfnl_acct_update":

1)  I set up a rule where packets are guaranteed to hit "nfnl_acct_update":
  $ iptables -I OUTPUT -p icmp -m nfacct --nfacct-name qaz --packet --quota 50000 --jump REJECT

2)  Instrumented "nfnl_acct_update" to look like this:

void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
{
	atomic64_inc(&nfacct->pkts);
	printk(KERN_ALERT "PID: %d on %d updated packets: %lld with byte: %lld skb->len: %d\n",
	       current->pid, smp_processor_id(),
	       atomic64_read(&nfacct->pkts),
	       atomic64_read(&nfacct->bytes), skb->len);
	atomic64_add(skb->len, &nfacct->bytes);
	printk(KERN_ALERT "PID: %d on %d updated bytes: %lld with packets: %lld skb->len: %d\n",
	       current->pid, smp_processor_id(),
	       atomic64_read(&nfacct->bytes),
	       atomic64_read(&nfacct->pkts), skb->len);
}


3) From the cmd line I started a "ping 192.168.1.1 > /dev/null &" that
I bound to each of my 5 CPUs.  As such I ping my gateway concurrently
on each CPU.

4) Here is the output:

1   [  108.224602] PID: 3199 on 3 updated packets: 1 with byte: 0 skb->len: 84
2   [  108.224917] PID: 3195 on 4 updated packets: 2 with byte: 0 skb->len: 84
3   [  108.224921] PID: 3195 on 4 updated bytes: 84 with packets: 2 skb->len: 84
4   [  108.227378] PID: 3204 on 4 updated packets: 3 with byte: 84 skb->len: 84
5   [  108.227382] PID: 3204 on 4 updated bytes: 168 with packets: 3 skb->len: 84
6   [  108.229297] PID: 3202 on 4 updated packets: 4 with byte: 168 skb->len: 84
7   [  108.229300] PID: 3202 on 4 updated bytes: 252 with packets: 4 skb->len: 84
8   [  108.232075] PID: 3196 on 4 updated packets: 5 with byte: 252 skb->len: 84
9   [  108.232079] PID: 3196 on 4 updated bytes: 336 with packets: 5 skb->len: 84
10 [  108.233988] PID: 3194 on 4 updated packets: 6 with byte: 336 skb->len: 84
11 [  108.233992] PID: 3194 on 4 updated bytes: 420 with packets: 6 skb->len: 84
12 [  108.236324] PID: 3198 on 4 updated packets: 7 with byte: 420 skb->len: 84
13 [  108.236328] PID: 3198 on 4 updated bytes: 504 with packets: 7 skb->len: 84
14 [  108.238648] PID: 3197 on 4 updated packets: 8 with byte: 504 skb->len: 84
15 [  108.238652] PID: 3197 on 4 updated bytes: 588 with packets: 8 skb->len: 84
16 [  108.240593] PID: 3203 on 4 updated packets: 9 with byte: 588 skb->len: 84
17 [  108.240596] PID: 3203 on 4 updated bytes: 672 with packets: 9 skb->len: 84
18 [  108.242973] PID: 3200 on 4 updated packets: 10 with byte: 672 skb->len: 84
19 [  108.242977] PID: 3200 on 4 updated bytes: 756 with packets: 10 skb->len: 84
20 [  108.245329] PID: 3201 on 4 updated packets: 11 with byte: 756 skb->len: 84
21 [  108.245333] PID: 3201 on 4 updated bytes: 840 with packets: 11 skb->len: 84
22 [  108.248056] PID: 3205 on 4 updated packets: 12 with byte: 840 skb->len: 84
23 [  108.248060] PID: 3205 on 4 updated bytes: 924 with packets: 12 skb->len: 84
24 [  108.687297] PID: 3199 on 3 updated bytes: 1008 with packets: 12 skb->len: 84
25 [  109.211928] PID: 3195 on 4 updated packets: 13 with byte: 1008 skb->len: 84
26 [  109.232575] PID: 3195 on 4 updated bytes: 1092 with packets: 13 skb->len: 84

A concurrency issue can be observed on line 2, where CPU4 reads a
byte count of 0 when it should really be 84, since CPU3 should have
had time to complete the byte increment.  On line 3 the byte count is
at 84 when it should be at 168.  The packet and byte counts stay out
of sync until line 24, where PID 3199 finally has a chance of
updating its byte count.  This is just an example - there were many
other cases...

If one was to implement quotas on a device where network traffic is
accounted for billing purposes, these numbers couldn't be relied
upon.  In my opinion the packet and byte increments must be protected
to make sure their values are always synchronised.  I don't really
mind if the information processed is a representation of a past
state, but I care that the byte and packet counts are coherent with
each other.

If you (or anyone else) agrees with my assessment I'll send a patch to
fix the problem.
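
For illustration, one possible shape for such a fix - this is only a
sketch, assuming the atomic64 fields in struct nf_acct become plain
u64 counters protected by a new per-object lock so that both are
always updated together:

void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
{
	/* assumes nf_acct gains a spinlock_t and plain u64 counters */
	spin_lock_bh(&nfacct->lock);
	nfacct->pkts++;
	nfacct->bytes += skb->len;
	spin_unlock_bh(&nfacct->lock);
}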


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Pablo Neira Ayuso @ 2014-02-12 10:33 UTC
  To: Mathieu Poirier
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On Mon, Feb 03, 2014 at 04:40:49PM -0700, Mathieu Poirier wrote:
> I just did a small experiment on a 5-CPU vexpress highlighting what I
> see as a problem with "nfnl_acct_update":
> 
> 1)  I setup a rule where packets are guaranteed to hit "nfnl_acct_update":
>   $ iptables -I OUTPUT -p icmp -m nfacct --nfacct-name qaz --packet
> --quota 50000 --jump REJECT
> 
> 2)  Instrumented  "nfnl_acct_update" to look like this:
> 
> void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
> {
>          atomic64_inc(&nfacct->pkts);
>          printk(KERN_ALERT "PID: %d on %d updated packets: %lld with
> byte: %lld skb->len: %d\n",
>                                          current->pid, smp_processor_id(),
>                                          nfacct->pkts, nfacct->bytes, skb->len);
>          atomic64_add(skb->len, &nfacct->bytes);
>          printk(KERN_ALERT "PID: %d on %d updated bytes: %lld with
> packets: %lld skb->len: %d\n",
>                                          current->pid, smp_processor_id(),
>                                          nfacct->bytes, nfacct->pkts, skb->len);
> }
> 
> 
> 3) From the cmd line I started a "ping 192.168.1.1 > /dev/null &" that
> I bounded to each of my 5 CPUs.  As such I ping my gateway
> concurrently on each CPU.
> 
> 4) Here is the output:
> 
> 1   [  108.224602] PID: 3199 on 3 updated packets: 1 with byte: 0 skb->len: 84
> 2   [  108.224917] PID: 3195 on 4 updated packets: 2 with byte: 0 skb->len: 84
> 3   [  108.224921] PID: 3195 on 4 updated bytes: 84 with packets: 2 skb->len: 84
> 4   [  108.227378] PID: 3204 on 4 updated packets: 3 with byte: 84 skb->len: 84
> 5   [  108.227382] PID: 3204 on 4 updated bytes: 168 with packets: 3
> skb->len: 84
> 6   [  108.229297] PID: 3202 on 4 updated packets: 4 with byte: 168 skb->len: 84
> 7   [  108.229300] PID: 3202 on 4 updated bytes: 252 with packets: 4
> skb->len: 84
> 8   [  108.232075] PID: 3196 on 4 updated packets: 5 with byte: 252 skb->len: 84
> 9   [  108.232079] PID: 3196 on 4 updated bytes: 336 with packets: 5
> skb->len: 84
> 10 [  108.233988] PID: 3194 on 4 updated packets: 6 with byte: 336 skb->len: 84
> 11 [  108.233992] PID: 3194 on 4 updated bytes: 420 with packets: 6 skb->len: 84
> 12 [  108.236324] PID: 3198 on 4 updated packets: 7 with byte: 420 skb->len: 84
> 13 [  108.236328] PID: 3198 on 4 updated bytes: 504 with packets: 7 skb->len: 84
> 14 [  108.238648] PID: 3197 on 4 updated packets: 8 with byte: 504 skb->len: 84
> 15 [  108.238652] PID: 3197 on 4 updated bytes: 588 with packets: 8 skb->len: 84
> 16 [  108.240593] PID: 3203 on 4 updated packets: 9 with byte: 588 skb->len: 84
> 17 [  108.240596] PID: 3203 on 4 updated bytes: 672 with packets: 9 skb->len: 84
> 18 [  108.242973] PID: 3200 on 4 updated packets: 10 with byte: 672 skb->len: 84
> 19 [  108.242977] PID: 3200 on 4 updated bytes: 756 with packets: 10
> skb->len: 84
> 20 [  108.245329] PID: 3201 on 4 updated packets: 11 with byte: 756 skb->len: 84
> 21 [  108.245333] PID: 3201 on 4 updated bytes: 840 with packets: 11
> skb->len: 84
> 22 [  108.248056] PID: 3205 on 4 updated packets: 12 with byte: 840 skb->len: 84
> 23 [  108.248060] PID: 3205 on 4 updated bytes: 924 with packets: 12
> skb->len: 84
> 24 [  108.687297] PID: 3199 on 3 updated bytes: 1008 with packets: 12
> skb->len: 84
> 25 [  109.211928] PID: 3195 on 4 updated packets: 13 with byte: 1008
> skb->len: 84
> 26 [  109.232575] PID: 3195 on 4 updated bytes: 1092 with packets: 13
> skb->len: 84
> 
> A concurrency issue can be observed on line two where CPU4 is reading
> a byte count of 0 when it should really be 84 since CPU3 should have
> had time to complete the byte increment.  On line 3 the byte count is
> at 84 when it should be at 168.  The count between packets and byte is
> kept out of sync until line 24 where PID 3199 finally has a chance of
> upgrading its byte count.  This is just an example - there were many
> other cases...

I can see that packet and byte counters are not in sync, but I don't
see why that should be a problem for your quota extension. Your
accounting is based on packet *OR* byte counters. Moreover, if you
inspect the result of the counter update via atomic_inc_return /
atomic_add_return, you can ensure that packets running out of the
quota limit will always match.
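
For instance, a rough sketch (untested; the helper name and the
"count_pkts" flag are made up for illustration) of an update path
that returns the counter a rule cares about:

static u64 nfnl_acct_update_return(const struct sk_buff *skb,
				   struct nf_acct *nfacct, bool count_pkts)
{
	u64 pkts = atomic64_inc_return(&nfacct->pkts);
	u64 bytes = atomic64_add_return(skb->len, &nfacct->bytes);

	return count_pkts ? pkts : bytes;
}

Each caller then gets back a unique, monotonically increasing value,
so exactly one packet observes the transition across the quota limit.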


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Mathieu Poirier @ 2014-02-12 16:58 UTC
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On 12 February 2014 03:33, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Mon, Feb 03, 2014 at 04:40:49PM -0700, Mathieu Poirier wrote:
>> I just did a small experiment on a 5-CPU vexpress highlighting what I
>> see as a problem with "nfnl_acct_update":
>>
>> 1)  I setup a rule where packets are guaranteed to hit "nfnl_acct_update":
>>   $ iptables -I OUTPUT -p icmp -m nfacct --nfacct-name qaz --packet
>> --quota 50000 --jump REJECT
>>
>> 2)  Instrumented  "nfnl_acct_update" to look like this:
>>
>> void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
>> {
>>          atomic64_inc(&nfacct->pkts);
>>          printk(KERN_ALERT "PID: %d on %d updated packets: %lld with
>> byte: %lld skb->len: %d\n",
>>                                          current->pid, smp_processor_id(),
>>                                          nfacct->pkts, nfacct->bytes, skb->len);
>>          atomic64_add(skb->len, &nfacct->bytes);
>>          printk(KERN_ALERT "PID: %d on %d updated bytes: %lld with
>> packets: %lld skb->len: %d\n",
>>                                          current->pid, smp_processor_id(),
>>                                          nfacct->bytes, nfacct->pkts, skb->len);
>> }
>>
>>
>> 3) From the cmd line I started a "ping 192.168.1.1 > /dev/null &" that
>> I bounded to each of my 5 CPUs.  As such I ping my gateway
>> concurrently on each CPU.
>>
>> 4) Here is the output:
>>
>> 1   [  108.224602] PID: 3199 on 3 updated packets: 1 with byte: 0 skb->len: 84
>> 2   [  108.224917] PID: 3195 on 4 updated packets: 2 with byte: 0 skb->len: 84
>> 3   [  108.224921] PID: 3195 on 4 updated bytes: 84 with packets: 2 skb->len: 84
>> 4   [  108.227378] PID: 3204 on 4 updated packets: 3 with byte: 84 skb->len: 84
>> 5   [  108.227382] PID: 3204 on 4 updated bytes: 168 with packets: 3
>> skb->len: 84
>> 6   [  108.229297] PID: 3202 on 4 updated packets: 4 with byte: 168 skb->len: 84
>> 7   [  108.229300] PID: 3202 on 4 updated bytes: 252 with packets: 4
>> skb->len: 84
>> 8   [  108.232075] PID: 3196 on 4 updated packets: 5 with byte: 252 skb->len: 84
>> 9   [  108.232079] PID: 3196 on 4 updated bytes: 336 with packets: 5
>> skb->len: 84
>> 10 [  108.233988] PID: 3194 on 4 updated packets: 6 with byte: 336 skb->len: 84
>> 11 [  108.233992] PID: 3194 on 4 updated bytes: 420 with packets: 6 skb->len: 84
>> 12 [  108.236324] PID: 3198 on 4 updated packets: 7 with byte: 420 skb->len: 84
>> 13 [  108.236328] PID: 3198 on 4 updated bytes: 504 with packets: 7 skb->len: 84
>> 14 [  108.238648] PID: 3197 on 4 updated packets: 8 with byte: 504 skb->len: 84
>> 15 [  108.238652] PID: 3197 on 4 updated bytes: 588 with packets: 8 skb->len: 84
>> 16 [  108.240593] PID: 3203 on 4 updated packets: 9 with byte: 588 skb->len: 84
>> 17 [  108.240596] PID: 3203 on 4 updated bytes: 672 with packets: 9 skb->len: 84
>> 18 [  108.242973] PID: 3200 on 4 updated packets: 10 with byte: 672 skb->len: 84
>> 19 [  108.242977] PID: 3200 on 4 updated bytes: 756 with packets: 10
>> skb->len: 84
>> 20 [  108.245329] PID: 3201 on 4 updated packets: 11 with byte: 756 skb->len: 84
>> 21 [  108.245333] PID: 3201 on 4 updated bytes: 840 with packets: 11
>> skb->len: 84
>> 22 [  108.248056] PID: 3205 on 4 updated packets: 12 with byte: 840 skb->len: 84
>> 23 [  108.248060] PID: 3205 on 4 updated bytes: 924 with packets: 12
>> skb->len: 84
>> 24 [  108.687297] PID: 3199 on 3 updated bytes: 1008 with packets: 12
>> skb->len: 84
>> 25 [  109.211928] PID: 3195 on 4 updated packets: 13 with byte: 1008
>> skb->len: 84
>> 26 [  109.232575] PID: 3195 on 4 updated bytes: 1092 with packets: 13
>> skb->len: 84
>>
>> A concurrency issue can be observed on line two where CPU4 is reading
>> a byte count of 0 when it should really be 84 since CPU3 should have
>> had time to complete the byte increment.  On line 3 the byte count is
>> at 84 when it should be at 168.  The count between packets and byte is
>> kept out of sync until line 24 where PID 3199 finally has a chance of
>> upgrading its byte count.  This is just an example - there were many
>> other cases...
>
> I can see that packet and byte counters are not in sync, but I don't
> see why that should be a problem for your quota extension. Your
> accounting is based on packet *OR* byte counters.

If we have a filter reporting the amount of packets and bytes that
have gone through a chain, the least we can do is make them coherent.
Otherwise the numbers simply can't be trusted.  Fixing that problem is
what I've done in the 4th version of my patchset, sent on Monday.

> Moreover, if you
> inspect the result of the counter update via atomic_inc_return /
> atomic_add_return, you can ensure that packets running out of the
> quota limit will always match.

Thanks to Florian Westphal's hint about 'cmpxchg', I came up with a
match algorithm that requires no spinlock and doesn't mandate that the
packet count be synchronised with bytes.  Before sending it out I'd
like us to reach a consensus on the above - I think the current
accounting method is broken and ought to be fixed.

If the current situation is fine with you I'll simply send another
version that works without counts being synchronised - but I really
think we can do better.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Pablo Neira Ayuso @ 2014-02-12 22:16 UTC
  To: Mathieu Poirier
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On Wed, Feb 12, 2014 at 09:58:28AM -0700, Mathieu Poirier wrote:
[...]
> > I can see that packet and byte counters are not in sync, but I don't
> > see why that should be a problem for your quota extension. Your
> > accounting is based on packet *OR* byte counters.
> 
> If we have a filter reporting the amount of packets and bytes that
> have gone through a chain the least we can do is make them coherent.
> Otherwise the numbers simply can't be trusted.  Fixing that problem is
> what I've done in the 4th version of my patchset sent on Monday.
>
> > Moreover, if you inspect the result of the counter update via
> > atomic_inc_return / atomic_add_return, you can ensure that packets
> > running out of the quota limit will always match.
> 
> Thanks for Florian Westphal's hint on 'cmpxchg' I came up with a match
> algorithm that requires no spinlock and doesn't mandate that packet
> count be synchronised with bytes.  Before sending it out I'd like us
> to reach a consensus on the above - I think the current accounting
> method is broken and ought to be fixed.

I really think we should skip that spinlock with atomic64. That
doesn't make sense to me.

> If the current situation is fine with you I'll simply send another
> version that works without counts being synchronised - but I really
> think we can do better.

Given the interferences that you described, I think you can use this
approach to ensure that only one event is delivered:

        if (old < info->quota && new >= info->quota &&
            !test_and_set_bit(0, info->quota_reached))
                deliver_event(...);

On top of that, there's another problem that needs to be solved now
that we have a quota_reached field in the info area. This needs a
"database of quota objects" that are identified by a name, otherwise
iptables rule updates will trigger a new (unnecessary) event every
time *any* new rule is added.

If you want to test what I mean, try this:

iptables -I INPUT -m quota --quota 1000 -j LOG --log-prefix "quota: "

then keep an eye on /var/log/messages, ping somewhere to generate
traffic. You should see log messages for each packet until the quota
is reached.  After a couple of ICMP packets, you will see no more logs
since the quota has been reached.

Now, keep monitoring /var/log/messages and add any rule, eg. iptables
-I INPUT. The quota matches again and you'll start seeing more log
messages since every rule update "resets" the internal state of the
match. This is a known problem. To solve this, you have to use a
specific quota object, identified by a name, with a refcnt. See
xt_hashlimit.c for instance. Note that the quota_reached can't be
added to the existing nfacct object since it's a valid configuration
to have two rules with different quota values that refer to the same
nfacct object (think of a scenario in which you want to throttle to
N Kbytes/s after X bytes have been consumed, then throttle to a
lower M Kbytes/s after X+Y bytes have been consumed).

Thus, the iptables command for this should look like:

iptables -I INPUT -p icmp \
        -m nfacct --mode packets --quota 100 --quota-name icmp-quota-100

In _mt_check you have to look up "icmp-quota-100" in the list of
quota objects to attach it to the rule. If it doesn't exist you create
it. In the _destroy path, you have to put the refcnt; if it reaches
zero, the object will be released. It may sound tricky, but this will
work fine in practice.
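
A rough sketch of what such a quota object database could look like
(untested; all names here are illustrative, modelled on the htable
handling in xt_hashlimit.c):

struct nf_acct_quota {
	struct list_head	head;
	atomic_t		refcnt;
	char			name[NFACCT_NAME_MAX];
	unsigned long		quota_reached;
};

static LIST_HEAD(nfacct_quota_list);
static DEFINE_MUTEX(nfacct_quota_mutex);

/* called from _mt_check: find an existing quota object by name and
 * bump its refcnt, or create a new one */
static struct nf_acct_quota *nfacct_quota_find_get(const char *name)
{
	struct nf_acct_quota *q;

	mutex_lock(&nfacct_quota_mutex);
	list_for_each_entry(q, &nfacct_quota_list, head) {
		if (!strncmp(q->name, name, NFACCT_NAME_MAX)) {
			atomic_inc(&q->refcnt);
			mutex_unlock(&nfacct_quota_mutex);
			return q;
		}
	}
	q = kzalloc(sizeof(*q), GFP_KERNEL);
	if (q != NULL) {
		strlcpy(q->name, name, NFACCT_NAME_MAX);
		atomic_set(&q->refcnt, 1);
		list_add(&q->head, &nfacct_quota_list);
	}
	mutex_unlock(&nfacct_quota_mutex);
	return q;
}

The _destroy path then calls atomic_dec_and_test() under the same
mutex and unlinks and frees the object once the refcnt drops to zero.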

Please, take the time to digest this and let us know if you have any
question. Thank you.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Pablo Neira Ayuso @ 2014-02-12 22:20 UTC
  To: Mathieu Poirier
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On Wed, Feb 12, 2014 at 11:16:31PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Feb 12, 2014 at 09:58:28AM -0700, Mathieu Poirier wrote:
> [...]
> > > I can see that packet and byte counters are not in sync, but I don't
> > > see why that should be a problem for your quota extension. Your
> > > accounting is based on packet *OR* byte counters.
> > 
> > If we have a filter reporting the amount of packets and bytes that
> > have gone through a chain the least we can do is make them coherent.
> > Otherwise the numbers simply can't be trusted.  Fixing that problem is
> > what I've done in the 4th version of my patchset sent on Monday.
> >
> > > Moreover, if you inspect the result of the counter update via
> > > atomic_inc_return / atomic_add_return, you can ensure that packets
> > > running out of the quota limit will always match.
> > 
> > Thanks for Florian Westphal's hint on 'cmpxchg' I came up with a match
> > algorithm that requires no spinlock and doesn't mandate that packet
> > count be synchronised with bytes.  Before sending it out I'd like us
> > to reach a consensus on the above - I think the current accounting
> > method is broken and ought to be fixed.
> 
> I really think we should skip that spinlock with atomic64. That
> doesn't make sense to me.
> 
> > If the current situation is fine with you I'll simply send another
> > version that works without counts being synchronised - but I really
> > think we can do better.
> 
> Given the interferences that you described, I think you can use this
> approach to ensure that only one event is delivered:
> 
>         if (old < info->quota && new >= info->quota &&
>             !test_and_set_bit(0, info->quota_reached))
>                 deliver_event(...);
> 
> On top of that, there's another problem that needs to be solved now
> that we have a quota_reached field in the info area. This needs a
> "database of quota objects" that are identified by a name, otherwise
> iptables rule updates will trigger a new (unnecessary event) event
> every time *any* new rule is added.
> 
> If you want to test what I mean, try this:
> 
> iptables -I INPUT -m quota --quota 1000 -j LOG --log-prefix "quota: "
> 
> then keep an eye on /var/log/messages, ping somewhere to generate
> traffic. You should see log messages for each packet until the quota
> is reached.  After a couple of ICMP packets, you will see no more logs
> since the quota has been reached.
> 
> Now, keep monitoring /var/log/messages and add any rule, eg. iptables
> -I INPUT. The quota matches again and you'll start seeing more log
> messages since every rule updates "resets" the internal state of the
> match. This is a known problem. To solve this, you have to use a
> specific quota object with a refcnt that is identified by a name. See
> xt_hashlimit.c for instance. Note that the quota_reached can't be
> added to the existing nfacct object since it's a valid configuration
> to have two rules using with different quota values that refer to the
> same nfacct object (think of a scenario in which you want to throttle
> to N Kbytes/s after X bytes has been consumed, then throttle to a
> lower M Kbytes/s after X+Y bytes has been consumed).
> 
> Thus, the iptables command for this should look like:
> 
> iptables -I INPUT -p icmp \
>         -m nfacct --mode packets --quota 100 --quota-name icmp-quota-100

This still needs --name icmp, of course.

Just to insist on the concept: the nfacct and the quota object should
be independent.

> In _mt_check you have to look up for "icmp-quota-100" in the list of
> quota objects to attach it to the rule. If it doesn't exist you create
> it. In the _destroy path, you have to put the refcnt, if it reaches
> zero, it will be released. It may sound tricky, but this will work
> fine in practise.
> 
> Please, take the time to digest this and let us know if you have any
> question. Thank you.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Mathieu Poirier @ 2014-02-14 15:20 UTC
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On 12 February 2014 15:16, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Wed, Feb 12, 2014 at 09:58:28AM -0700, Mathieu Poirier wrote:
> [...]
>> > I can see that packet and byte counters are not in sync, but I don't
>> > see why that should be a problem for your quota extension. Your
>> > accounting is based on packet *OR* byte counters.
>>
>> If we have a filter reporting the amount of packets and bytes that
>> have gone through a chain the least we can do is make them coherent.
>> Otherwise the numbers simply can't be trusted.  Fixing that problem is
>> what I've done in the 4th version of my patchset sent on Monday.
>>
>> > Moreover, if you inspect the result of the counter update via
>> > atomic_inc_return / atomic_add_return, you can ensure that packets
>> > running out of the quota limit will always match.
>>
>> Thanks for Florian Westphal's hint on 'cmpxchg' I came up with a match
>> algorithm that requires no spinlock and doesn't mandate that packet
>> count be synchronised with bytes.  Before sending it out I'd like us
>> to reach a consensus on the above - I think the current accounting
>> method is broken and ought to be fixed.
>
> I really think we should skip that spinlock with atomic64. That
> doesn't make sense to me.

I'm not fond of it either.

>
>> If the current situation is fine with you I'll simply send another
>> version that works without counts being synchronised - but I really
>> think we can do better.
>
> Given the interferences that you described, I think you can use this
> approach to ensure that only one event is delivered:
>
>         if (old < info->quota && new >= info->quota &&
>             !test_and_set_bit(0, info->quota_reached))
>                 deliver_event(...);

I had something similar, but I used atomic_cmpxchg() instead:

         if (val >= info->quota)
                 if (atomic_cmpxchg(&info->priv->notified, 0, 1) == 0)
                        nfnl_quota_event(info->nfacct);

Note that I've embedded 'notified' in a private field since userspace
doesn't need to know what we do in the kernel.  I've also used an
atomic value... Get back to me with your preference.

>
> On top of that, there's another problem that needs to be solved now
> that we have a quota_reached field in the info area. This needs a
> "database of quota objects" that are identified by a name, otherwise
> iptables rule updates will trigger a new (unnecessary event) event
> every time *any* new rule is added.
>
> If you want to test what I mean, try this:
>
> iptables -I INPUT -m quota --quota 1000 -j LOG --log-prefix "quota: "
>
> then keep an eye on /var/log/messages, ping somewhere to generate
> traffic. You should see log messages for each packet until the quota
> is reached.  After a couple of ICMP packets, you will see no more logs
> since the quota has been reached.
>
> Now, keep monitoring /var/log/messages and add any rule, eg. iptables
> -I INPUT. The quota matches again and you'll start seeing more log
> messages since every rule updates "resets" the internal state of the
> match. This is a known problem. To solve this, you have to use a
> specific quota object with a refcnt that is identified by a name. See
> xt_hashlimit.c for instance. Note that the quota_reached can't be
> added to the existing nfacct object since it's a valid configuration
> to have two rules using with different quota values that refer to the
> same nfacct object (think of a scenario in which you want to throttle
> to N Kbytes/s after X bytes has been consumed, then throttle to a
> lower M Kbytes/s after X+Y bytes has been consumed).

Thanks for pointing this out - I was able to reproduce the problem
with the quota filter.  I'll look at the effect on nfacct and will get
back to you if I have questions.

>
> Thus, the iptables command for this should look like:
>
> iptables -I INPUT -p icmp \
>         -m nfacct --mode packets --quota 100 --quota-name icmp-quota-100
>
> In _mt_check you have to look up for "icmp-quota-100" in the list of
> quota objects to attach it to the rule. If it doesn't exist you create
> it. In the _destroy path, you have to put the refcnt, if it reaches
> zero, it will be released. It may sound tricky, but this will work
> fine in practise.
>
> Please, take the time to digest this and let us know if you have any
> question. Thank you.


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Mathieu Poirier @ 2014-02-16  2:38 UTC
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On 12 February 2014 15:16, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Wed, Feb 12, 2014 at 09:58:28AM -0700, Mathieu Poirier wrote:
> [...]
>> > I can see that packet and byte counters are not in sync, but I don't
>> > see why that should be a problem for your quota extension. Your
>> > accounting is based on packet *OR* byte counters.
>>
>> If we have a filter reporting the amount of packets and bytes that
>> have gone through a chain the least we can do is make them coherent.
>> Otherwise the numbers simply can't be trusted.  Fixing that problem is
>> what I've done in the 4th version of my patchset sent on Monday.
>>
>> > Moreover, if you inspect the result of the counter update via
>> > atomic_inc_return / atomic_add_return, you can ensure that packets
>> > running out of the quota limit will always match.
>>
>> Thanks for Florian Westphal's hint on 'cmpxchg' I came up with a match
>> algorithm that requires no spinlock and doesn't mandate that packet
>> count be synchronised with bytes.  Before sending it out I'd like us
>> to reach a consensus on the above - I think the current accounting
>> method is broken and ought to be fixed.
>
> I really think we should skip that spinlock with atomic64. That
> doesn't make sense to me.
>
>> If the current situation is fine with you I'll simply send another
>> version that works without counts being synchronised - but I really
>> think we can do better.
>
> Given the interferences that you described, I think you can use this
> approach to ensure that only one event is delivered:
>
>         if (old < info->quota && new >= info->quota &&
>             !test_and_set_bit(0, info->quota_reached))
>                 deliver_event(...);
>
> On top of that, there's another problem that needs to be solved now
> that we have a quota_reached field in the info area. This needs a
> "database of quota objects" that are identified by a name, otherwise
> iptables rule updates will trigger a new (unnecessary event) event
> every time *any* new rule is added.
>
> If you want to test what I mean, try this:
>
> iptables -I INPUT -m quota --quota 1000 -j LOG --log-prefix "quota: "
>
> then keep an eye on /var/log/messages, ping somewhere to generate
> traffic. You should see log messages for each packet until the quota
> is reached.  After a couple of ICMP packets, you will see no more logs
> since the quota has been reached.
>
> Now, keep monitoring /var/log/messages and add any rule, eg. iptables
> -I INPUT. The quota matches again and you'll start seeing more log
> messages since every rule updates "resets" the internal state of the
> match. This is a known problem. To solve this, you have to use a
> specific quota object with a refcnt that is identified by a name. See
> xt_hashlimit.c for instance. Note that the quota_reached can't be
> added to the existing nfacct object since it's a valid configuration
> to have two rules using with different quota values that refer to the
> same nfacct object (think of a scenario in which you want to throttle
> to N Kbytes/s after X bytes has been consumed, then throttle to a
> lower M Kbytes/s after X+Y bytes has been consumed).
>
> Thus, the iptables command for this should look like:
>
> iptables -I INPUT -p icmp \
>         -m nfacct --mode packets --quota 100 --quota-name icmp-quota-100
>
> In _mt_check you have to look up for "icmp-quota-100" in the list of
> quota objects to attach it to the rule. If it doesn't exist you create
> it. In the _destroy path, you have to put the refcnt, if it reaches
> zero, it will be released. It may sound tricky, but this will work
> fine in practise.
>
> Please, take the time to digest this and let us know if you have any
> question. Thank you.

I traced things a little and I understand exactly what is happening -
I will implement the same mechanism as in hashlimit.  Since accounting
objects can be deleted from nfnetlink_acct, I suggest we add a notifier
to it.  That way objects in the database can be deleted when
accounting objects are deleted from userspace with nfacct.  What do
you think?


* Re: [PATCH RESEND v3] netfilter: xtables: add quota support to nfacct
From: Pablo Neira Ayuso @ 2014-02-16 10:01 UTC
  To: Mathieu Poirier
  Cc: Florian Westphal, Patrick McHardy, kadlec, netfilter-devel, John Stultz

On Sat, Feb 15, 2014 at 07:38:08PM -0700, Mathieu Poirier wrote:
> On 12 February 2014 15:16, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > On Wed, Feb 12, 2014 at 09:58:28AM -0700, Mathieu Poirier wrote:
> > [...]
> >> > I can see that packet and byte counters are not in sync, but I don't
> >> > see why that should be a problem for your quota extension. Your
> >> > accounting is based on packet *OR* byte counters.
> >>
> >> If we have a filter reporting the amount of packets and bytes that
> >> have gone through a chain the least we can do is make them coherent.
> >> Otherwise the numbers simply can't be trusted.  Fixing that problem is
> >> what I've done in the 4th version of my patchset sent on Monday.
> >>
> >> > Moreover, if you inspect the result of the counter update via
> >> > atomic_inc_return / atomic_add_return, you can ensure that packets
> >> > running out of the quota limit will always match.
> >>
> >> Thanks for Florian Westphal's hint on 'cmpxchg' I came up with a match
> >> algorithm that requires no spinlock and doesn't mandate that packet
> >> count be synchronised with bytes.  Before sending it out I'd like us
> >> to reach a consensus on the above - I think the current accounting
> >> method is broken and ought to be fixed.
> >
> > I really think we should skip that spinlock with atomic64. That
> > doesn't make sense to me.
> >
> >> If the current situation is fine with you I'll simply send another
> >> version that works without counts being synchronised - but I really
> >> think we can do better.
> >
> > Given the interferences that you described, I think you can use this
> > approach to ensure that only one event is delivered:
> >
> >         if (old < info->quota && new >= info->quota &&
> >             !test_and_set_bit(0, info->quota_reached))
> >                 deliver_event(...);
> >
> > On top of that, there's another problem that needs to be solved now
> > that we have a quota_reached field in the info area. This needs a
> > "database of quota objects" that are identified by a name, otherwise
> > iptables rule updates will trigger a new (unnecessary event) event
> > every time *any* new rule is added.
> >
> > If you want to test what I mean, try this:
> >
> > iptables -I INPUT -m quota --quota 1000 -j LOG --log-prefix "quota: "
> >
> > then keep an eye on /var/log/messages, ping somewhere to generate
> > traffic. You should see log messages for each packet until the quota
> > is reached.  After a couple of ICMP packets, you will see no more logs
> > since the quota has been reached.
> >
> > Now, keep monitoring /var/log/messages and add any rule, eg. iptables
> > -I INPUT. The quota matches again and you'll start seeing more log
> > messages since every rule updates "resets" the internal state of the
> > match. This is a known problem. To solve this, you have to use a
> > specific quota object with a refcnt that is identified by a name. See
> > xt_hashlimit.c for instance. Note that the quota_reached can't be
> > added to the existing nfacct object since it's a valid configuration
> > to have two rules using with different quota values that refer to the
> > same nfacct object (think of a scenario in which you want to throttle
> > to N Kbytes/s after X bytes has been consumed, then throttle to a
> > lower M Kbytes/s after X+Y bytes has been consumed).
> >
> > Thus, the iptables command for this should look like:
> >
> > iptables -I INPUT -p icmp \
> >         -m nfacct --mode packets --quota 100 --quota-name icmp-quota-100
> >
> > In _mt_check you have to look up for "icmp-quota-100" in the list of
> > quota objects to attach it to the rule. If it doesn't exist you create
> > it. In the _destroy path, you have to put the refcnt, if it reaches
> > zero, it will be released. It may sound tricky, but this will work
> > fine in practise.
> >
> > Please, take the time to digest this and let us know if you have any
> > question. Thank you.
> 
> I traced things a little and I understand exactly what is happening -
> I will implement the same mechanism as in hashlimit.  Since accounting
> objects can be deleted from nfnetlink_acct, I suggest we add a notifier
> to it.  That way objects in the database can be deleted when
> accounting objects are deleted from userspace with nfacct.  What do
> you think?

Just return -EBUSY if you try to delete an object that is in use. Bump
the refcount of the nfacct object when the quota object points to it.
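
In code, the delete path boils down to something like this (a sketch;
the nfnl mutex already protects the object list there):

static int nfnl_acct_try_del(struct nf_acct *cur)
{
	int ret = 0;

	/* the list itself holds one reference */
	if (atomic_dec_and_test(&cur->refcnt)) {
		list_del_rcu(&cur->head);
		kfree_rcu(cur, rcu_head);
	} else {
		/* still in use by a quota object: restore and bail out */
		atomic_inc(&cur->refcnt);
		ret = -EBUSY;
	}
	return ret;
}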

