* [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
@ 2010-04-20 14:31 Changli Gao
  2010-04-20 14:57 ` Patrick McHardy
  2010-04-21 20:23 ` Paul E. McKenney
From: Changli Gao @ 2010-04-20 14:31 UTC
  To: Patrick McHardy; +Cc: netfilter-devel, Eric Dumazet, Changli Gao

use hash table to speed up entry lookup

A hash table is used to speed up entry lookup when verdicts aren't received
in order. The size of the hash table can be specified via NFQA_CFG_QUEUE_HTBLSIZ;
its default value is 1. Reciprocal division is used to lower the cost of the
divide, and entry IDs are generated carefully so that entries are distributed
fairly across the buckets of the hash table.
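
(For illustration only: a minimal userspace sketch of how this scheme maps
IDs onto buckets. The table size and loop count are arbitrary example values,
and the plain division below is what reciprocal_divide() computes in the patch.)

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned int htblsiz = 8;	/* arbitrary example table size */
	unsigned int id_increment = INT_MAX / htblsiz;
	unsigned int id_limit = id_increment * htblsiz;
	unsigned int id = 0, offset = 0;
	int i;

	for (i = 0; i < 12; i++) {
		/* Same stepping as __get_uniq_id(): advance by id_increment,
		 * and on wrap restart from a new small offset. */
		id += id_increment;
		if (id >= id_limit) {
			if (++offset >= id_increment)
				offset = 0;
			id = offset;
		}
		/* bucket = id / id_increment, i.e. reciprocal_divide() */
		printf("id=%10u -> bucket %u\n", id, id / id_increment);
	}
	return 0;
}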

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/netfilter/nfnetlink_queue.h |    1 
 init/Kconfig                              |    1 
 lib/Kconfig                               |    3 
 lib/Makefile                              |    4 
 lib/reciprocal_div.c                      |    2 
 net/netfilter/Kconfig                     |    1 
 net/netfilter/nfnetlink_queue.c           |  252 +++++++++++++++++++++++++-----
 7 files changed, 227 insertions(+), 37 deletions(-)
diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
index 2455fe5..77b1566 100644
--- a/include/linux/netfilter/nfnetlink_queue.h
+++ b/include/linux/netfilter/nfnetlink_queue.h
@@ -83,6 +83,7 @@ enum nfqnl_attr_config {
 	NFQA_CFG_CMD,			/* nfqnl_msg_config_cmd */
 	NFQA_CFG_PARAMS,		/* nfqnl_msg_config_params */
 	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
+	NFQA_CFG_QUEUE_HTBLSIZ,		/* __u32 */
 	__NFQA_CFG_MAX
 };
 #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
diff --git a/init/Kconfig b/init/Kconfig
index cb6069e..4b4266f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1059,6 +1059,7 @@ choice
 
 config SLAB
 	bool "SLAB"
+	select RECIPROCAL_DIV
 	help
 	  The regular slab allocator that is established and known to work
 	  well in all environments. It organizes cache hot objects in
diff --git a/lib/Kconfig b/lib/Kconfig
index af12831..0c4b5ec 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -231,4 +231,7 @@ config IOQ
 
 	 If unsure, say N
 
+config RECIPROCAL_DIV
+       bool
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 0a6ab6f..c3555bd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,7 +10,7 @@ endif
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
+	 sha1.o irq_regs.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o flex_array.o
 
@@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
 
 obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
 
+obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index 6a3bd48..39f2e5e 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -1,5 +1,6 @@
 #include <asm/div64.h>
 #include <linux/reciprocal_div.h>
+#include <linux/module.h>
 
 u32 reciprocal_value(u32 k)
 {
@@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
 	do_div(val, k);
 	return (u32)val;
 }
+EXPORT_SYMBOL(reciprocal_value);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 18d77b5..40b34d5 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
 	tristate "Netfilter NFQUEUE over NFNETLINK interface"
 	depends on NETFILTER_ADVANCED
 	select NETFILTER_NETLINK
+	select RECIPROCAL_DIV
 	help
 	  If this option is enabled, the kernel will include support
 	  for queueing packets via NFNETLINK.
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index e70a6ef..d3d02b7 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -28,6 +28,8 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_queue.h>
 #include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
 #include <net/sock.h>
 #include <net/netfilter/nf_queue.h>
 
@@ -37,11 +39,13 @@
 #include "../bridge/br_private.h"
 #endif
 
-#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_QMAX_DEFAULT	1024
+#define NFQNL_QHTBLSIZ_DEFAULT	1
 
 struct nfqnl_instance {
 	struct hlist_node hlist;		/* global list of queues */
 	struct rcu_head rcu;
+	struct work_struct work;
 
 	int peer_pid;
 	unsigned int queue_maxlen;
@@ -49,15 +53,21 @@ struct nfqnl_instance {
 	unsigned int queue_total;
 	unsigned int queue_dropped;
 	unsigned int queue_user_dropped;
+	unsigned int queue_htblsiz;
+	u32 reciprocal_value;
 
 	unsigned int id_sequence;		/* 'sequence' of pkt ids */
+	unsigned int id_increment;
+	unsigned int id_offset;
+	unsigned int id_limit;
 
 	u_int16_t queue_num;			/* number of this queue */
 	u_int8_t copy_mode;
 
 	spinlock_t lock;
 
-	struct list_head queue_list;		/* packets in queue */
+	struct list_head *queue_htbl;		/* packets in queue */
+	bool vmalloc;
 };
 
 typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
@@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
 	return NULL;
 }
 
+static void instance_destroy_work(struct work_struct *work)
+{
+	struct nfqnl_instance *inst;
+
+	inst = container_of(work, struct nfqnl_instance, work);
+	if (inst->vmalloc)
+		vfree(inst->queue_htbl);
+	else
+		kfree(inst->queue_htbl);
+	kfree(inst);
+	module_put(THIS_MODULE);
+}
+
 static struct nfqnl_instance *
 instance_create(u_int16_t queue_num, int pid)
 {
 	struct nfqnl_instance *inst;
-	unsigned int h;
+	unsigned int h, i;
 	int err;
 
-	spin_lock(&instances_lock);
-	if (instance_lookup(queue_num)) {
-		err = -EEXIST;
-		goto out_unlock;
-	}
-
+	rcu_read_unlock();
 	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
 	if (!inst) {
 		err = -ENOMEM;
-		goto out_unlock;
+		goto out_lock;
 	}
 
+	INIT_WORK(&inst->work, instance_destroy_work);
 	inst->queue_num = queue_num;
 	inst->peer_pid = pid;
 	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
 	inst->copy_range = 0xfffff;
 	inst->copy_mode = NFQNL_COPY_NONE;
 	spin_lock_init(&inst->lock);
-	INIT_LIST_HEAD(&inst->queue_list);
+	inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
+	inst->id_increment = INT_MAX / inst->queue_htblsiz;
+	inst->id_limit = inst->id_increment * inst->queue_htblsiz;
+	inst->reciprocal_value = reciprocal_value(inst->id_increment);
+	inst->queue_htbl = kmalloc(sizeof(struct list_head) *
+				   inst->queue_htblsiz, GFP_KERNEL);
+	if (inst->queue_htbl == NULL) {
+		inst->queue_htbl = vmalloc(sizeof(struct list_head) *
+					   inst->queue_htblsiz);
+		if (inst->queue_htbl == NULL) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		inst->vmalloc = true;
+	}
+	for (i = 0; i < inst->queue_htblsiz; i++)
+		INIT_LIST_HEAD(&inst->queue_htbl[i]);
 
 	if (!try_module_get(THIS_MODULE)) {
 		err = -EAGAIN;
 		goto out_free;
 	}
+	rcu_read_lock();
 
+	spin_lock(&instances_lock);
+	if (instance_lookup(queue_num)) {
+		err = -EEXIST;
+		spin_unlock(&instances_lock);
+		rcu_read_unlock();
+		goto out_free;
+	}
 	h = instance_hashfn(queue_num);
 	hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
-
 	spin_unlock(&instances_lock);
 
 	return inst;
 
 out_free:
+	if (inst->queue_htbl) {
+		if (inst->vmalloc)
+			vfree(inst->queue_htbl);
+		else
+			kfree(inst->queue_htbl);
+	}
 	kfree(inst);
-out_unlock:
-	spin_unlock(&instances_lock);
+out_lock:
+	rcu_read_lock();
 	return ERR_PTR(err);
 }
 
@@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
 						   rcu);
 
 	nfqnl_flush(inst, NULL, 0);
-	kfree(inst);
-	module_put(THIS_MODULE);
+	schedule_work(&inst->work);
 }
 
 static void
@@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
 	spin_unlock(&instances_lock);
 }
 
+static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
+					       unsigned int id)
+{
+	return &queue->queue_htbl[reciprocal_divide(id,
+						    queue->reciprocal_value)];
+}
+
+static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
+					   u32 id)
+{
+	struct nf_queue_entry *entry;
+	struct list_head *head;
+
+	head = nfqnl_head_get(queue, id);
+	list_for_each_entry(entry, head, list) {
+		if (entry->id == id)
+			return entry;
+	}
+
+	return NULL;
+}
+
+static u32 __get_uniq_id(struct nfqnl_instance *queue)
+{
+	u32 i;
+
+	for (i = 0; i < INT_MAX; i++) {
+		queue->id_sequence += queue->id_increment;
+		if (queue->id_sequence >= queue->id_limit) {
+			if (++queue->id_offset >= queue->id_increment)
+				queue->id_offset = 0;
+			queue->id_sequence = queue->id_offset;
+		}
+		if (__find_entry(queue, queue->id_sequence) == NULL)
+			return queue->id_sequence;
+	}
+
+	return INT_MAX;
+}
+
 static inline void
 __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
 {
-       list_add_tail(&entry->list, &queue->queue_list);
-       queue->queue_total++;
+	struct list_head *head;
+
+	head = nfqnl_head_get(queue, entry->id);
+	list_add_tail(&entry->list, head);
+	queue->queue_total++;
 }
 
 static struct nf_queue_entry *
 find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 {
-	struct nf_queue_entry *entry = NULL, *i;
+	struct nf_queue_entry *entry;
 
 	spin_lock_bh(&queue->lock);
-
-	list_for_each_entry(i, &queue->queue_list, list) {
-		if (i->id == id) {
-			entry = i;
-			break;
-		}
-	}
-
+	entry = __find_entry(queue, id);
 	if (entry) {
 		list_del(&entry->list);
 		queue->queue_total--;
 	}
-
 	spin_unlock_bh(&queue->lock);
 
 	return entry;
@@ -197,13 +279,22 @@ static void
 nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 {
 	struct nf_queue_entry *entry, *next;
+	unsigned int i, total;
+	struct list_head *head;
 
 	spin_lock_bh(&queue->lock);
-	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
-		if (!cmpfn || cmpfn(entry, data)) {
-			list_del(&entry->list);
-			queue->queue_total--;
-			nf_reinject(entry, NF_DROP);
+	total = queue->queue_total;
+	for (i = 0; i < queue->queue_htblsiz; i++) {
+		if (total < 1)
+			break;
+		head = &queue->queue_htbl[i];
+		list_for_each_entry_safe(entry, next, head, list) {
+			if (!cmpfn || cmpfn(entry, data)) {
+				list_del(&entry->list);
+				queue->queue_total--;
+				nf_reinject(entry, NF_DROP);
+			}
+			--total;
 		}
 	}
 	spin_unlock_bh(&queue->lock);
@@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 		break;
 	}
 
-	entry->id = queue->id_sequence++;
+	entry->id = __get_uniq_id(queue);
+	if (entry->id == INT_MAX) {
+		spin_unlock_bh(&queue->lock);
+		return NULL;
+	}
+	__enqueue_entry(queue, entry);
 
 	spin_unlock_bh(&queue->lock);
 
@@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 
 nlmsg_failure:
 nla_put_failure:
+	find_dequeue_entry(queue, entry->id);
 	if (skb)
 		kfree_skb(skb);
 	if (net_ratelimit())
@@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 		goto err_out_unlock;
 	}
 
-	__enqueue_entry(queue, entry);
-
 	spin_unlock_bh(&queue->lock);
 	return 0;
 
 err_out_free_nskb:
 	kfree_skb(nskb);
 err_out_unlock:
+	list_del(&entry->list);
+	queue->queue_total--;
 	spin_unlock_bh(&queue->lock);
 err_out:
 	return -1;
@@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
 	.outfn	= &nfqnl_enqueue_packet,
 };
 
+static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
+{
+	struct nfqnl_instance *queue;
+	unsigned int i, total;
+	struct list_head *h, *htbl;
+	bool is_vmalloc;
+	int err;
+
+	if (size < 1 || size > INT_MAX)
+		return -EINVAL;
+
+	rcu_read_unlock();
+	htbl = kmalloc(sizeof(*h) * size, GFP_KERNEL);
+	if (htbl == NULL) {
+		htbl = vmalloc(sizeof(*h) * size);
+		if (htbl == NULL) {
+			err = -ENOMEM;
+			goto out_lock;
+		}
+		is_vmalloc = true;
+	} else {
+		is_vmalloc = false;
+	}
+	rcu_read_lock();
+
+	queue = instance_lookup(queue_num);
+	if (!queue || queue->peer_pid != pid) {
+		err = -ENODEV;
+		goto out_free;
+	}
+
+	for (i = 0; i < size; i++)
+		INIT_LIST_HEAD(&htbl[i]);
+	spin_lock_bh(&queue->lock);
+	swap(size, queue->queue_htblsiz);
+	queue->id_increment = INT_MAX / queue->queue_htblsiz;
+	queue->id_limit = queue->id_increment * queue->queue_htblsiz;
+	if (queue->id_offset >= queue->id_increment)
+		queue->id_offset = 0;
+	if (queue->id_sequence >= queue->id_limit)
+		queue->id_sequence = queue->id_offset;
+	queue->reciprocal_value = reciprocal_value(queue->id_increment);
+	swap(htbl, queue->queue_htbl);
+	swap(is_vmalloc, queue->vmalloc);
+	total = queue->queue_total;
+	for (i = 0; i < size; i++) {
+		struct nf_queue_entry *entry, *next;
+
+		if (total < 1)
+			break;
+		h = &queue->queue_htbl[i];
+		list_for_each_entry_safe(entry, next, h, list) {
+			list_move_tail(&entry->list,
+				       nfqnl_head_get(queue, entry->id));
+			--total;
+		}
+	}
+	spin_unlock_bh(&queue->lock);
+	err = 0;
+
+out_free:
+	rcu_read_unlock();
+	if (is_vmalloc)
+		vfree(htbl);
+	else
+		kfree(htbl);
+out_lock:
+	rcu_read_lock();
+	return err;
+}
+
 static int
 nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		  const struct nlmsghdr *nlh,
@@ -772,6 +940,18 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		spin_unlock_bh(&queue->lock);
 	}
 
+	if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
+		__be32 *htblsiz;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+		htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);
+		ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
+					ntohl(*htblsiz));
+	}
+
 err_out_unlock:
 	rcu_read_unlock();
 	return ret;


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-20 14:31 [PATCH] nfnetlink_queue: use hash table to speed up entry lookup Changli Gao
@ 2010-04-20 14:57 ` Patrick McHardy
  2010-04-21  0:04   ` Changli Gao
  2010-04-21 20:23 ` Paul E. McKenney
From: Patrick McHardy @ 2010-04-20 14:57 UTC
  To: Changli Gao; +Cc: netfilter-devel, Eric Dumazet

Changli Gao wrote:
> use hash table to speed up entry lookup
> 
> A hash table is used to speed up entry lookup when the verdicts aren't received
> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
> Its default value is 1. Reciprocal division is used to lower the cost of
> division, and the entry IDs are generated carefully to get fair entry
> distribution in the buckets of the hash table.

> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
> +{
> +	u32 i;
> +
> +	for (i = 0; i < INT_MAX; i++) {
> +		queue->id_sequence += queue->id_increment;
> +		if (queue->id_sequence >= queue->id_limit) {
> +			if (++queue->id_offset >= queue->id_increment)
> +				queue->id_offset = 0;
> +			queue->id_sequence = queue->id_offset;
> +		}
> +		if (__find_entry(queue, queue->id_sequence) == NULL)
> +			return queue->id_sequence;

No freaking way. So you want to lower the overhead for your case
and everyone else has to pay the price? This means that every
existing user will now have to walk the entire queue of queued
packets for every new packet.

How about you start with something simple and try to optimize
later in case there are actually performance problems? That
probably means using a simple modulo operation for cases where
the hash table size is > 1.
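
Roughly, a plain counter with a modulo already spreads sequential IDs
evenly across the buckets; a quick userspace sketch (arbitrary example
values, not tested against the patch):

#include <stdio.h>

int main(void)
{
	unsigned int htblsiz = 8;	/* example size; 1 degenerates to a single list */
	unsigned int id_sequence = 0;
	unsigned int i;

	for (i = 0; i < 12; i++) {
		unsigned int id = id_sequence++;
		printf("id=%u -> bucket %u\n", id, id % htblsiz);
	}
	return 0;
}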


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-20 14:57 ` Patrick McHardy
@ 2010-04-21  0:04   ` Changli Gao
  2010-04-21 12:35     ` Patrick McHardy
From: Changli Gao @ 2010-04-21  0:04 UTC
  To: Patrick McHardy; +Cc: netfilter-devel, Eric Dumazet

On Tue, Apr 20, 2010 at 10:57 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>> use hash table to speed up entry lookup
>>
>> A hash table is used to speed up entry lookup when the verdicts aren't received
>> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
>> Its default value is 1. Reciprocal division is used to lower the cost of
>> division, and the entry IDs are generated carefully to get fair entry
>> distribution in the buckets of the hash table.
>
>> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
>> +{
>> +     u32 i;
>> +
>> +     for (i = 0; i < INT_MAX; i++) {
>> +             queue->id_sequence += queue->id_increment;
>> +             if (queue->id_sequence >= queue->id_limit) {
>> +                     if (++queue->id_offset >= queue->id_increment)
>> +                             queue->id_offset = 0;
>> +                     queue->id_sequence = queue->id_offset;
>> +             }
>> +             if (__find_entry(queue, queue->id_sequence) == NULL)
>> +                     return queue->id_sequence;
>
> No freaking way. So you want to lower the overhead for your case
> any everyone else has to pay the price? This means that every
> existing user will now have to walk the entire queue of queued
> packets for every new packet.

Oh, that is really bad news for the current users. But how do we avoid
duplicate IDs? Since we use a list (a hash table with 1 bucket), we must
bear this cost, although duplicate IDs in one queue are rare. How about
enlarging the default size of the hash table and scaling it with the
maximum size of the queue?

>
> How about you start with something simple and try to optimize
> later in case there are actually performance problems? That
> probably means use a simple modulo operation for cases where
> the hash table size is > 1.
>

I also think there are too many tricks in my code above, but Eric is
concerned about the performance of modulo.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-21  0:04   ` Changli Gao
@ 2010-04-21 12:35     ` Patrick McHardy
  2010-04-21 19:04       ` Eric Dumazet
  2010-05-01  0:05       ` Changli Gao
From: Patrick McHardy @ 2010-04-21 12:35 UTC
  To: Changli Gao; +Cc: netfilter-devel, Eric Dumazet

Changli Gao wrote:
> On Tue, Apr 20, 2010 at 10:57 PM, Patrick McHardy <kaber@trash.net> wrote:
>> Changli Gao wrote:
>>> use hash table to speed up entry lookup
>>>
>>> A hash table is used to speed up entry lookup when the verdicts aren't received
>>> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
>>> Its default value is 1. Reciprocal division is used to lower the cost of
>>> division, and the entry IDs are generated carefully to get fair entry
>>> distribution in the buckets of the hash table.
>>> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
>>> +{
>>> +     u32 i;
>>> +
>>> +     for (i = 0; i < INT_MAX; i++) {
>>> +             queue->id_sequence += queue->id_increment;
>>> +             if (queue->id_sequence >= queue->id_limit) {
>>> +                     if (++queue->id_offset >= queue->id_increment)
>>> +                             queue->id_offset = 0;
>>> +                     queue->id_sequence = queue->id_offset;
>>> +             }
>>> +             if (__find_entry(queue, queue->id_sequence) == NULL)
>>> +                     return queue->id_sequence;
>> No freaking way. So you want to lower the overhead for your case
>> any everyone else has to pay the price? This means that every
>> existing user will now have to walk the entire queue of queued
>> packets for every new packet.
> 
> Oh, it is really a bad news for the current users. But how to avoid
> duplicate IDs? Since we use list(hash table with 1 buckets), we must
> afford this cost, although it is rare there are duplicate IDs in one
> queue. How about enlarging the default size of the hash table, and
> change its size with the max size of queue?

All existing users I know of process packets in order, so there's
no need to worry about duplicate IDs.

Does it really matter in your case? I mean, how long do you intend
to keep packets in userspace? Even with 10 Mpps (which you're *very*
unlikely to reach with userspace queueing) it still won't wrap
around for ~430s.

>> How about you start with something simple and try to optimize
>> later in case there are actually performance problems? That
>> probably means use a simple modulo operation for cases where
>> the hash table size is > 1.
>>
> 
> I also think there are too much tricks in my code above, but Eric
> concerns the performance of modulo.

Alternatively we could round the hash size to the next power of
two to avoid this.
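
For instance (illustrative userspace sketch; the local helper stands in
for the kernel's roundup_pow_of_two()):

#include <stdio.h>

static unsigned int roundup_pow2(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int requested = 100;			/* arbitrary user-requested size */
	unsigned int htblsiz = roundup_pow2(requested);	/* rounded up to 128 */
	unsigned int mask = htblsiz - 1;
	unsigned int id;

	/* Bucket selection becomes a mask: no modulo, no reciprocal divide. */
	for (id = 0; id < 5; id++)
		printf("id=%u -> bucket %u\n", id, id & mask);
	return 0;
}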


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-21 12:35     ` Patrick McHardy
@ 2010-04-21 19:04       ` Eric Dumazet
  2010-05-01  0:05       ` Changli Gao
From: Eric Dumazet @ 2010-04-21 19:04 UTC
  To: Patrick McHardy; +Cc: Changli Gao, netfilter-devel

On Wednesday, 21 April 2010 at 14:35 +0200, Patrick McHardy wrote:

> > I also think there are too much tricks in my code above, but Eric
> > concerns the performance of modulo.
> 
> Alternatively we could round the hash size to the next power of
> two to avoid this.

Sure, and my comments were about flex_array()'s abuse of divides ;)




* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-20 14:31 [PATCH] nfnetlink_queue: use hash table to speed up entry lookup Changli Gao
  2010-04-20 14:57 ` Patrick McHardy
@ 2010-04-21 20:23 ` Paul E. McKenney
  2010-05-01  0:14   ` Changli Gao
From: Paul E. McKenney @ 2010-04-21 20:23 UTC
  To: Changli Gao; +Cc: Patrick McHardy, netfilter-devel, Eric Dumazet

On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
> use hash table to speed up entry lookup
> 
> A hash table is used to speed up entry lookup when the verdicts aren't received
> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
> Its default value is 1. Reciprocal division is used to lower the cost of
> division, and the entry IDs are generated carefully to get fair entry
> distribution in the buckets of the hash table.

A few questions interspersed below.

							Thanx, Paul

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  include/linux/netfilter/nfnetlink_queue.h |    1 
>  init/Kconfig                              |    1 
>  lib/Kconfig                               |    3 
>  lib/Makefile                              |    4 
>  lib/reciprocal_div.c                      |    2 
>  net/netfilter/Kconfig                     |    1 
>  net/netfilter/nfnetlink_queue.c           |  252 +++++++++++++++++++++++++-----
>  7 files changed, 227 insertions(+), 37 deletions(-)
> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
> index 2455fe5..77b1566 100644
> --- a/include/linux/netfilter/nfnetlink_queue.h
> +++ b/include/linux/netfilter/nfnetlink_queue.h
> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
>  	NFQA_CFG_CMD,			/* nfqnl_msg_config_cmd */
>  	NFQA_CFG_PARAMS,		/* nfqnl_msg_config_params */
>  	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
> +	NFQA_CFG_QUEUE_HTBLSIZ,		/* __u32 */
>  	__NFQA_CFG_MAX
>  };
>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> diff --git a/init/Kconfig b/init/Kconfig
> index cb6069e..4b4266f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1059,6 +1059,7 @@ choice
> 
>  config SLAB
>  	bool "SLAB"
> +	select RECIPROCAL_DIV
>  	help
>  	  The regular slab allocator that is established and known to work
>  	  well in all environments. It organizes cache hot objects in
> diff --git a/lib/Kconfig b/lib/Kconfig
> index af12831..0c4b5ec 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -231,4 +231,7 @@ config IOQ
> 
>  	 If unsure, say N
> 
> +config RECIPROCAL_DIV
> +       bool
> +
>  endmenu
> diff --git a/lib/Makefile b/lib/Makefile
> index 0a6ab6f..c3555bd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -10,7 +10,7 @@ endif
>  lib-y := ctype.o string.o vsprintf.o cmdline.o \
>  	 rbtree.o radix-tree.o dump_stack.o \
>  	 idr.o int_sqrt.o extable.o prio_tree.o \
> -	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
> +	 sha1.o irq_regs.o argv_split.o \
>  	 proportions.o prio_heap.o ratelimit.o show_mem.o \
>  	 is_single_threaded.o plist.o decompress.o flex_array.o
> 
> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
> 
>  obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
> 
> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
> +
>  hostprogs-y	:= gen_crc32table
>  clean-files	:= crc32table.h
> 
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index 6a3bd48..39f2e5e 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -1,5 +1,6 @@
>  #include <asm/div64.h>
>  #include <linux/reciprocal_div.h>
> +#include <linux/module.h>
> 
>  u32 reciprocal_value(u32 k)
>  {
> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
>  	do_div(val, k);
>  	return (u32)val;
>  }
> +EXPORT_SYMBOL(reciprocal_value);
> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 18d77b5..40b34d5 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
>  	tristate "Netfilter NFQUEUE over NFNETLINK interface"
>  	depends on NETFILTER_ADVANCED
>  	select NETFILTER_NETLINK
> +	select RECIPROCAL_DIV
>  	help
>  	  If this option is enabled, the kernel will include support
>  	  for queueing packets via NFNETLINK.
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index e70a6ef..d3d02b7 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> @@ -28,6 +28,8 @@
>  #include <linux/netfilter/nfnetlink.h>
>  #include <linux/netfilter/nfnetlink_queue.h>
>  #include <linux/list.h>
> +#include <linux/vmalloc.h>
> +#include <linux/reciprocal_div.h>
>  #include <net/sock.h>
>  #include <net/netfilter/nf_queue.h>
> 
> @@ -37,11 +39,13 @@
>  #include "../bridge/br_private.h"
>  #endif
> 
> -#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QMAX_DEFAULT	1024
> +#define NFQNL_QHTBLSIZ_DEFAULT	1
> 
>  struct nfqnl_instance {
>  	struct hlist_node hlist;		/* global list of queues */
>  	struct rcu_head rcu;
> +	struct work_struct work;
> 
>  	int peer_pid;
>  	unsigned int queue_maxlen;
> @@ -49,15 +53,21 @@ struct nfqnl_instance {
>  	unsigned int queue_total;
>  	unsigned int queue_dropped;
>  	unsigned int queue_user_dropped;
> +	unsigned int queue_htblsiz;
> +	u32 reciprocal_value;
> 
>  	unsigned int id_sequence;		/* 'sequence' of pkt ids */
> +	unsigned int id_increment;
> +	unsigned int id_offset;
> +	unsigned int id_limit;
> 
>  	u_int16_t queue_num;			/* number of this queue */
>  	u_int8_t copy_mode;
> 
>  	spinlock_t lock;
> 
> -	struct list_head queue_list;		/* packets in queue */
> +	struct list_head *queue_htbl;		/* packets in queue */
> +	bool vmalloc;
>  };
> 
>  typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
>  	return NULL;
>  }
> 
> +static void instance_destroy_work(struct work_struct *work)
> +{
> +	struct nfqnl_instance *inst;
> +
> +	inst = container_of(work, struct nfqnl_instance, work);
> +	if (inst->vmalloc)
> +		vfree(inst->queue_htbl);
> +	else
> +		kfree(inst->queue_htbl);
> +	kfree(inst);
> +	module_put(THIS_MODULE);
> +}
> +
>  static struct nfqnl_instance *
>  instance_create(u_int16_t queue_num, int pid)
>  {
>  	struct nfqnl_instance *inst;
> -	unsigned int h;
> +	unsigned int h, i;
>  	int err;
> 
> -	spin_lock(&instances_lock);
> -	if (instance_lookup(queue_num)) {
> -		err = -EEXIST;
> -		goto out_unlock;
> -	}
> -
> +	rcu_read_unlock();

This seems strange -- are all the callers aware that instance_create()
temporarily exits the RCU read-side critical section?

>  	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
>  	if (!inst) {
>  		err = -ENOMEM;
> -		goto out_unlock;
> +		goto out_lock;
>  	}
> 
> +	INIT_WORK(&inst->work, instance_destroy_work);
>  	inst->queue_num = queue_num;
>  	inst->peer_pid = pid;
>  	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
>  	inst->copy_range = 0xfffff;
>  	inst->copy_mode = NFQNL_COPY_NONE;
>  	spin_lock_init(&inst->lock);
> -	INIT_LIST_HEAD(&inst->queue_list);
> +	inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
> +	inst->id_increment = INT_MAX / inst->queue_htblsiz;
> +	inst->id_limit = inst->id_increment * inst->queue_htblsiz;
> +	inst->reciprocal_value = reciprocal_value(inst->id_increment);
> +	inst->queue_htbl = kmalloc(sizeof(struct list_head) *
> +				   inst->queue_htblsiz, GFP_KERNEL);
> +	if (inst->queue_htbl == NULL) {
> +		inst->queue_htbl = vmalloc(sizeof(struct list_head) *
> +					   inst->queue_htblsiz);
> +		if (inst->queue_htbl == NULL) {
> +			err = -ENOMEM;
> +			goto out_free;
> +		}
> +		inst->vmalloc = true;
> +	}
> +	for (i = 0; i < inst->queue_htblsiz; i++)
> +		INIT_LIST_HEAD(&inst->queue_htbl[i]);
> 
>  	if (!try_module_get(THIS_MODULE)) {
>  		err = -EAGAIN;
>  		goto out_free;
>  	}
> +	rcu_read_lock();
> 
> +	spin_lock(&instances_lock);
> +	if (instance_lookup(queue_num)) {
> +		err = -EEXIST;
> +		spin_unlock(&instances_lock);
> +		rcu_read_unlock();
> +		goto out_free;
> +	}
>  	h = instance_hashfn(queue_num);
>  	hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
> -
>  	spin_unlock(&instances_lock);
> 
>  	return inst;
> 
>  out_free:
> +	if (inst->queue_htbl) {
> +		if (inst->vmalloc)
> +			vfree(inst->queue_htbl);
> +		else
> +			kfree(inst->queue_htbl);
> +	}
>  	kfree(inst);
> -out_unlock:
> -	spin_unlock(&instances_lock);
> +out_lock:
> +	rcu_read_lock();
>  	return ERR_PTR(err);
>  }
> 
> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
>  						   rcu);
> 
>  	nfqnl_flush(inst, NULL, 0);
> -	kfree(inst);
> -	module_put(THIS_MODULE);
> +	schedule_work(&inst->work);
>  }
> 
>  static void
> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
>  	spin_unlock(&instances_lock);
>  }
> 
> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
> +					       unsigned int id)
> +{
> +	return &queue->queue_htbl[reciprocal_divide(id,
> +						    queue->reciprocal_value)];
> +}
> +
> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
> +					   u32 id)
> +{
> +	struct nf_queue_entry *entry;
> +	struct list_head *head;
> +
> +	head = nfqnl_head_get(queue, id);
> +	list_for_each_entry(entry, head, list) {
> +		if (entry->id == id)
> +			return entry;
> +	}
> +
> +	return NULL;
> +}
> +
> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
> +{
> +	u32 i;
> +
> +	for (i = 0; i < INT_MAX; i++) {
> +		queue->id_sequence += queue->id_increment;
> +		if (queue->id_sequence >= queue->id_limit) {
> +			if (++queue->id_offset >= queue->id_increment)
> +				queue->id_offset = 0;
> +			queue->id_sequence = queue->id_offset;
> +		}
> +		if (__find_entry(queue, queue->id_sequence) == NULL)
> +			return queue->id_sequence;
> +	}
> +
> +	return INT_MAX;
> +}
> +
>  static inline void
>  __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
>  {
> -       list_add_tail(&entry->list, &queue->queue_list);
> -       queue->queue_total++;
> +	struct list_head *head;
> +
> +	head = nfqnl_head_get(queue, entry->id);
> +	list_add_tail(&entry->list, head);
> +	queue->queue_total++;
>  }
> 
>  static struct nf_queue_entry *
>  find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
>  {
> -	struct nf_queue_entry *entry = NULL, *i;
> +	struct nf_queue_entry *entry;
> 
>  	spin_lock_bh(&queue->lock);
> -
> -	list_for_each_entry(i, &queue->queue_list, list) {
> -		if (i->id == id) {
> -			entry = i;
> -			break;
> -		}
> -	}
> -
> +	entry = __find_entry(queue, id);
>  	if (entry) {
>  		list_del(&entry->list);
>  		queue->queue_total--;
>  	}
> -
>  	spin_unlock_bh(&queue->lock);
> 
>  	return entry;
> @@ -197,13 +279,22 @@ static void
>  nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
>  {
>  	struct nf_queue_entry *entry, *next;
> +	unsigned int i, total;
> +	struct list_head *head;
> 
>  	spin_lock_bh(&queue->lock);
> -	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> -		if (!cmpfn || cmpfn(entry, data)) {
> -			list_del(&entry->list);
> -			queue->queue_total--;
> -			nf_reinject(entry, NF_DROP);
> +	total = queue->queue_total;
> +	for (i = 0; i < queue->queue_htblsiz; i++) {
> +		if (total < 1)
> +			break;
> +		head = &queue->queue_htbl[i];
> +		list_for_each_entry_safe(entry, next, head, list) {
> +			if (!cmpfn || cmpfn(entry, data)) {
> +				list_del(&entry->list);
> +				queue->queue_total--;
> +				nf_reinject(entry, NF_DROP);
> +			}
> +			--total;
>  		}
>  	}
>  	spin_unlock_bh(&queue->lock);
> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>  		break;
>  	}
> 
> -	entry->id = queue->id_sequence++;
> +	entry->id = __get_uniq_id(queue);
> +	if (entry->id == INT_MAX) {
> +		spin_unlock_bh(&queue->lock);
> +		return NULL;
> +	}
> +	__enqueue_entry(queue, entry);
> 
>  	spin_unlock_bh(&queue->lock);
> 
> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
> 
>  nlmsg_failure:
>  nla_put_failure:
> +	find_dequeue_entry(queue, entry->id);
>  	if (skb)
>  		kfree_skb(skb);
>  	if (net_ratelimit())
> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
>  		goto err_out_unlock;
>  	}
> 
> -	__enqueue_entry(queue, entry);
> -
>  	spin_unlock_bh(&queue->lock);
>  	return 0;
> 
>  err_out_free_nskb:
>  	kfree_skb(nskb);
>  err_out_unlock:
> +	list_del(&entry->list);
> +	queue->queue_total--;
>  	spin_unlock_bh(&queue->lock);
>  err_out:
>  	return -1;
> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
>  	.outfn	= &nfqnl_enqueue_packet,
>  };
> 
> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
> +{
> +	struct nfqnl_instance *queue;
> +	unsigned int i, total;
> +	struct list_head *h, *htbl;
> +	bool is_vmalloc;
> +	int err;
> +
> +	if (size < 1 || size > INT_MAX)
> +		return -EINVAL;
> +
> +	rcu_read_unlock();

Again, this seems strange.  As near as I can tell, the caller immediately
exits the RCU read-side critical section, so why not have the caller do
the exit immediately before calling nfqnl_htbl_resize()?

> +	htbl = kmalloc(sizeof(*h) * size, GFP_KERNEL);
> +	if (htbl == NULL) {
> +		htbl = vmalloc(sizeof(*h) * size);
> +		if (htbl == NULL) {
> +			err = -ENOMEM;
> +			goto out_lock;
> +		}
> +		is_vmalloc = true;
> +	} else {
> +		is_vmalloc = false;
> +	}
> +	rcu_read_lock();
> +
> +	queue = instance_lookup(queue_num);
> +	if (!queue || queue->peer_pid != pid) {
> +		err = -ENODEV;
> +		goto out_free;
> +	}
> +
> +	for (i = 0; i < size; i++)
> +		INIT_LIST_HEAD(&htbl[i]);
> +	spin_lock_bh(&queue->lock);
> +	swap(size, queue->queue_htblsiz);
> +	queue->id_increment = INT_MAX / queue->queue_htblsiz;
> +	queue->id_limit = queue->id_increment * queue->queue_htblsiz;
> +	if (queue->id_offset >= queue->id_increment)
> +		queue->id_offset = 0;
> +	if (queue->id_sequence >= queue->id_limit)
> +		queue->id_sequence = queue->id_offset;
> +	queue->reciprocal_value = reciprocal_value(queue->id_increment);
> +	swap(htbl, queue->queue_htbl);
> +	swap(is_vmalloc, queue->vmalloc);
> +	total = queue->queue_total;
> +	for (i = 0; i < size; i++) {
> +		struct nf_queue_entry *entry, *next;
> +
> +		if (total < 1)
> +			break;
> +		h = &queue->queue_htbl[i];
> +		list_for_each_entry_safe(entry, next, h, list) {
> +			list_move_tail(&entry->list,
> +				       nfqnl_head_get(queue, entry->id));
> +			--total;
> +		}
> +	}
> +	spin_unlock_bh(&queue->lock);
> +	err = 0;
> +
> +out_free:
> +	rcu_read_unlock();
> +	if (is_vmalloc)
> +		vfree(htbl);
> +	else
> +		kfree(htbl);
> +out_lock:
> +	rcu_read_lock();
> +	return err;
> +}
> +
>  static int
>  nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
>  		  const struct nlmsghdr *nlh,
> @@ -772,6 +940,18 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
>  		spin_unlock_bh(&queue->lock);
>  	}
> 
> +	if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
> +		__be32 *htblsiz;
> +
> +		if (!queue) {
> +			ret = -ENODEV;
> +			goto err_out_unlock;
> +		}
> +		htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);
> +		ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
> +					ntohl(*htblsiz));
> +	}
> +
>  err_out_unlock:
>  	rcu_read_unlock();
>  	return ret;


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-21 12:35     ` Patrick McHardy
  2010-04-21 19:04       ` Eric Dumazet
@ 2010-05-01  0:05       ` Changli Gao
  2010-05-01 16:42         ` Patrick McHardy
From: Changli Gao @ 2010-05-01  0:05 UTC
  To: Patrick McHardy; +Cc: netfilter-devel, Eric Dumazet

On Wed, Apr 21, 2010 at 8:35 PM, Patrick McHardy <kaber@trash.net> wrote:
>
> All existing users I know of process packets in order, so there's
> no need to worry about duplicate IDs.
>
> Does it really matter in your case? I mean, how long do you intend
> to keep packets in userspace? Even with 10 Mpps (which you're *very*
> unlikely to reach with userspace queueing) it still won't wrap
> around for ~430s.

Although it may not happen, we'd better deal with it. I have found a
way to speed up the duplicate-ID lookup. Since the IDs in a bucket are in
order, we can first check whether the new ID falls between the first_ID
and last_ID of that bucket; in most cases, we don't need to traverse the
whole bucket.
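
Something along these lines (untested sketch; the first_id/last_id
bookkeeping is an assumption on top of the posted patch):

#include <stdbool.h>

/*
 * If the candidate ID lies outside the [first_id, last_id] span of the
 * bucket, no queued entry in that bucket can collide with it, so the
 * list walk can be skipped.  If the IDs in the bucket have wrapped,
 * fall back to walking the whole bucket.
 */
static bool id_may_collide(unsigned int id, unsigned int first_id,
			   unsigned int last_id, bool bucket_empty)
{
	if (bucket_empty)
		return false;
	if (first_id <= last_id)		/* no wrap in this bucket */
		return id >= first_id && id <= last_id;
	return true;				/* wrapped: walk the list */
}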

>
> Alternatively we could round the hash size to the next power of
> two to avoid this.
>

It is a better idea. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-04-21 20:23 ` Paul E. McKenney
@ 2010-05-01  0:14   ` Changli Gao
From: Changli Gao @ 2010-05-01  0:14 UTC
  To: paulmck; +Cc: Patrick McHardy, netfilter-devel, Eric Dumazet

On Thu, Apr 22, 2010 at 4:23 AM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
>> use hash table to speed up entry lookup
>>
>> A hash table is used to speed up entry lookup when the verdicts aren't received
>> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
>> Its default value is 1. Reciprocal division is used to lower the cost of
>> division, and the entry IDs are generated carefully to get fair entry
>> distribution in the buckets of the hash table.
>
> A few questions interspersed below.
>
>                                                        Thanx, Paul
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ----
>>  include/linux/netfilter/nfnetlink_queue.h |    1
>>  init/Kconfig                              |    1
>>  lib/Kconfig                               |    3
>>  lib/Makefile                              |    4
>>  lib/reciprocal_div.c                      |    2
>>  net/netfilter/Kconfig                     |    1
>>  net/netfilter/nfnetlink_queue.c           |  252 +++++++++++++++++++++++++-----
>>  7 files changed, 227 insertions(+), 37 deletions(-)
>> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
>> index 2455fe5..77b1566 100644
>> --- a/include/linux/netfilter/nfnetlink_queue.h
>> +++ b/include/linux/netfilter/nfnetlink_queue.h
>> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
>>       NFQA_CFG_CMD,                   /* nfqnl_msg_config_cmd */
>>       NFQA_CFG_PARAMS,                /* nfqnl_msg_config_params */
>>       NFQA_CFG_QUEUE_MAXLEN,          /* __u32 */
>> +     NFQA_CFG_QUEUE_HTBLSIZ,         /* __u32 */
>>       __NFQA_CFG_MAX
>>  };
>>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index cb6069e..4b4266f 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1059,6 +1059,7 @@ choice
>>
>>  config SLAB
>>       bool "SLAB"
>> +     select RECIPROCAL_DIV
>>       help
>>         The regular slab allocator that is established and known to work
>>         well in all environments. It organizes cache hot objects in
>> diff --git a/lib/Kconfig b/lib/Kconfig
>> index af12831..0c4b5ec 100644
>> --- a/lib/Kconfig
>> +++ b/lib/Kconfig
>> @@ -231,4 +231,7 @@ config IOQ
>>
>>        If unsure, say N
>>
>> +config RECIPROCAL_DIV
>> +       bool
>> +
>>  endmenu
>> diff --git a/lib/Makefile b/lib/Makefile
>> index 0a6ab6f..c3555bd 100644
>> --- a/lib/Makefile
>> +++ b/lib/Makefile
>> @@ -10,7 +10,7 @@ endif
>>  lib-y := ctype.o string.o vsprintf.o cmdline.o \
>>        rbtree.o radix-tree.o dump_stack.o \
>>        idr.o int_sqrt.o extable.o prio_tree.o \
>> -      sha1.o irq_regs.o reciprocal_div.o argv_split.o \
>> +      sha1.o irq_regs.o argv_split.o \
>>        proportions.o prio_heap.o ratelimit.o show_mem.o \
>>        is_single_threaded.o plist.o decompress.o flex_array.o
>>
>> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
>>
>>  obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
>>
>> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
>> +
>>  hostprogs-y  := gen_crc32table
>>  clean-files  := crc32table.h
>>
>> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
>> index 6a3bd48..39f2e5e 100644
>> --- a/lib/reciprocal_div.c
>> +++ b/lib/reciprocal_div.c
>> @@ -1,5 +1,6 @@
>>  #include <asm/div64.h>
>>  #include <linux/reciprocal_div.h>
>> +#include <linux/module.h>
>>
>>  u32 reciprocal_value(u32 k)
>>  {
>> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
>>       do_div(val, k);
>>       return (u32)val;
>>  }
>> +EXPORT_SYMBOL(reciprocal_value);
>> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
>> index 18d77b5..40b34d5 100644
>> --- a/net/netfilter/Kconfig
>> +++ b/net/netfilter/Kconfig
>> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
>>       tristate "Netfilter NFQUEUE over NFNETLINK interface"
>>       depends on NETFILTER_ADVANCED
>>       select NETFILTER_NETLINK
>> +     select RECIPROCAL_DIV
>>       help
>>         If this option is enabled, the kernel will include support
>>         for queueing packets via NFNETLINK.
>> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
>> index e70a6ef..d3d02b7 100644
>> --- a/net/netfilter/nfnetlink_queue.c
>> +++ b/net/netfilter/nfnetlink_queue.c
>> @@ -28,6 +28,8 @@
>>  #include <linux/netfilter/nfnetlink.h>
>>  #include <linux/netfilter/nfnetlink_queue.h>
>>  #include <linux/list.h>
>> +#include <linux/vmalloc.h>
>> +#include <linux/reciprocal_div.h>
>>  #include <net/sock.h>
>>  #include <net/netfilter/nf_queue.h>
>>
>> @@ -37,11 +39,13 @@
>>  #include "../bridge/br_private.h"
>>  #endif
>>
>> -#define NFQNL_QMAX_DEFAULT 1024
>> +#define NFQNL_QMAX_DEFAULT   1024
>> +#define NFQNL_QHTBLSIZ_DEFAULT       1
>>
>>  struct nfqnl_instance {
>>       struct hlist_node hlist;                /* global list of queues */
>>       struct rcu_head rcu;
>> +     struct work_struct work;
>>
>>       int peer_pid;
>>       unsigned int queue_maxlen;
>> @@ -49,15 +53,21 @@ struct nfqnl_instance {
>>       unsigned int queue_total;
>>       unsigned int queue_dropped;
>>       unsigned int queue_user_dropped;
>> +     unsigned int queue_htblsiz;
>> +     u32 reciprocal_value;
>>
>>       unsigned int id_sequence;               /* 'sequence' of pkt ids */
>> +     unsigned int id_increment;
>> +     unsigned int id_offset;
>> +     unsigned int id_limit;
>>
>>       u_int16_t queue_num;                    /* number of this queue */
>>       u_int8_t copy_mode;
>>
>>       spinlock_t lock;
>>
>> -     struct list_head queue_list;            /* packets in queue */
>> +     struct list_head *queue_htbl;           /* packets in queue */
>> +     bool vmalloc;
>>  };
>>
>>  typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
>> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
>>       return NULL;
>>  }
>>
>> +static void instance_destroy_work(struct work_struct *work)
>> +{
>> +     struct nfqnl_instance *inst;
>> +
>> +     inst = container_of(work, struct nfqnl_instance, work);
>> +     if (inst->vmalloc)
>> +             vfree(inst->queue_htbl);
>> +     else
>> +             kfree(inst->queue_htbl);
>> +     kfree(inst);
>> +     module_put(THIS_MODULE);
>> +}
>> +
>>  static struct nfqnl_instance *
>>  instance_create(u_int16_t queue_num, int pid)
>>  {
>>       struct nfqnl_instance *inst;
>> -     unsigned int h;
>> +     unsigned int h, i;
>>       int err;
>>
>> -     spin_lock(&instances_lock);
>> -     if (instance_lookup(queue_num)) {
>> -             err = -EEXIST;
>> -             goto out_unlock;
>> -     }
>> -
>> +     rcu_read_unlock();
>
> This seems strange -- are all the callers aware that instance_create()
> temporarily exits the RCU read-side critical section?

There is only one caller, so it is aware. But it seems that I should
add a "_rcu_read_locked" suffix to instance_create().

>
>>       inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
>>       if (!inst) {
>>               err = -ENOMEM;
>> -             goto out_unlock;
>> +             goto out_lock;
>>       }
>>
>> +     INIT_WORK(&inst->work, instance_destroy_work);
>>       inst->queue_num = queue_num;
>>       inst->peer_pid = pid;
>>       inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
>>       inst->copy_range = 0xfffff;
>>       inst->copy_mode = NFQNL_COPY_NONE;
>>       spin_lock_init(&inst->lock);
>> -     INIT_LIST_HEAD(&inst->queue_list);
>> +     inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
>> +     inst->id_increment = INT_MAX / inst->queue_htblsiz;
>> +     inst->id_limit = inst->id_increment * inst->queue_htblsiz;
>> +     inst->reciprocal_value = reciprocal_value(inst->id_increment);
>> +     inst->queue_htbl = kmalloc(sizeof(struct list_head) *
>> +                                inst->queue_htblsiz, GFP_KERNEL);
>> +     if (inst->queue_htbl == NULL) {
>> +             inst->queue_htbl = vmalloc(sizeof(struct list_head) *
>> +                                        inst->queue_htblsiz);
>> +             if (inst->queue_htbl == NULL) {
>> +                     err = -ENOMEM;
>> +                     goto out_free;
>> +             }
>> +             inst->vmalloc = true;
>> +     }
>> +     for (i = 0; i < inst->queue_htblsiz; i++)
>> +             INIT_LIST_HEAD(&inst->queue_htbl[i]);
>>
>>       if (!try_module_get(THIS_MODULE)) {
>>               err = -EAGAIN;
>>               goto out_free;
>>       }
>> +     rcu_read_lock();
>>
>> +     spin_lock(&instances_lock);
>> +     if (instance_lookup(queue_num)) {
>> +             err = -EEXIST;
>> +             spin_unlock(&instances_lock);
>> +             rcu_read_unlock();
>> +             goto out_free;
>> +     }
>>       h = instance_hashfn(queue_num);
>>       hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
>> -
>>       spin_unlock(&instances_lock);
>>
>>       return inst;
>>
>>  out_free:
>> +     if (inst->queue_htbl) {
>> +             if (inst->vmalloc)
>> +                     vfree(inst->queue_htbl);
>> +             else
>> +                     kfree(inst->queue_htbl);
>> +     }
>>       kfree(inst);
>> -out_unlock:
>> -     spin_unlock(&instances_lock);
>> +out_lock:
>> +     rcu_read_lock();
>>       return ERR_PTR(err);
>>  }
>>
>> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
>>                                                  rcu);
>>
>>       nfqnl_flush(inst, NULL, 0);
>> -     kfree(inst);
>> -     module_put(THIS_MODULE);
>> +     schedule_work(&inst->work);
>>  }
>>
>>  static void
>> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
>>       spin_unlock(&instances_lock);
>>  }
>>
>> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
>> +                                            unsigned int id)
>> +{
>> +     return &queue->queue_htbl[reciprocal_divide(id,
>> +                                                 queue->reciprocal_value)];
>> +}
>> +
>> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
>> +                                        u32 id)
>> +{
>> +     struct nf_queue_entry *entry;
>> +     struct list_head *head;
>> +
>> +     head = nfqnl_head_get(queue, id);
>> +     list_for_each_entry(entry, head, list) {
>> +             if (entry->id == id)
>> +                     return entry;
>> +     }
>> +
>> +     return NULL;
>> +}
>> +
>> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
>> +{
>> +     u32 i;
>> +
>> +     for (i = 0; i < INT_MAX; i++) {
>> +             queue->id_sequence += queue->id_increment;
>> +             if (queue->id_sequence >= queue->id_limit) {
>> +                     if (++queue->id_offset >= queue->id_increment)
>> +                             queue->id_offset = 0;
>> +                     queue->id_sequence = queue->id_offset;
>> +             }
>> +             if (__find_entry(queue, queue->id_sequence) == NULL)
>> +                     return queue->id_sequence;
>> +     }
>> +
>> +     return INT_MAX;
>> +}
>> +
>>  static inline void
>>  __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
>>  {
>> -       list_add_tail(&entry->list, &queue->queue_list);
>> -       queue->queue_total++;
>> +     struct list_head *head;
>> +
>> +     head = nfqnl_head_get(queue, entry->id);
>> +     list_add_tail(&entry->list, head);
>> +     queue->queue_total++;
>>  }
>>
>>  static struct nf_queue_entry *
>>  find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
>>  {
>> -     struct nf_queue_entry *entry = NULL, *i;
>> +     struct nf_queue_entry *entry;
>>
>>       spin_lock_bh(&queue->lock);
>> -
>> -     list_for_each_entry(i, &queue->queue_list, list) {
>> -             if (i->id == id) {
>> -                     entry = i;
>> -                     break;
>> -             }
>> -     }
>> -
>> +     entry = __find_entry(queue, id);
>>       if (entry) {
>>               list_del(&entry->list);
>>               queue->queue_total--;
>>       }
>> -
>>       spin_unlock_bh(&queue->lock);
>>
>>       return entry;
>> @@ -197,13 +279,22 @@ static void
>>  nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
>>  {
>>       struct nf_queue_entry *entry, *next;
>> +     unsigned int i, total;
>> +     struct list_head *head;
>>
>>       spin_lock_bh(&queue->lock);
>> -     list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
>> -             if (!cmpfn || cmpfn(entry, data)) {
>> -                     list_del(&entry->list);
>> -                     queue->queue_total--;
>> -                     nf_reinject(entry, NF_DROP);
>> +     total = queue->queue_total;
>> +     for (i = 0; i < queue->queue_htblsiz; i++) {
>> +             if (total < 1)
>> +                     break;
>> +             head = &queue->queue_htbl[i];
>> +             list_for_each_entry_safe(entry, next, head, list) {
>> +                     if (!cmpfn || cmpfn(entry, data)) {
>> +                             list_del(&entry->list);
>> +                             queue->queue_total--;
>> +                             nf_reinject(entry, NF_DROP);
>> +                     }
>> +                     --total;
>>               }
>>       }
>>       spin_unlock_bh(&queue->lock);
>> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>>               break;
>>       }
>>
>> -     entry->id = queue->id_sequence++;
>> +     entry->id = __get_uniq_id(queue);
>> +     if (entry->id == INT_MAX) {
>> +             spin_unlock_bh(&queue->lock);
>> +             return NULL;
>> +     }
>> +     __enqueue_entry(queue, entry);
>>
>>       spin_unlock_bh(&queue->lock);
>>
>> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>>
>>  nlmsg_failure:
>>  nla_put_failure:
>> +     find_dequeue_entry(queue, entry->id);
>>       if (skb)
>>               kfree_skb(skb);
>>       if (net_ratelimit())
>> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
>>               goto err_out_unlock;
>>       }
>>
>> -     __enqueue_entry(queue, entry);
>> -
>>       spin_unlock_bh(&queue->lock);
>>       return 0;
>>
>>  err_out_free_nskb:
>>       kfree_skb(nskb);
>>  err_out_unlock:
>> +     list_del(&entry->list);
>> +     queue->queue_total--;
>>       spin_unlock_bh(&queue->lock);
>>  err_out:
>>       return -1;
>> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
>>       .outfn  = &nfqnl_enqueue_packet,
>>  };
>>
>> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
>> +{
>> +     struct nfqnl_instance *queue;
>> +     unsigned int i, total;
>> +     struct list_head *h, *htbl;
>> +     bool is_vmalloc;
>> +     int err;
>> +
>> +     if (size < 1 || size > INT_MAX)
>> +             return -EINVAL;
>> +
>> +     rcu_read_unlock();
>
> Again, this seems strange.  As near as I can tell, the caller immediately
> exits the RCU read-side critical section, so why not have the caller do
> the exit immediately before calling nfqnl_htbl_resize()?
>

It sounds reasonable. Thanks.
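
I.e., nfqnl_recv_config() could do something like this (untested sketch
of the suggested restructuring; nfqnl_htbl_resize() would then drop its
internal rcu_read_unlock()/rcu_read_lock() pair):

	if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
		__be32 *htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);

		/* Exit the RCU read-side critical section in the caller. */
		rcu_read_unlock();
		ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
					ntohl(*htblsiz));
		rcu_read_lock();
	}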


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
  2010-05-01  0:05       ` Changli Gao
@ 2010-05-01 16:42         ` Patrick McHardy
From: Patrick McHardy @ 2010-05-01 16:42 UTC
  To: Changli Gao; +Cc: netfilter-devel, Eric Dumazet

Changli Gao wrote:
> On Wed, Apr 21, 2010 at 8:35 PM, Patrick McHardy <kaber@trash.net> wrote:
>> All existing users I know of process packets in order, so there's
>> no need to worry about duplicate IDs.
>>
>> Does it really matter in your case? I mean, how long do you intend
>> to keep packets in userspace? Even with 10 Mpps (which you're *very*
>> unlikely to reach with userspace queueing) it still won't wrap
>> around for ~430s.
> 
> Although it may not happen, we'd better deal with it. I have found a
> way to speed up duplicate IDs lookup. Since the IDs in a bucket are in
> order, we can first check if the new ID is in the range of first_ID
> and last_ID, and in most cases, we don't need to travel the whole
> bucket.

They are not necessarily in order since the ID counter will eventually
wrap, but elements are always appended (at least currently).

