* [PATCH] dst_entry structure use,lastuse and refcnt abstraction
@ 2005-06-24  3:04 Christoph Lameter
  2005-06-24  3:10 ` [PATCH] dst numa: Avoid dst counter cacheline bouncing Christoph Lameter
  2005-06-24  3:36 ` [PATCH] dst_entry structure use,lastuse and refcnt abstraction David S. Miller
  0 siblings, 2 replies; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  3:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-net, davem, shai, akpm

This patch introduces macros to handle the use, lastuse and refcnt fields
in the dst_entry structure. Having macros manipulate these fields allows
cleaner source code and provides an easy way to modify how these
performance-critical fields are handled.

The introduction of macros removes some code that is repeated in
various places. In addition:

- net/decnet/dn_route.c: introduces dn_dst_useful() to check whether a dst
		entry is still useful. dst_update_rtu() is used to reduce code duplication.

- net/ipv4/route.c: adds ip_rt_copy(). dst_update_rtu() is used to reduce code duplication.

The patch also removes the atomic_dec_and_test in dst_destroy. The issue
was discussed a couple of weeks ago on netdev@oss.sgi.com in the following
thread:

http://marc.theaimsgroup.com/?t=111272752600001&r=1&w=2
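
For context: atomic_dec_and_test() folds the decrement and the zero test
into a single atomic operation on a single counter. The abstraction
separates the two steps because, once the refcount is split across several
per-node counters (as the follow-on dst numa patch does), no single atomic
operation can observe the global count reaching zero. A minimal userspace
model of the two patterns (plain C11 atomics, not the kernel API):

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Combined form: one atomic op decides whether we dropped the
	 * last reference. */
	static bool put_combined(atomic_int *refcnt)
	{
		return atomic_fetch_sub(refcnt, 1) == 1; /* atomic_dec_and_test() */
	}

	/* Split form: decrement, then test with a separate read.  The read
	 * no longer has to target a single counter, so dst_refcnt() is free
	 * to sum several of them.  Splitting is only valid when the caller
	 * can rule out concurrent new references, which is what the thread
	 * above discusses for dst_destroy(). */
	static bool put_split(atomic_int *refcnt)
	{
		atomic_fetch_sub(refcnt, 1);     /* dst_refcnt_dec() */
		return atomic_load(refcnt) == 0; /* !dst_refcnt()    */
	}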

The patch is a prerequisite for the dst NUMA patch. It applies against
2.6.12-mm1.

Signed-off-by: Pravin B. Shelar <pravins@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Christoph Lameter <christoph@scalex86.org>

Index: linux-2.6.12/include/net/dst.h
===================================================================
--- linux-2.6.12.orig/include/net/dst.h	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/include/net/dst.h	2005-06-24 03:00:51.000000000 +0000
@@ -103,6 +103,30 @@ struct dst_ops
 
 #ifdef __KERNEL__
 
+#define dst_use(__dst) (__dst)->__use
+#define dst_use_inc(__dst) (__dst)->__use++
+
+#define dst_lastuse(__dst) (__dst)->lastuse
+#define dst_lastuse_set(__dst) (__dst)->lastuse = jiffies
+
+#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
+#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
+
+#define dst_refcnt(__dst) atomic_read(&(__dst)->__refcnt)
+#define dst_refcnt_one(__dst) atomic_set(&(__dst)->__refcnt, 1)
+#define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__refcnt)
+#define dst_hold(__dst) atomic_inc(&(__dst)->__refcnt)
+
+static inline
+void dst_release(struct dst_entry * dst)
+{
+	if (dst) {
+		WARN_ON(dst_refcnt(dst) < 1);
+		smp_mb__before_atomic_dec();
+		dst_refcnt_dec(dst);
+	}
+}
+
 static inline u32
 dst_metric(const struct dst_entry *dst, int metric)
 {
@@ -134,29 +158,14 @@ dst_metric_locked(struct dst_entry *dst,
 	return dst_metric(dst, RTAX_LOCK) & (1<<metric);
 }
 
-static inline void dst_hold(struct dst_entry * dst)
-{
-	atomic_inc(&dst->__refcnt);
-}
-
 static inline
 struct dst_entry * dst_clone(struct dst_entry * dst)
 {
 	if (dst)
-		atomic_inc(&dst->__refcnt);
+		dst_hold(dst);
 	return dst;
 }
 
-static inline
-void dst_release(struct dst_entry * dst)
-{
-	if (dst) {
-		WARN_ON(atomic_read(&dst->__refcnt) < 1);
-		smp_mb__before_atomic_dec();
-		atomic_dec(&dst->__refcnt);
-	}
-}
-
 /* Children define the path of the packet through the
  * Linux networking.  Thus, destinations are stackable.
  */
@@ -177,7 +186,7 @@ static inline void dst_free(struct dst_e
 {
 	if (dst->obsolete > 1)
 		return;
-	if (!atomic_read(&dst->__refcnt)) {
+	if (!dst_refcnt(dst)) {
 		dst = dst_destroy(dst);
 		if (!dst)
 			return;
Index: linux-2.6.12/net/core/dst.c
===================================================================
--- linux-2.6.12.orig/net/core/dst.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/core/dst.c	2005-06-24 03:00:51.000000000 +0000
@@ -56,7 +56,7 @@ static void dst_run_gc(unsigned long dum
 	del_timer(&dst_gc_timer);
 	dstp = &dst_garbage_list;
 	while ((dst = *dstp) != NULL) {
-		if (atomic_read(&dst->__refcnt)) {
+		if (dst_refcnt(dst)) {
 			dstp = &dst->next;
 			delayed++;
 			continue;
@@ -169,9 +169,8 @@ struct dst_entry *dst_destroy(struct dst
 	struct neighbour *neigh;
 	struct hh_cache *hh;
 
-	smp_rmb();
-
 again:
+	smp_rmb();
 	neigh = dst->neighbour;
 	hh = dst->hh;
 	child = dst->child;
@@ -199,16 +198,16 @@ again:
 	dst = child;
 	if (dst) {
 		int nohash = dst->flags & DST_NOHASH;
-
-		if (atomic_dec_and_test(&dst->__refcnt)) {
-			/* We were real parent of this dst, so kill child. */
-			if (nohash)
+		dst_refcnt_dec(dst);
+		if (nohash) {
+			if (!dst_refcnt(dst)) {
+				/* We were real parent of this dst, so kill child. */
 				goto again;
-		} else {
-			/* Child is still referenced, return it for freeing. */
-			if (nohash)
+			} else {
+				/* Child is still referenced, return it for freeing. */
 				return dst;
-			/* Child is still in his hash table */
+				/* Child is still in his hash table */
+			}
 		}
 	}
 	return NULL;
Index: linux-2.6.12/net/decnet/dn_route.c
===================================================================
--- linux-2.6.12.orig/net/decnet/dn_route.c	2005-06-21 04:01:33.000000000 +0000
+++ linux-2.6.12/net/decnet/dn_route.c	2005-06-24 03:00:51.000000000 +0000
@@ -155,6 +155,11 @@ static inline void dnrt_drop(struct dn_r
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
+static inline int dn_dst_useful(struct dn_route *rth, unsigned long now, unsigned long expire)
+{
+	return  (atomic_read(&rth->u.dst.__refcnt) || (now - rth->u.dst.lastuse) < expire) ;
+}
+
 static void dn_dst_check_expire(unsigned long dummy)
 {
 	int i;
@@ -167,8 +172,7 @@ static void dn_dst_check_expire(unsigned
 
 		spin_lock(&dn_rt_hash_table[i].lock);
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
+			if (dn_dst_useful(rt, now, expire)) {
 				rtp = &rt->u.rt_next;
 				continue;
 			}
@@ -198,8 +202,7 @@ static int dn_dst_gc(void)
 		rtp = &dn_rt_hash_table[i].chain;
 
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
+			if (dn_dst_useful(rt, now, expire)) {
 				rtp = &rt->u.rt_next;
 				continue;
 			}
@@ -277,10 +280,8 @@ static inline int compare_keys(struct fl
 static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp)
 {
 	struct dn_route *rth, **rthp;
-	unsigned long now = jiffies;
-
-	rthp = &dn_rt_hash_table[hash].chain;
 
+ 	rthp = &dn_rt_hash_table[hash].chain;
 	spin_lock_bh(&dn_rt_hash_table[hash].lock);
 	while((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
@@ -290,9 +291,7 @@ static int dn_insert_route(struct dn_rou
 					   dn_rt_hash_table[hash].chain);
 			rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
 
-			rth->u.dst.__use++;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.lastuse = now;
+			dst_update_rtu(&rth->u.dst);
 			spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 
 			dnrt_drop(rt);
@@ -304,10 +303,8 @@ static int dn_insert_route(struct dn_rou
 
 	rcu_assign_pointer(rt->u.rt_next, dn_rt_hash_table[hash].chain);
 	rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
-	
-	dst_hold(&rt->u.dst);
-	rt->u.dst.__use++;
-	rt->u.dst.lastuse = now;
+
+	dst_update_rtu(&rt->u.dst);
 	spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 	*rp = rt;
 	return 0;
@@ -1091,7 +1088,7 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 	rt->u.dst.flags   = DST_HOST;
 
 	rt->fl.fld_src    = oldflp->fld_src;
@@ -1115,7 +1112,7 @@ make_route:
 	rt->u.dst.neighbour = neigh;
 	neigh = NULL;
 
-	rt->u.dst.lastuse = jiffies;
+	dst_lastuse_set(&rt->u.dst);
 	rt->u.dst.output  = dn_output;
 	rt->u.dst.input   = dn_rt_bug;
 	rt->rt_flags      = flags;
@@ -1173,9 +1170,7 @@ static int __dn_route_output_key(struct 
 #endif
 			    (rt->fl.iif == 0) &&
 			    (rt->fl.oif == flp->oif)) {
-				rt->u.dst.lastuse = jiffies;
-				dst_hold(&rt->u.dst);
-				rt->u.dst.__use++;
+				dst_update_rtu(&rt->u.dst);
 				rcu_read_unlock_bh();
 				*pprt = &rt->u.dst;
 				return 0;
@@ -1381,7 +1376,7 @@ make_route:
 	rt->u.dst.flags = DST_HOST;
 	rt->u.dst.neighbour = neigh;
 	rt->u.dst.dev = out_dev;
-	rt->u.dst.lastuse = jiffies;
+	dst_lastuse_set(&rt->u.dst);
 	rt->u.dst.output = dn_rt_bug;
 	switch(res.type) {
 		case RTN_UNICAST:
@@ -1452,9 +1447,7 @@ int dn_route_input(struct sk_buff *skb)
 		    (rt->fl.fld_fwmark == skb->nfmark) &&
 #endif
 		    (rt->fl.iif == cb->iif)) {
-			rt->u.dst.lastuse = jiffies;
-			dst_hold(&rt->u.dst);
-			rt->u.dst.__use++;
+			dst_update_rtu(&rt->u.dst);
 			rcu_read_unlock();
 			skb->dst = (struct dst_entry *)rt;
 			return 0;
@@ -1504,9 +1497,9 @@ static int dn_rt_fill_info(struct sk_buf
 		RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway);
 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
 		goto rtattr_failure;
-	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
-	ci.rta_used     = rt->u.dst.__use;
-	ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_lastuse = jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
+	ci.rta_used     =  dst_use(&rt->u.dst);
+	ci.rta_clntref  = dst_refcnt(&rt->u.dst);
 	if (rt->u.dst.expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
 	else
@@ -1729,8 +1722,8 @@ static int dn_rt_cache_seq_show(struct s
 			rt->u.dst.dev ? rt->u.dst.dev->name : "*",
 			dn_addr2asc(dn_ntohs(rt->rt_daddr), buf1),
 			dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2),
-			atomic_read(&rt->u.dst.__refcnt),
-			rt->u.dst.__use,
+			dst_refcnt(&rt->u.dst),
+			dst_use(&rt->u.dst),
 			(int) dst_metric(&rt->u.dst, RTAX_RTT));
 	return 0;
 } 
Index: linux-2.6.12/net/ipv4/ipvs/ip_vs_xmit.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/ipvs/ip_vs_xmit.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/ipvs/ip_vs_xmit.c	2005-06-24 02:18:00.000000000 +0000
@@ -88,7 +88,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
 			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
 			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
 				  NIPQUAD(dest->addr),
-				  atomic_read(&rt->u.dst.__refcnt), rtos);
+				  dst_refcnt(&rt->u.dst), rtos);
 		}
 		spin_unlock(&dest->dst_lock);
 	} else {
Index: linux-2.6.12/net/ipv4/multipath_drr.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/multipath_drr.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/multipath_drr.c	2005-06-24 02:18:00.000000000 +0000
@@ -149,8 +149,7 @@ static void drr_select_route(const struc
 		    multipath_comparekeys(&nh->fl, flp)) {
 			int nh_ifidx = nh->u.dst.dev->ifindex;
 
-			nh->u.dst.lastuse = jiffies;
-			nh->u.dst.__use++;
+			dst_update_tu(&nh->u.dst);
 			if (result != NULL)
 				continue;
 
Index: linux-2.6.12/net/ipv4/multipath_random.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/multipath_random.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/multipath_random.c	2005-06-24 02:18:00.000000000 +0000
@@ -94,7 +94,8 @@ static void random_select_route(const st
 		for (rt = first; rt; rt = rt->u.rt_next) {
 			if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
 			    multipath_comparekeys(&rt->fl, flp)) {
-				rt->u.dst.lastuse = jiffies;
+
+				dst_lastuse_set(&rt->u.dst);
 
 				if (i == candidate_no)
 					decision = rt;
@@ -107,7 +108,7 @@ static void random_select_route(const st
 		}
 	}
 
-	decision->u.dst.__use++;
+	dst_use_inc(&decision->u.dst);
 	*rp = decision;
 }
 
Index: linux-2.6.12/net/ipv4/multipath_rr.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/multipath_rr.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/multipath_rr.c	2005-06-24 02:18:00.000000000 +0000
@@ -62,10 +62,11 @@ static void rr_select_route(const struct
  	     nh = rcu_dereference(nh->u.rt_next)) {
 		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
 		    multipath_comparekeys(&nh->fl, flp)) {
-			nh->u.dst.lastuse = jiffies;
+			int __use = dst_use(&nh->u.dst);
+			dst_lastuse_set(&nh->u.dst);
 
-			if (min_use == -1 || nh->u.dst.__use < min_use) {
-				min_use = nh->u.dst.__use;
+			if (min_use == -1 || __use < min_use) {
+				min_use = __use;
 				min_use_cand = nh;
 			}
 		}
@@ -74,7 +75,7 @@ static void rr_select_route(const struct
 	if (!result)
 		result = first;
 
-	result->u.dst.__use++;
+	dst_use_inc(&result->u.dst);
 	*rp = result;
 }
 
Index: linux-2.6.12/net/ipv4/multipath_wrandom.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/multipath_wrandom.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/multipath_wrandom.c	2005-06-24 02:18:00.000000000 +0000
@@ -202,7 +202,7 @@ static void wrandom_select_route(const s
 	decision = first;
 	last_mpc = NULL;
 	for (mpc = first_mpc; mpc; mpc = mpc->next) {
-		mpc->rt->u.dst.lastuse = jiffies;
+		dst_lastuse_set(&mpc->rt->u.dst);
 		if (last_power <= selector && selector < mpc->power)
 			decision = mpc->rt;
 
@@ -217,8 +217,7 @@ static void wrandom_select_route(const s
 		/* concurrent __multipath_flush may lead to !last_mpc */
 		kfree(last_mpc);
 	}
-
-	decision->u.dst.__use++;
+	dst_use_inc(&decision->u.dst);
 	*rp = decision;
 }
 
Index: linux-2.6.12/net/ipv4/route.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/route.c	2005-06-21 04:01:33.000000000 +0000
+++ linux-2.6.12/net/ipv4/route.c	2005-06-24 03:00:51.000000000 +0000
@@ -303,8 +303,8 @@ static int rt_cache_seq_show(struct seq_
 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 			r->u.dst.dev ? r->u.dst.dev->name : "*",
 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
-			r->u.dst.__use, 0, (unsigned long)r->rt_src,
+			r->rt_flags, dst_refcnt(&r->u.dst),
+			dst_use(&r->u.dst), 0, (unsigned long)r->rt_src,
 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 			dst_metric(&r->u.dst, RTAX_WINDOW),
@@ -481,7 +481,7 @@ static int rt_may_expire(struct rtable *
 	unsigned long age;
 	int ret = 0;
 
-	if (atomic_read(&rth->u.dst.__refcnt))
+	if (dst_refcnt(&rth->u.dst))
 		goto out;
 
 	ret = 1;
@@ -505,7 +505,7 @@ out:	return ret;
  */
 static inline u32 rt_score(struct rtable *rt)
 {
-	u32 score = jiffies - rt->u.dst.lastuse;
+	u32 score = jiffies - dst_lastuse(&rt->u.dst);
 
 	score = ~score & ~(3<<30);
 
@@ -905,9 +905,8 @@ restart:
 			 */
 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
-			rth->u.dst.__use++;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.lastuse = now;
+			dst_update_rtu(&rth->u.dst);
+
 			spin_unlock_bh(&rt_hash_table[hash].lock);
 
 			rt_drop(rt);
@@ -915,7 +914,7 @@ restart:
 			return 0;
 		}
 
-		if (!atomic_read(&rth->u.dst.__refcnt)) {
+		if (!dst_refcnt(&rth->u.dst)) {
 			u32 score = rt_score(rth);
 
 			if (score <= min_score) {
@@ -1070,6 +1069,12 @@ static void rt_del(unsigned hash, struct
 	spin_unlock_bh(&rt_hash_table[hash].lock);
 }
 
+void ip_rt_copy(struct rtable *to, struct rtable *from)
+{
+	*to = *from;
+	to->u.dst.__use 	= 1;
+}
+
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 		    u32 saddr, u8 tos, struct net_device *dev)
 {
@@ -1137,17 +1142,17 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
 				}
 
 				/* Copy all the information. */
-				*rt = *rth;
- 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
-				rt->u.dst.__use		= 1;
-				atomic_set(&rt->u.dst.__refcnt, 1);
+				ip_rt_copy(rt, rth);
+
+				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
+				dst_lastuse_set(&rt->u.dst);
+				dst_refcnt_one(&rt->u.dst);
 				rt->u.dst.child		= NULL;
 				if (rt->u.dst.dev)
 					dev_hold(rt->u.dst.dev);
 				if (rt->idev)
 					in_dev_hold(rt->idev);
 				rt->u.dst.obsolete	= 0;
-				rt->u.dst.lastuse	= jiffies;
 				rt->u.dst.path		= &rt->u.dst;
 				rt->u.dst.neighbour	= NULL;
 				rt->u.dst.hh		= NULL;
@@ -1581,7 +1586,7 @@ static int ip_route_input_mc(struct sk_b
 
 	rth->u.dst.output= ip_rt_bug;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 	rth->u.dst.flags= DST_HOST;
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
@@ -1780,7 +1785,7 @@ static inline int ip_mkroute_input_def(s
 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
 	if (err)
 		return err;
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 
 	/* put it into the cache */
 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
@@ -1838,7 +1843,7 @@ static inline int ip_mkroute_input(struc
 		 * outside
 		 */
 		if (hop == lasthop)
-			atomic_set(&(skb->dst->__refcnt), 1);
+			dst_refcnt_one(skb->dst);
 	}
 	return err;
 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
@@ -1974,7 +1979,7 @@ local_input:
 
 	rth->u.dst.output= ip_rt_bug;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 	rth->u.dst.flags= DST_HOST;
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
@@ -2059,9 +2064,7 @@ int ip_route_input(struct sk_buff *skb, 
 		    rth->fl.fl4_fwmark == skb->nfmark &&
 #endif
 		    rth->fl.fl4_tos == tos) {
-			rth->u.dst.lastuse = jiffies;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.__use++;
+			dst_update_rtu(&rth->u.dst);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
 			skb->dst = (struct dst_entry*)rth;
@@ -2245,7 +2248,7 @@ static inline int ip_mkroute_output_def(
 	if (err == 0) {
 		u32 tos = RT_FL_TOS(oldflp);
 
-		atomic_set(&rth->u.dst.__refcnt, 1);
+		dst_refcnt_one(&rth->u.dst);
 		
 		hash = rt_hash_code(oldflp->fl4_dst, 
 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
@@ -2305,7 +2308,7 @@ static inline int ip_mkroute_output(stru
 			if (err != 0)
 				return err;
 		}
-		atomic_set(&(*rp)->u.dst.__refcnt, 1);
+		dst_refcnt_one(&(*rp)->u.dst);
 		return err;
 	} else {
 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
@@ -2541,10 +2544,7 @@ int __ip_route_output_key(struct rtable 
 				rcu_read_unlock_bh();
 				return 0;
 			}
-
-			rth->u.dst.lastuse = jiffies;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.__use++;
+			dst_update_rtu(&rth->u.dst);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
 			*rp = rth;
@@ -2630,9 +2630,9 @@ static int rt_fill_info(struct sk_buff *
 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
 		goto rtattr_failure;
-	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
-	ci.rta_used	= rt->u.dst.__use;
-	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
+	ci.rta_used	= dst_use(&rt->u.dst);
+	ci.rta_clntref	= dst_refcnt(&rt->u.dst);
 	if (rt->u.dst.expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
 	else
Index: linux-2.6.12/net/ipv4/xfrm4_policy.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/xfrm4_policy.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv4/xfrm4_policy.c	2005-06-24 02:18:00.000000000 +0000
@@ -135,7 +135,7 @@ __xfrm4_bundle_create(struct xfrm_policy
 			dev_hold(rt->u.dst.dev);
 		dst_prev->obsolete	= -1;
 		dst_prev->flags	       |= DST_HOST;
-		dst_prev->lastuse	= jiffies;
+		dst_lastuse_set(dst_prev);
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
Index: linux-2.6.12/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6.12.orig/net/ipv6/ip6_fib.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv6/ip6_fib.c	2005-06-24 03:00:51.000000000 +0000
@@ -1159,8 +1159,8 @@ static int fib6_age(struct rt6_info *rt,
 		}
 		gc_args.more++;
 	} else if (rt->rt6i_flags & RTF_CACHE) {
-		if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
-		    time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
+		if (dst_refcnt(&rt->u.dst) == 0 &&
+		    time_after_eq(now, dst_lastuse(&rt->u.dst) + gc_args.timeout)) {
 			RT6_TRACE("aging clone %p\n", rt);
 			return -1;
 		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
Index: linux-2.6.12/net/ipv6/route.c
===================================================================
--- linux-2.6.12.orig/net/ipv6/route.c	2005-06-21 04:01:33.000000000 +0000
+++ linux-2.6.12/net/ipv6/route.c	2005-06-24 03:00:51.000000000 +0000
@@ -368,10 +368,9 @@ struct rt6_info *rt6_lookup(struct in6_a
 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
 	rt = rt6_device_match(fn->leaf, oif, strict);
 	dst_hold(&rt->u.dst);
-	rt->u.dst.__use++;
-	read_unlock_bh(&rt6_lock);
 
-	rt->u.dst.lastuse = jiffies;
+	read_unlock_bh(&rt6_lock);
+	dst_update_tu(&rt->u.dst);
 	if (rt->u.dst.error == 0)
 		return rt;
 	dst_release(&rt->u.dst);
@@ -510,8 +509,7 @@ restart:
 out:
 	read_unlock_bh(&rt6_lock);
 out2:
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.__use++;
+	dst_update_tu(&rt->u.dst);
 	skb->dst = (struct dst_entry *) rt;
 }
 
@@ -570,8 +568,7 @@ restart:
 out:
 	read_unlock_bh(&rt6_lock);
 out2:
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.__use++;
+	dst_update_tu(&rt->u.dst);
 	return &rt->u.dst;
 }
 
@@ -683,7 +680,7 @@ struct dst_entry *ndisc_dst_alloc(struct
 	rt->rt6i_dev	  = dev;
 	rt->rt6i_idev     = idev;
 	rt->rt6i_nexthop  = neigh;
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
@@ -717,7 +714,7 @@ int ndisc_dst_gc(int *more)
 	pprev = &ndisc_dst_gc_list;
 	freed = 0;
 	while ((dst = *pprev) != NULL) {
-		if (!atomic_read(&dst->__refcnt)) {
+		if (!dst_refcnt(dst)) {
 			*pprev = dst->next;
 			dst_free(dst);
 			freed++;
@@ -1258,7 +1255,7 @@ static struct rt6_info * ip6_rt_copy(str
 		rt->rt6i_idev = ort->rt6i_idev;
 		if (rt->rt6i_idev)
 			in6_dev_hold(rt->rt6i_idev);
-		rt->u.dst.lastuse = jiffies;
+		dst_lastuse_set(&rt->u.dst);
 		rt->rt6i_expires = 0;
 
 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
@@ -1421,7 +1418,7 @@ struct rt6_info *addrconf_dst_alloc(stru
 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
 	rt->rt6i_dst.plen = 128;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 
 	return rt;
 }
@@ -1641,13 +1638,13 @@ static int rt6_fill_node(struct sk_buff 
 	if (rt->u.dst.dev)
 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
-	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
+	ci.rta_lastuse = jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
 	if (rt->rt6i_expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
 	else
 		ci.rta_expires = 0;
-	ci.rta_used = rt->u.dst.__use;
-	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_used = dst_use(&rt->u.dst);
+	ci.rta_clntref = dst_refcnt(&rt->u.dst);
 	ci.rta_error = rt->u.dst.error;
 	ci.rta_id = 0;
 	ci.rta_ts = 0;
@@ -1923,8 +1920,8 @@ static int rt6_info_route(struct rt6_inf
 	}
 	arg->len += sprintf(arg->buffer + arg->len,
 			    " %08x %08x %08x %08x %8s\n",
-			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
-			    rt->u.dst.__use, rt->rt6i_flags, 
+			    rt->rt6i_metric, dst_refcnt(&rt->u.dst),
+			    dst_use(&rt->u.dst), rt->rt6i_flags,
 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
 	return 0;
 }
Index: linux-2.6.12/net/ipv6/xfrm6_policy.c
===================================================================
--- linux-2.6.12.orig/net/ipv6/xfrm6_policy.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/ipv6/xfrm6_policy.c	2005-06-24 02:18:00.000000000 +0000
@@ -156,7 +156,7 @@ __xfrm6_bundle_create(struct xfrm_policy
 			dev_hold(rt->u.dst.dev);
 		dst_prev->obsolete	= -1;
 		dst_prev->flags	       |= DST_HOST;
-		dst_prev->lastuse	= jiffies;
+		dst_lastuse_set(dst_prev);
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
Index: linux-2.6.12/net/xfrm/xfrm_policy.c
===================================================================
--- linux-2.6.12.orig/net/xfrm/xfrm_policy.c	2005-06-21 04:01:33.000000000 +0000
+++ linux-2.6.12/net/xfrm/xfrm_policy.c	2005-06-24 02:18:00.000000000 +0000
@@ -1091,7 +1091,7 @@ static void xfrm_prune_bundles(int (*fun
 
 static int unused_bundle(struct dst_entry *dst)
 {
-	return !atomic_read(&dst->__refcnt);
+	return !dst_refcnt(dst);
 }
 
 static void __xfrm_garbage_collect(void)


* [PATCH] dst numa: Avoid dst counter cacheline bouncing
  2005-06-24  3:04 [PATCH] dst_entry structure use,lastuse and refcnt abstraction Christoph Lameter
@ 2005-06-24  3:10 ` Christoph Lameter
  2005-06-24  3:32   ` David S. Miller
  2005-06-24  4:58   ` Dipankar Sarma
  2005-06-24  3:36 ` [PATCH] dst_entry structure use,lastuse and refcnt abstraction David S. Miller
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  3:10 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-net, davem, shai, akpm

The dst_entry structure contains an atomic counter that is incremented and
decremented for each network packet being sent. If there is a high number of
packets being sent from multiple processors then cacheline bouncing between
NUMA nodes can limit TCP performance in a NUMA system.

The following patch splits the usage counter into counters per node.

The patch depends on the NUMA slab allocator in -mm in order to allocate
node-local memory quickly. The patch will work without the NUMA slab
allocator, but the node-specific allocations will be very slow.

The patch also depends on the dst abstraction patch.
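
The core idea, as a rough userspace model (invented names; the real patch
allocates each node's counters node-locally via kmem_cache_alloc_node):
fast-path writers touch only the counter belonging to their own node, and
only slow paths such as GC sum across nodes.

	#include <stdatomic.h>

	#define MAX_NODES 4	/* stand-in for MAX_NUMNODES */

	struct node_cnt { atomic_int refcnt; };
	struct dst_model { struct node_cnt cnt[MAX_NODES]; };

	static int this_node(void) { return 0; /* numa_node_id() stand-in */ }

	/* Fast path: touches only the local node's cacheline. */
	static void model_dst_hold(struct dst_model *d)
	{
		atomic_fetch_add(&d->cnt[this_node()].refcnt, 1);
	}

	/* Slow path (GC, /proc output): sum over all nodes.  This walk is
	 * the cost traded for uncontended fast-path increments. */
	static int model_dst_refcnt(struct dst_model *d)
	{
		int node, sum = 0;

		for (node = 0; node < MAX_NODES; node++)
			sum += atomic_load(&d->cnt[node].refcnt);
		return sum;
	}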

AIM7 results (tcp_test and udp_test) on i386 (IBM x445 8p 16G):

No patch	94788.2
w/patch		97739.2 (+3.11%)

The numbers will be much higher on larger NUMA machines.

Detailed AIM7 result no patch:

Benchmark       Version Machine Run Date
AIM Multiuser Benchmark - Suite VII     "1.1"           Jun 22 07:58:48 2005

Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
1       568.4           100     10.4    0.5     9.4737
101     37148.0         87      16.1    48.0    6.1300
201     54024.4         81      22.1    95.5    4.4796
301     63424.6         76      28.2    143.2   3.5119
401     69913.1         75      34.1    191.4   2.9058
501     74157.5         72      40.1    239.2   2.4670
601     77388.7         70      46.1    286.9   2.1461
701     79784.2         68      52.2    335.1   1.8969
914     83809.2         67      64.8    435.7   1.5282
1127    86123.5         71      77.7    539.1   1.2736
1595    89371.8         67      106.0   764.3   0.9339
1792    90047.2         64      118.2   861.2   0.8375
2208    91729.8         64      143.0   1059.9  0.6924
2624    92699.9         63      168.1   1260.2  0.5888
3529    93853.9         63      223.3   1701.4  0.4433
3917    94339.6         70      246.6   1886.8  0.4014
4733    94698.3         64      296.9   2285.2  0.3335
5000    94788.2         64      313.3   2413.5  0.3160

AIM7 with patch:

Benchmark       Version Machine Run Date
AIM Multiuser Benchmark - Suite VII     "1.1"           Jun 22 06:27:47 2005

Tasks   Jobs/Min        JTI     Real    CPU     Jobs/sec/task
1       568.4           100     10.4    0.4     9.4737
101     37426.1         87      16.0    46.5    6.1759
201     54742.8         81      21.8    92.5    4.5392
301     64898.0         77      27.6    138.7   3.5935
401     71272.9         74      33.4    185.2   2.9623
501     75955.6         73      39.2    231.2   2.5268
601     79473.3         73      44.9    277.6   2.2039
701     81967.3         70      50.8    323.7   1.9488
914     85836.5         70      63.2    422.6   1.5652
1127    88585.2         67      75.6    521.7   1.3100
1594    92194.4         78      102.7   738.6   0.9640
1791    93091.9         67      114.3   830.9   0.8663
2207    94517.5         72      138.7   1025.4  0.7138
2623    95358.5         64      163.4   1221.5  0.6059
3529    96449.2         65      217.3   1650.9  0.4555
3917    97075.2         64      239.7   1829.8  0.4131
4732    97485.8         63      288.3   2214.5  0.3434
5000    97739.2         67      303.9   2341.2  0.3258

Signed-off-by: Pravin B. Shelar <pravins@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Christoph Lameter <christoph@scalex86.org>

Index: linux-2.6.12/include/net/dst.h
===================================================================
--- linux-2.6.12.orig/include/net/dst.h	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/include/net/dst.h	2005-06-24 02:18:02.000000000 +0000
@@ -12,6 +12,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
 #include <linux/jiffies.h>
+#include <linux/nodemask.h>
 #include <net/neighbour.h>
 #include <asm/processor.h>
 
@@ -35,11 +36,34 @@
 
 struct sk_buff;
 
+#ifdef CONFIG_NUMA
+
+/*	A per-node instance of this exists for every dst_entry.
+ *	These are the most written fields of dst_entry.
+ */
+struct per_node_cnt
+{
+	atomic_t	refcnt;
+	int 		use;
+	unsigned long	lastuse;
+};
+
+extern kmem_cache_t *rtu_cachep;
+#endif
+
 struct dst_entry
 {
 	struct dst_entry        *next;
+#ifdef CONFIG_NUMA
+	/* first node that should be checked for time-out */
+	int 			s_nid;
+	/* per node client references   */
+	struct per_node_cnt	*__ncnt[MAX_NUMNODES];
+#else
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
+	unsigned long		lastuse;
+#endif
 	struct dst_entry	*child;
 	struct net_device       *dev;
 	short			error;
@@ -50,7 +74,6 @@ struct dst_entry
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
 #define DST_BALANCED            0x10
-	unsigned long		lastuse;
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
@@ -103,25 +126,70 @@ struct dst_ops
 
 #ifdef __KERNEL__
 
+#ifdef CONFIG_NUMA
+
+static inline int dst_use(struct dst_entry *dst)
+{
+	int total = 0, nid;
+
+	for_each_online_node(nid)
+		total += dst->__ncnt[nid]->use;
+	return total;
+}
+
+#define dst_use_inc(__dst) (__dst)->__ncnt[numa_node_id()]->use++
+
+static inline unsigned long dst_lastuse(struct dst_entry *dst)
+{
+	unsigned long max = 0;
+	int nid;
+
+	for_each_online_node(nid)
+		if (max < dst->__ncnt[nid]->lastuse)
+			max = dst->__ncnt[nid]->lastuse;
+	return max;
+}
+
+#define dst_lastuse_set(__dst) (__dst)->__ncnt[numa_node_id()]->lastuse = jiffies
+
+static inline int dst_refcnt(struct dst_entry *dst)
+{
+	int nid, sum = 0;
+
+	for_each_online_node(nid)
+		sum += atomic_read(&dst->__ncnt[nid]->refcnt);
+
+	return sum;
+}
+
+#define dst_refcnt_one(__dst) atomic_set(&(__dst)->__ncnt[numa_node_id()]->refcnt, 1)
+#define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__ncnt[numa_node_id()]->refcnt)
+#define dst_hold(__dst) atomic_inc(&(__dst)->__ncnt[numa_node_id()]->refcnt)
+
+#else
+
 #define dst_use(__dst) (__dst)->__use
 #define dst_use_inc(__dst) (__dst)->__use++
 
 #define dst_lastuse(__dst) (__dst)->lastuse
 #define dst_lastuse_set(__dst) (__dst)->lastuse = jiffies
 
-#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
-#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
-
 #define dst_refcnt(__dst) atomic_read(&(__dst)->__refcnt)
 #define dst_refcnt_one(__dst) atomic_set(&(__dst)->__refcnt, 1)
 #define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__refcnt)
 #define dst_hold(__dst) atomic_inc(&(__dst)->__refcnt)
 
+#endif
+#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
+#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
+
 static inline
 void dst_release(struct dst_entry * dst)
 {
 	if (dst) {
+#if  (!defined (CONFIG_NUMA) || (RT_CACHE_DEBUG >= 2 ))
 		WARN_ON(dst_refcnt(dst) < 1);
+#endif
 		smp_mb__before_atomic_dec();
 		dst_refcnt_dec(dst);
 	}
@@ -271,6 +339,55 @@ static inline struct dst_entry *dst_chec
 
 extern void		dst_init(void);
 
+/*	This function allocates and initializes the rtu array of a given dst_entry.
+ */
+static inline int dst_init_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	int nid, curr_node;
+
+	curr_node = numa_node_id();
+	for_each_node(nid) {
+		int online_node;
+		online_node = node_online(nid) ? nid : curr_node;
+		if (!(dst->__ncnt[nid] = kmem_cache_alloc_node(rtu_cachep,
+						SLAB_ATOMIC, online_node))) {
+			for(; nid>=0; nid--)
+				kmem_cache_free(rtu_cachep, dst->__ncnt[nid]);
+			return -ENOMEM;
+		}
+		dst->__ncnt[nid]->use = 0;
+		atomic_set(&dst->__ncnt[nid]->refcnt, 0);
+		dst->__ncnt[nid]->lastuse = jiffies;
+	}
+	dst->s_nid = curr_node;
+#else
+	atomic_set(&dst->__refcnt, 0);
+	dst->lastuse = jiffies;
+#endif
+	return 0;
+}
+
+static inline void dst_free_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	int nid;
+	for_each_node(nid)
+		kmem_cache_free(rtu_cachep, dst->__ncnt[nid]);
+#endif
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+inline static void dst_ref_xfr_node_down(struct dst_entry *dst, int nid)
+{
+	int refcnt = atomic_read(&dst->__ncnt[nid]->refcnt);
+	if (refcnt) {
+		atomic_add(refcnt, &dst->__ncnt[numa_node_id()]->refcnt);
+		atomic_set(&dst->__ncnt[nid]->refcnt, 0);
+	}
+}
+#endif
+
 struct flowi;
 #ifndef CONFIG_XFRM
 static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
Index: linux-2.6.12/net/bridge/br_netfilter.c
===================================================================
--- linux-2.6.12.orig/net/bridge/br_netfilter.c	2005-06-17 19:48:29.000000000 +0000
+++ linux-2.6.12/net/bridge/br_netfilter.c	2005-06-24 02:18:02.000000000 +0000
@@ -85,7 +85,6 @@ static struct net_device __fake_net_devi
 static struct rtable __fake_rtable = {
 	.u = {
 		.dst = {
-			.__refcnt		= ATOMIC_INIT(1),
 			.dev			= &__fake_net_device,
 			.path			= &__fake_rtable.u.dst,
 			.metrics		= {[RTAX_MTU - 1] = 1500},
@@ -1048,6 +1047,10 @@ int br_netfilter_init(void)
 {
 	int i;
 
+	if (dst_init_rtu_array(&__fake_rtable.u.dst) < 0)
+		panic("br_netfilter : cannot allocate memory for dst-entry rtu array");
+	dst_refcnt_one(&__fake_rtable.u.dst);
+
 	for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) {
 		int ret;
 
@@ -1084,4 +1087,5 @@ void br_netfilter_fini(void)
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(brnf_sysctl_header);
 #endif
+	dst_free_rtu_array(&__fake_rtable.u.dst);
 }
Index: linux-2.6.12/net/core/dst.c
===================================================================
--- linux-2.6.12.orig/net/core/dst.c	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/net/core/dst.c	2005-06-24 02:18:02.000000000 +0000
@@ -124,9 +124,9 @@ void * dst_alloc(struct dst_ops * ops)
 	if (!dst)
 		return NULL;
 	memset(dst, 0, ops->entry_size);
-	atomic_set(&dst->__refcnt, 0);
+	if (dst_init_rtu_array(dst) < 0)
+		return NULL;
 	dst->ops = ops;
-	dst->lastuse = jiffies;
 	dst->path = dst;
 	dst->input = dst_discard_in;
 	dst->output = dst_discard_out;
@@ -193,6 +193,7 @@ again:
 #if RT_CACHE_DEBUG >= 2 
 	atomic_dec(&dst_total);
 #endif
+	dst_free_rtu_array(dst);
 	kmem_cache_free(dst->ops->kmem_cachep, dst);
 
 	dst = child;
@@ -267,8 +268,20 @@ static struct notifier_block dst_dev_not
 	.notifier_call	= dst_dev_event,
 };
 
+#ifdef CONFIG_NUMA
+kmem_cache_t *rtu_cachep;
+#endif
+
 void __init dst_init(void)
 {
+#ifdef CONFIG_NUMA
+	rtu_cachep =  kmem_cache_create("dst_rtu",
+			sizeof(struct per_node_cnt),
+			0, 0, NULL, NULL);
+
+	if (!rtu_cachep)
+		panic("IP: failed to create rtu_cachep\n");
+#endif
 	register_netdevice_notifier(&dst_dev_notifier);
 }
 
Index: linux-2.6.12/net/decnet/dn_route.c
===================================================================
--- linux-2.6.12.orig/net/decnet/dn_route.c	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/net/decnet/dn_route.c	2005-06-24 02:18:02.000000000 +0000
@@ -77,6 +77,7 @@
 #include <linux/netfilter_decnet.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <asm/errno.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
@@ -157,7 +158,29 @@ static inline void dnrt_drop(struct dn_r
 
 static inline int dn_dst_useful(struct dn_route *rth, unsigned long now, unsigned long expire)
 {
+#ifdef CONFIG_NUMA
+	{
+		int max, sum = 0, age, nid;
+		struct dst_entry *dst = &rth->u.dst;
+
+		nid = dst->s_nid;
+		max = nid + MAX_NUMNODES;
+		for(sum = 0; nid < max; nid++) {
+			int nid_ = nid % MAX_NUMNODES;
+			if (node_online(nid_)) {
+				sum += atomic_read(&dst->__ncnt[nid_]->refcnt);
+				age = now - dst->__ncnt[nid_]->lastuse;
+				if (age <= expire) {
+					dst->s_nid = nid_ ;
+					return 1;
+				}
+			}
+		}
+		return (sum != 0);
+	}
+#else
 	return  (atomic_read(&rth->u.dst.__refcnt) || (now - rth->u.dst.lastuse) < expire) ;
+#endif
 }
 
 static void dn_dst_check_expire(unsigned long dummy)
@@ -1766,6 +1789,45 @@ static struct file_operations dn_rt_cach
 
 #endif /* CONFIG_PROC_FS */
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit dn_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int node = cpu_to_node((int) hcpu);
+	if ( node_online(node))
+		return NOTIFY_OK;
+
+	switch(action) {
+		int i;
+		struct dn_route *rt, *next;
+
+		case CPU_DEAD:
+
+		for(i = 0; i < dn_rt_hash_mask; i++) {
+			spin_lock_bh(&dn_rt_hash_table[i].lock);
+
+			if ((rt = dn_rt_hash_table[i].chain) == NULL)
+				goto nothing_to_do;
+
+			for(; rt; rt=next) {
+				dst_ref_xfr_node_down(&rt->u.dst, node);
+				next = rt->u.rt_next;
+			}
+nothing_to_do:
+			spin_unlock_bh(&dn_rt_hash_table[i].lock);
+		}
+
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dn_rtcache_cpu_notifier =
+			{ &dn_rtcache_cpu_callback, NULL, 0 };
+
+#endif
+
 void __init dn_route_init(void)
 {
 	int i, goal, order;
@@ -1822,10 +1884,16 @@ void __init dn_route_init(void)
         dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
 
 	proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
+#if	defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 }
 
 void __exit dn_route_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 	del_timer(&dn_route_timer);
 	dn_run_flush(0);
 
Index: linux-2.6.12/net/ipv4/route.c
===================================================================
--- linux-2.6.12.orig/net/ipv4/route.c	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/net/ipv4/route.c	2005-06-24 02:18:02.000000000 +0000
@@ -90,6 +90,7 @@
 #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -476,6 +477,54 @@ static __inline__ int rt_valuable(struct
 		rth->u.dst.expires;
 }
 
+#ifdef CONFIG_NUMA
+
+/*
+ * For NUMA systems, we do not want to sum up all local node refcnts every
+ * time. So we consider lastuse element of the dst_entry and start loop
+ * with the node where this entry was allocated. If dst_entry is not timed
+ * out then update s_nid of this dst_entry so that next time we can start from
+ * that node.
+ */
+static inline int rt_check_age(struct rtable *rth,
+			unsigned long tmo1, unsigned long tmo2)
+{
+	int max, sum = 0, age, idx;
+	struct dst_entry *dst = &rth->u.dst;
+	unsigned long now = jiffies;
+
+	idx = dst->s_nid;
+	max = idx + MAX_NUMNODES;
+	for(sum = 0; idx < max; idx++) {
+		int nid_ = idx % MAX_NUMNODES;
+		if (node_online(nid_)) {
+			sum += atomic_read(&dst->__ncnt[nid_]->refcnt);
+			age = now - dst->__ncnt[nid_]->lastuse;
+			if ((age <= tmo1 && !rt_fast_clean(rth)) ||
+					(age <= tmo2 && rt_valuable(rth))) {
+				dst->s_nid = nid_ ;
+				return 0;
+			}
+		}
+	}
+	return (sum == 0);
+}
+
+/*
+ * In this function the order of examining three factors (refcnt, expires,
+ * lastuse) is changed, considering the cost of analyzing refcnt and lastuse
+ * which are localized for each node on NUMA.
+ */
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+	if (rth->u.dst.expires && time_after_eq(jiffies, rth->u.dst.expires))
+		return (dst_refcnt(&rth->u.dst) == 0) ;
+
+	return rt_check_age(rth, tmo1, tmo2);
+}
+
+#else
+
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
@@ -498,6 +547,8 @@ static int rt_may_expire(struct rtable *
 out:	return ret;
 }
 
+#endif
+
 /* Bits of score are:
  * 31: very valuable
  * 30: not quite useless
@@ -1071,8 +1122,21 @@ static void rt_del(unsigned hash, struct
 
 void ip_rt_copy(struct rtable *to, struct rtable *from)
 {
+#ifdef CONFIG_NUMA
+	struct per_node_cnt *tmp_pnc[MAX_NUMNODES];
+
+	memcpy(&tmp_pnc[0], &to->u.dst.__ncnt[0],
+			sizeof(struct per_node_cnt *)*MAX_NUMNODES);
+	*to = *from;
+	memcpy(&to->u.dst.__ncnt[0], &tmp_pnc[0],
+			sizeof(struct per_node_cnt *)*MAX_NUMNODES);
+
+	to->u.dst.__ncnt[numa_node_id()]->use = 1;
+	to->u.dst.s_nid = numa_node_id();
+#else
 	*to = *from;
 	to->u.dst.__use 	= 1;
+#endif
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3066,6 +3130,35 @@ static int __init set_rhash_entries(char
 }
 __setup("rhash_entries=", set_rhash_entries);
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+				   void *hcpu)
+{
+	int node = cpu_to_node((int) hcpu);
+	if (node_online(node))
+		return NOTIFY_OK;
+
+	switch(action) {
+		int i ;
+		struct rtable *rth;
+		case CPU_DEAD:
+			for(i = rt_hash_mask; i >= 0; i--) {
+				spin_lock_irq(&rt_hash_table[i].lock);
+				rth = rt_hash_table[i].chain;
+				while(rth) {
+					dst_ref_xfr_node_down(&rth->u.dst, node);
+					rth = rth->u.rt_next;
+				}
+				spin_unlock_irq(&rt_hash_table[i].lock);
+			}
+			break;
+	}
+	return NOTIFY_OK;
+}
+static struct notifier_block rtcache_cpu_notifier = { &rtcache_cpu_callback, NULL, 0 };
+#endif
+
 int __init ip_rt_init(void)
 {
 	int i, order, goal, rc = 0;
@@ -3169,6 +3262,9 @@ int __init ip_rt_init(void)
 	xfrm_init();
 	xfrm4_init();
 #endif
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&rtcache_cpu_notifier);
+#endif
 	return rc;
 }
 
Index: linux-2.6.12/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6.12.orig/net/ipv6/ip6_fib.c	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/net/ipv6/ip6_fib.c	2005-06-24 02:18:02.000000000 +0000
@@ -1208,6 +1208,37 @@ void fib6_run_gc(unsigned long dummy)
 	spin_unlock_bh(&fib6_gc_lock);
 }
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+#include <linux/cpu.h>
+inline static int rt6_ref_xfr_node_down(struct rt6_info *rt, void *arg)
+{
+	dst_ref_xfr_node_down(&rt->u.dst, (int)arg);
+	return 0;
+}
+
+static int __devinit ipv6_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int node = cpu_to_node((int) hcpu);
+	if ( node_online(node))
+		return NOTIFY_OK;
+
+	switch(action) {
+		case CPU_DEAD:
+			write_lock_bh(&rt6_lock);
+			fib6_clean_tree(&ip6_routing_table, rt6_ref_xfr_node_down,
+					0, (void *)node);
+			write_unlock_bh(&rt6_lock);
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipv6_rtcache_cpu_notifier =
+				{ &ipv6_rtcache_cpu_callback, NULL, 0 };
+#endif
+
 void __init fib6_init(void)
 {
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
@@ -1216,10 +1247,16 @@ void __init fib6_init(void)
 					   NULL, NULL);
 	if (!fib6_node_kmem)
 		panic("cannot create fib6_nodes cache");
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 }
 
 void fib6_gc_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 	del_timer(&ip6_fib_timer);
 	kmem_cache_destroy(fib6_node_kmem);
 }
Index: linux-2.6.12/net/ipv6/route.c
===================================================================
--- linux-2.6.12.orig/net/ipv6/route.c	2005-06-24 02:18:00.000000000 +0000
+++ linux-2.6.12/net/ipv6/route.c	2005-06-24 02:58:08.000000000 +0000
@@ -110,8 +110,6 @@ static struct dst_ops ip6_dst_ops = {
 struct rt6_info ip6_null_entry = {
 	.u = {
 		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
 			.dev		= &loopback_dev,
 			.obsolete	= -1,
 			.error		= -ENETUNREACH,
@@ -2100,6 +2098,10 @@ void __init ip6_route_init(void)
 						     NULL, NULL);
 	if (!ip6_dst_ops.kmem_cachep)
 		panic("cannot create ip6_dst_cache");
+	if (dst_init_rtu_array(&ip6_null_entry.u.dst) < 0)
+		panic("ip6_route : can't allocate memory for dst-entry array");
+	dst_use_inc(&ip6_null_entry.u.dst);
+	dst_refcnt_one(&ip6_null_entry.u.dst);
 
 	fib6_init();
 #ifdef 	CONFIG_PROC_FS
@@ -2126,4 +2128,5 @@ void ip6_route_cleanup(void)
 	rt6_ifdown(NULL);
 	fib6_gc_cleanup();
 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+	dst_free_rtu_array(&ip6_null_entry.u.dst);
 }


* Re: [PATCH] dst numa: Avoid dst counter cacheline bouncing
  2005-06-24  3:10 ` [PATCH] dst numa: Avoid dst counter cacheline bouncing Christoph Lameter
@ 2005-06-24  3:32   ` David S. Miller
  2005-06-24  4:58   ` Dipankar Sarma
  1 sibling, 0 replies; 17+ messages in thread
From: David S. Miller @ 2005-06-24  3:32 UTC (permalink / raw)
  To: christoph; +Cc: linux-kernel, linux-net, shai, akpm

From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 20:10:06 -0700 (PDT)

> AIM7 results (tcp_test and udp_test) on i386 (IBM x445 8p 16G):
> 
> No patch	94788.2
> w/patch		97739.2 (+3.11%)
> 
> The numbers will be much higher on larger NUMA machines.

How much higher?  I don't believe it.  And 3% doesn't justify the
complexity (both in time and memory usage) added by this patch.

Performance of our stack's routing cache is _DEEPLY_ tied to the
size of the routing cache entries, of which dst entry is a member.
Every single byte counts.

You've exploded this to be (NR_NODES * sizeof(void *)) larger.  That
is totally unacceptable, even for NUMA.
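
(For scale: assuming MAX_NUMNODES = 64 and 8-byte pointers, that array
alone adds 64 * 8 = 512 bytes to every dst entry, before counting the
per-node counter structures it points to.)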

Secondly, inlining the "for_each_online_node()" loops isn't very nice
either.

Consider making a per-node routing cache instead, just like the flow
cache is per-cpu, or make socket dst entries have a per-node array of
object pointers.  Fill the per-node array in lazily, just as you do
for the dst.  The first time you try to clone a dst on a cpu for a
socket, create the per-cpu entry slot.

We don't need to make them per-node system wide, only per-socket is
this really needed.
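
A purely hypothetical sketch of that shape (all names invented, and the
locking or cmpxchg needed against concurrent fill-in omitted):

	/* Per-socket cache of node-local dst clones, filled in lazily. */
	struct sock_dst_cache {
		struct dst_entry *per_node[MAX_NUMNODES]; /* NULL until used */
	};

	static struct dst_entry *sock_dst_get(struct sock_dst_cache *c,
					      struct dst_entry *master)
	{
		int node = numa_node_id();

		if (!c->per_node[node])
			c->per_node[node] = dst_clone(master); /* one cross-node hit */
		return c->per_node[node]; /* refcounting stays node-local after */
	}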

This way you do per-node walking when you detach the dst from the
socket at close() time, not at every dst_release() call, and thus for
every packet in the system.

In light of that, I don't see what the advantage is.  Instead of
atomic inc/dec on every packet sent in the system, you walk the whole
array of counters for every packet sent in the system.  If you really
get contention amongst nodes for a DST entry, this walk should result
in ReadToShare transactions, and thus cacheline movement, between the
NUMA nodes, on every __kfree_skb() call.

Essentially you're trading 1 atomic inc (ReadToOwn) and 1 atomic dec
(ReadToOwn) per packet for significant extra memory, much bigger code,
and 1 ReadToShare transaction per packet.

And since you're still using atomic_inc/atomic_dec you'll still hit
ReadToOwn transactions within the node.


* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:04 [PATCH] dst_entry structure use,lastuse and refcnt abstraction Christoph Lameter
  2005-06-24  3:10 ` [PATCH] dst numa: Avoid dst counter cacheline bouncing Christoph Lameter
@ 2005-06-24  3:36 ` David S. Miller
  2005-06-24  3:40   ` Christoph Lameter
  1 sibling, 1 reply; 17+ messages in thread
From: David S. Miller @ 2005-06-24  3:36 UTC (permalink / raw)
  To: christoph; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 20:04:52 -0700 (PDT)

> The patch also removes the atomic_dec_and_test in dst_destroy.

That is the only part of this patch I'm willing to entertain
at this time, as long as Herbert Xu ACKs it.  How much of that
quoted 3% gain does this change alone give you?

Also, please post networking patches to netdev@vger.kernel.org.
Because you didn't, the majority of the networking maintainers
did not see your dst patch submissions.  It's mentioned in
linux/MAINTAINERS for a reason:

	NETWORKING [GENERAL]
	P:	Networking Team
	M:	netdev@vger.kernel.org
	L:	netdev@vger.kernel.org
	S:	Maintained

Thanks.


* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:36 ` [PATCH] dst_entry structure use,lastuse and refcnt abstraction David S. Miller
@ 2005-06-24  3:40   ` Christoph Lameter
  2005-06-24  3:47     ` David S. Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  3:40 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

On Thu, 23 Jun 2005, David S. Miller wrote:

> From: Christoph Lameter <christoph@lameter.com>
> Date: Thu, 23 Jun 2005 20:04:52 -0700 (PDT)
> 
> > The patch also removes the atomic_dec_and_test in dst_destroy.
> 
> That is the only part of this patch I'm willing to entertain
> at this time, as long as Herbert Xu ACKs it.  How much of that
> quoted 3% gain does this change alone give you?

Nothing as far as I can tell. The main benefit may be reorganization of 
the code.

> Also, please post networking patches to netdev@vger.kernel.org.
> Because you didn't, the majority of the networking maintainers
> did not see your dst patch submissions.  It's mentioned in
> linux/MAINTAINERS for a reason:
> 
> 	NETWORKING [GENERAL]
> 	P:	Networking Team
> 	M:	netdev@vger.kernel.org
> 	L:	netdev@vger.kernel.org
> 	S:	Maintained
> 
> Thanks.

Yes and it was recently changed. Typical use is linux-xxx@vger.kernel.org



* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:40   ` Christoph Lameter
@ 2005-06-24  3:47     ` David S. Miller
  2005-06-24  3:49       ` Christoph Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: David S. Miller @ 2005-06-24  3:47 UTC (permalink / raw)
  To: christoph; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 20:40:25 -0700 (PDT)

> Nothing as far as I can tell. The main benefit may be reorganization of 
> the code.

You told us way back in the original thread that the final atomic dec
shows up very much on NUMA, and if it could be avoided (and changed
into a read test), it would help a lot on NUMA.

> > 	NETWORKING [GENERAL]
> > 	P:	Networking Team
> > 	M:	netdev@vger.kernel.org
> > 	L:	netdev@vger.kernel.org
> > 	S:	Maintained
> > 
> > Thanks.
> 
> Yes and it was recently changed. Typical use is linux-xxx@vger.kernel.org

netdev@oss.sgi.com is what used to be the place for networking
stuff, it's not netdev@vger.kernel.org



* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:47     ` David S. Miller
@ 2005-06-24  3:49       ` Christoph Lameter
  2005-06-24  3:54         ` David S. Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  3:49 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

On Thu, 23 Jun 2005, David S. Miller wrote:

> From: Christoph Lameter <christoph@lameter.com>
> Date: Thu, 23 Jun 2005 20:40:25 -0700 (PDT)
> 
> > Nothing as far as I can tell. The main benefit may be reorganization of 
> > the code.
> 
> You told us way back in the original thread that the final atomic dec
> shows up very much on NUMA, and if it could be avoided (and changed
> into a read test), it would help a lot on NUMA.

No, I told you that we need to disassemble the atomic dec_and_test
in order to be able to split the counters.

> > Yes and it was recently changed. Typical use is linux-xxx@vger.kernel.org
> 
> netdev@oss.sgi.com is what used to be the place for networking
> stuff, it's not netdev@vger.kernel.org

s/not/now/ right?


* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:49       ` Christoph Lameter
@ 2005-06-24  3:54         ` David S. Miller
  2005-06-24  4:06           ` Christoph Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: David S. Miller @ 2005-06-24  3:54 UTC (permalink / raw)
  To: christoph; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 20:49:17 -0700 (PDT)

> No I told you that we need to disassemble the atomic dec_and_test 
> in order to be able to split the counters.

Ok.  You're going to have to come up with something better
than a 3% AIM benchmark increase with 5000 threads to justify
those invasive NUMA changes, and thus this infrastructure for
it.

In order to convince me on this NUMA dst stuff, you need to put away
the benchmark microscope and instead use a "macroscrope" to analyze
the side effects of these changes on a higher level.

Really, what are the effects on other things?  For example, what
does this do to routing performance?  Does the packets per second
go down, if so by how much?

What happens if you bind network interfaces, and the processes that
use them, to cpus.  Why would that be too intrusive to setup for the
administrator of such large NUMA machines?

What about loads that have low route locality, and thus touch many
different dst entries, not necessarily contending for a single one
amongst several nodes?  Does performance go up or down for such a
case?

I'm picking those examples, because I am rather certain your patches
will hurt performance in those cases.  The data structure size
expansion and extra memory allocations alone for the per-node dst
stuff should be good about doing that.

> > > Yes and it was recently changed. Typical use is linux-xxx@vger.kernel.org
> > 
> > netdev@oss.sgi.com is what used to be the place for networking
> > stuff, it's not netdev@vger.kernel.org
> 
> s/not/now/ right?

Right. :)


* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  3:54         ` David S. Miller
@ 2005-06-24  4:06           ` Christoph Lameter
  2005-06-24  4:11             ` David S. Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  4:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

On Thu, 23 Jun 2005, David S. Miller wrote:

> Ok.  You're going to have to come up with something better
> than a %3 AIM benchmark increase with 5000 threads to justify
> those invasive NUMA changes, and thus this infrastructure for
> it.

I am sure one can do better than that. The x445 is the smallest NUMA box 
that I know of and the only one available in the context of that project. 
But I am also not sure that this will reach proportions that will
outweigh the complexity added by these patches. What would be
the performance benefit that we would need to see to feel that it
is right to get such a change in?

> I'm picking those examples, because I am rather certain your patches
> will hurt performance in those cases.  The data structure size
> expansion and extra memory allocations alone for the per-node dst
> stuff should be good about doing that.

Hmm. I like the idea of a separate routing cache per node. May actually 
reach higher performance than splitting the counters. Lets think a bit 
about that.


* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  4:06           ` Christoph Lameter
@ 2005-06-24  4:11             ` David S. Miller
  2005-06-24  6:03               ` Christoph Lameter
  2005-06-24 10:57               ` [PATCH] bugfix and scalability changes in net/ipv4/route.c Eric Dumazet
  0 siblings, 2 replies; 17+ messages in thread
From: David S. Miller @ 2005-06-24  4:11 UTC (permalink / raw)
  To: christoph; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 21:06:30 -0700 (PDT)

> I am sure one can do better than that. The x445 is the smallest NUMA box
> that I know of and the only one available in the context of that project.
> But I am also not sure that this will reach proportions that will
> outweigh the complexity added by these patches. What would be
> the performance benefit that we would need to see to feel that it
> is right to get such a change in?

Something on the order of 10%.  If I recall correctly, that's
basically what we got on SMP from the RCU changes to the routing
cache.

> Hmm. I like the idea of a separate routing cache per node. It may
> actually reach higher performance than splitting the counters. Let's
> think a bit about that.

One thing that may happen down the line is that the flow cache
becomes a more unified thing.  I.e. local sockets get found there,
netfilter rules can match there, and so can normal "routes".

The idea is that on a miss, you go down into the "slow" lookup path
where the data structure is less distributed (SMP- and NUMA-wise) and
slower to search.  And this populates the flow tables.

But that seems to work best on a per-cpu basis.
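
Roughly, as a hand-wavy sketch (all of the names here are made up for
illustration, this is not existing code):

	#define FLOW_HASH_SIZE	1024	/* arbitrary power of two */

	struct flow_entry {
		struct flowi		key;
		void			*object;	/* socket, route, xfrm policy, ... */
		struct flow_entry	*next;
	};

	static DEFINE_PER_CPU(struct flow_entry *[FLOW_HASH_SIZE], flow_table);

	static void *flow_lookup(const struct flowi *key)
	{
		unsigned int hash = flow_hash(key);
		struct flow_entry *fle;

		/* Fast path: this cpu's private table, no locking and no
		 * cross-cpu (or cross-node) cacheline traffic. */
		for (fle = __get_cpu_var(flow_table)[hash]; fle; fle = fle->next)
			if (flow_key_equal(&fle->key, key))
				return fle->object;

		/* Miss: fall back to the slower shared lookup, and
		 * populate this cpu's table with the result. */
		return flow_slow_lookup_and_insert(key, hash);
	}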

Unfortunately, it really doesn't address your problem in and of
itself.  This is because flow entries, especially when used on
sockets, would still get accessed by other cpus and thus nodes.
We would need to build something on top of the per-cpu flowcache,
such as the socket node array idea, to get something that helps
this NUMA issue as well.

Just tossing out some forward-looking ideas :)

To be honest, the unified flow cache idea has been tossed around
a lot, and it's still intellectual masturbation.  But then again,
so was the design for the IPsec stuff we have now, for several years.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dst numa: Avoid dst counter cacheline bouncing
  2005-06-24  3:10 ` [PATCH] dst numa: Avoid dst counter cacheline bouncing Christoph Lameter
  2005-06-24  3:32   ` David S. Miller
@ 2005-06-24  4:58   ` Dipankar Sarma
  2005-06-24  5:05     ` Christoph Lameter
  1 sibling, 1 reply; 17+ messages in thread
From: Dipankar Sarma @ 2005-06-24  4:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, netdev, davem, shai, akpm

On Thu, Jun 23, 2005 at 08:10:06PM -0700, Christoph Lameter wrote:
> The dst_entry structure contains an atomic counter that is incremented and
> decremented for each network packet being sent. If there is a high number of
> packets being sent from multiple processors then cacheline bouncing between
> NUMA nodes can limit TCP performance in a NUMA system.
> 
> The following patch splits the usage counter into counters per node.
> 

Do we really need a distributed reference counter implementation
inside the dst cache code?  If you are willing to wait for a while,
we should soon have Rusty's bigref implementation modified to sit on
top of the interleaving dynamic per-cpu allocator.  We can then look
at a distributed reference counter for the dst refcount and see how
that can be worked out.
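
As a rough illustration of the idea (a bigref-flavoured sketch with
invented names, not Rusty's actual interface):

	struct bigref {
		/* One counter per node; a real implementation would pad
		 * or allocate these node-locally to avoid false sharing. */
		atomic_t node_ref[MAX_NUMNODES];
	};

	static inline void bigref_get(struct bigref *ref)
	{
		/* Fast path: touch only the local node's counter, so the
		 * cacheline never bounces between nodes. */
		atomic_inc(&ref->node_ref[numa_node_id()]);
	}

	static inline int bigref_read(struct bigref *ref)
	{
		int node, sum = 0;

		/* Zero detection needs a sum across all nodes; this is
		 * exactly what atomic_dec_and_test() can no longer do in
		 * one step once the counter is split, which is why the
		 * dec_and_test had to come out of the dst code first. */
		for_each_node(node)
			sum += atomic_read(&ref->node_ref[node]);
		return sum;
	}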

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dst numa: Avoid dst counter cacheline bouncing
  2005-06-24  4:58   ` Dipankar Sarma
@ 2005-06-24  5:05     ` Christoph Lameter
  2005-06-24  7:29       ` Dipankar Sarma
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  5:05 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: linux-kernel, netdev, davem, shai, akpm

On Fri, 24 Jun 2005, Dipankar Sarma wrote:

> Do we really need a distributed reference counter implementation
> inside the dst cache code?  If you are willing to wait for a while,
> we should soon have Rusty's bigref implementation modified to sit on
> top of the interleaving dynamic per-cpu allocator.  We can then look
> at a distributed reference counter for the dst refcount and see how
> that can be worked out.

Is that code available somewhere?


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  4:11             ` David S. Miller
@ 2005-06-24  6:03               ` Christoph Lameter
  2005-06-24  6:16                 ` David S. Miller
  2005-06-24 10:57               ` [PATCH] bugfix and scalability changes in net/ipv4/route.c Eric Dumazet
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2005-06-24  6:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

On Thu, 23 Jun 2005, David S. Miller wrote:

> > outweigh the complexity added by these patches. What would be
> > the performance benefit that we would need to see to feel that it
> > is right to get such a change in?
> 
> Something on the order of %10.  If I recall correctly, that's
> basically what we got on SMP from the RCU changes to the routing
> cache.

Ok. Then we are done. With 58 Itanium processors and 200G RAM I get
more than 10% improvement ;-). With 500 tasks we have 453 vs. 499
j/m/t, i.e. 9.21%. For 300 tasks it is 9.4%, etc. I am sure that I can
push this some more with bigger processor counts and by tackling some
other NUMA-related performance issues.

58p SGI Altix, 200G RAM, running 2.6.12-mm1.

with patch:

AIM Multiuser Benchmark - Suite VII Run Beginning

Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1     2439.02  100      2439.0244      2.46      0.02   Fri Jun 24 00:28:01 2005
  100   138664.20   92      1386.6420      4.33    106.04   Fri Jun 24 00:28:19 2005
  200   191113.23   87       955.5662      6.28    218.09   Fri Jun 24 00:28:45 2005
  300   219968.23   85       733.2274      8.18    329.09   Fri Jun 24 00:29:19 2005
  400   237037.04   85       592.5926     10.12    442.07   Fri Jun 24 00:30:01 2005
  500   249418.02   82       498.8360     12.03    551.93   Fri Jun 24 00:30:51 2005
  600   257879.66   80       429.7994     13.96    663.44   Fri Jun 24 00:31:49 2005
  700   264550.26   84       377.9289     15.88    774.88   Fri Jun 24 00:32:54 2005
  800   269587.19   81       336.9840     17.80    886.85   Fri Jun 24 00:34:07 2005
  900   273736.50   83       304.1517     19.73    997.90   Fri Jun 24 00:35:28 2005
 1000   277225.89   82       277.2259     21.64   1109.18   Fri Jun 24 00:36:57 2005

without patch:

AIM Multiuser Benchmark - Suite VII Run Beginning

Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1     2439.02  100      2439.0244      2.46      0.02   Fri Jun 24 00:46:03 2005
  100   131955.14   92      1319.5514      4.55    114.52   Fri Jun 24 00:46:22 2005
  200   177409.82   88       887.0491      6.76    246.62   Fri Jun 24 00:46:50 2005
  300   201027.47   85       670.0916      8.95    373.49   Fri Jun 24 00:47:27 2005
  400   216313.65   87       540.7841     11.09    497.36   Fri Jun 24 00:48:13 2005
  500   226637.46   83       453.2749     13.24    620.68   Fri Jun 24 00:49:07 2005
  600   233781.41   85       389.6357     15.40    746.18   Fri Jun 24 00:50:10 2005
  700   239561.94   84       342.2313     17.53    869.34   Fri Jun 24 00:51:22 2005
  800   243630.09   85       304.5376     19.70    996.39   Fri Jun 24 00:52:43 2005
  900   246992.64   85       274.4363     21.86   1121.90   Fri Jun 24 00:54:13 2005
 1000   249979.17   84       249.9792     24.00   1245.55   Fri Jun 24 00:55:52 2005



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dst_entry structure use,lastuse and refcnt abstraction
  2005-06-24  6:03               ` Christoph Lameter
@ 2005-06-24  6:16                 ` David S. Miller
  0 siblings, 0 replies; 17+ messages in thread
From: David S. Miller @ 2005-06-24  6:16 UTC (permalink / raw)
  To: clameter; +Cc: linux-kernel, linux-net, shai, akpm, netdev, herbert

From: Christoph Lameter <clameter@engr.sgi.com>
Date: Thu, 23 Jun 2005 23:03:45 -0700 (PDT)

> Ok. Then we are done. With 58 Itanium processors and 200G RAM I get
> more than 10% improvement ;-). With 500 tasks we have 453 vs. 499
> j/m/t, i.e. 9.21%. For 300 tasks it is 9.4%, etc. I am sure that I can
> push this some more with bigger processor counts and by tackling some
> other NUMA-related performance issues.

So it took 7 times more processors to increase the performance gain
by a factor of just over 3 on a microscopic synthetic benchmark.
That's not impressive at all.

And you still haven't shown what happens for the workloads I
suggested.  A web benchmark, with say a thousand unique clients, would
be sufficient for one of those, btw.  That case has very low dst
locality, yet dsts are useful because you'll have about 2 or 3
concurrent connections per dst.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dst numa: Avoid dst counter cacheline bouncing
  2005-06-24  5:05     ` Christoph Lameter
@ 2005-06-24  7:29       ` Dipankar Sarma
  0 siblings, 0 replies; 17+ messages in thread
From: Dipankar Sarma @ 2005-06-24  7:29 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, netdev, davem, shai, akpm

On Thu, Jun 23, 2005 at 10:05:09PM -0700, Christoph Lameter wrote:
> On Fri, 24 Jun 2005, Dipankar Sarma wrote:
> 
> > Do we really need a distributed reference counter implementation
> > inside the dst cache code?  If you are willing to wait for a while,
> > we should soon have Rusty's bigref implementation modified to sit on
> > top of the interleaving dynamic per-cpu allocator.  We can then look
> > at a distributed reference counter for the dst refcount and see how
> > that can be worked out.
> 
> Is that code available somewhere?

In various lkml discussions -- search for the threads on the dynamic
per-cpu allocator.  Bharata is currently adding cpu hotplug awareness
to it, but the basic patches work.

BTW, I am not saying that bigref has what you need.  What I am trying
to say is that you should see if something like bigref can be tweaked
for use in your case before implementing a new type of reference
counting wholly in the dst code.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] bugfix and scalability changes in net/ipv4/route.c
  2005-06-24  4:11             ` David S. Miller
  2005-06-24  6:03               ` Christoph Lameter
@ 2005-06-24 10:57               ` Eric Dumazet
  2005-06-28 20:14                 ` David S. Miller
  1 sibling, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2005-06-24 10:57 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: linux-net, herbert

[-- Attachment #1: Type: text/plain, Size: 1117 bytes --]

Hi David

This is based on a previous patch I sent 3 months ago, reduced to
only the essential part, since the previous version met with no
success on netdev (some people said they were working on this stuff,
but there has still been no progress).

Reminder of the bugfix:

rt_check_expire() has a serious problem on machines with large
route caches and a standard HZ value of 1000.

With the default values, i.e. ip_rt_gc_interval = 60*HZ = 60000,
the loop count:

     for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;

overflows (t is a 31-bit value) as soon as rt_hash_log is >= 16
(65536 slots in the route cache hash table).

In this case, rt_check_expire() does nothing at all.
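
To make the overflow concrete, here is a small stand-alone
demonstration of the same arithmetic (a userspace test, not kernel
code; signed overflow is formally undefined, but it wraps negative on
the usual two's-complement targets):

	#include <stdio.h>

	int main(void)
	{
		int ip_rt_gc_interval = 60 * 1000;	/* 60*HZ with HZ=1000 */
		int rt_hash_log = 16;			/* 65536 hash slots */
		int t = ip_rt_gc_interval << rt_hash_log;

		/* 60000 << 16 = 3932160000, which exceeds INT_MAX
		 * (2147483647), so t wraps to a negative value and the
		 * "for (...; t >= 0; ...)" loop body never runs. */
		printf("t = %d\n", t);
		return 0;
	}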


Thank you

Eric Dumazet


[NET] Scalability fixes in net/ipv4/route.c, bugfix in rt_check_expire

  - Locking abstraction
  - Spinlocks moved out of the rt hash table: 50% less memory used by the rt hash table; it's a win even on UP (see the note after this list).
  - Sizing of the spinlock table depends on NR_CPUS
  - rt hash table allocated using the alloc_large_system_hash() function
  - rt_check_expire() fixes (an overflow was possible)
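
For reference, the arithmetic behind the 50% figure (my reading of the
struct layout, worth double-checking): the old rt_hash_bucket held a
chain pointer plus an embedded spinlock_t, padded to 8 bytes by
__attribute__((__aligned__(8))); reduced to a bare chain pointer it
shrinks to 4 bytes on 32-bit (8 instead of 16 on 64-bit), and even on
UP, where spinlock_t is empty, the old alignment attribute wasted the
same padding.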

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


[-- Attachment #2: patch.route --]
[-- Type: text/plain, Size: 7698 bytes --]

--- linux-2.6.12/net/ipv4/route.c	2005-06-17 21:48:29.000000000 +0200
+++ linux-2.6.12-ed/net/ipv4/route.c	2005-06-24 11:10:06.000000000 +0200
@@ -54,6 +54,7 @@
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -201,8 +203,37 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
+ * The size of this table is a power of two and depends on the number of CPUS.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ	4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ	2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ	1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ	512
+#else
+#define RT_HASH_LOCK_SZ	256
+#endif
+
+	static spinlock_t	*rt_hash_locks;
+# define rt_hash_lock_addr(slot) &rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init()	{ \
+		int i; \
+		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_locks[i]); \
+		}
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -575,19 +606,24 @@
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	int unsigned i = rover, goal;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
-
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
+	u64 mult ;
+	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	for ( ; goal > 0 ; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		spin_lock(&rt_hash_table[i].lock);
+		if (*rthp == 0) continue ;
+		spin_lock(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
@@ -620,8 +656,7 @@
  			rt_free(rth);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
-		spin_unlock(&rt_hash_table[i].lock);
-
+		spin_unlock(rt_hash_lock_addr(i));
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
@@ -643,11 +678,11 @@
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -780,7 +815,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -812,7 +847,7 @@
 				goal--;
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -882,7 +917,7 @@
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 		if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +943,7 @@
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -949,7 +984,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -990,7 +1025,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -1058,7 +1093,7 @@
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1102,7 @@
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3069,12 +3104,14 @@
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
+	int rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
 
 #ifdef CONFIG_NET_CLS_ROUTE
+	{
+	int order;
 	for (order = 0;
 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
 		/* NOTHING */;
@@ -3082,6 +3119,7 @@
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
+	}
 #endif
 
 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3092,39 +3130,22 @@
 	if (!ipv4_dst_ops.kmem_cachep)
 		panic("IP: failed to allocate ip_dst_cache\n");
 
-	goal = num_physpages >> (26 - PAGE_SHIFT);
-	if (rhash_entries)
-		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
-	for (order = 0; (1UL << order) < goal; order++)
-		/* NOTHING */;
-
-	do {
-		rt_hash_mask = (1UL << order) * PAGE_SIZE /
-			sizeof(struct rt_hash_bucket);
-		while (rt_hash_mask & (rt_hash_mask - 1))
-			rt_hash_mask--;
-		rt_hash_table = (struct rt_hash_bucket *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (rt_hash_table == NULL && --order > 0);
-
-	if (!rt_hash_table)
-		panic("Failed to allocate IP route cache hash table\n");
-
-	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
-	       rt_hash_mask,
-	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
-
-	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
-		/* NOTHING */;
-
+	rt_hash_table = (struct rt_hash_bucket *)
+		alloc_large_system_hash("IP route cache",
+					sizeof(struct rt_hash_bucket),
+					rhash_entries,
+					(num_physpages >= 128 * 1024) ?
+						(27 - PAGE_SHIFT) :
+						(29 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&rt_hash_log,
+					&rt_hash_mask,
+					0);
+	memset(rt_hash_table, 0, rt_hash_mask * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
+	ipv4_dst_ops.gc_thresh = rt_hash_mask;
+	ip_rt_max_size = rt_hash_mask * 16;
 	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
 	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bugfix and scalability changes in net/ipv4/route.c
  2005-06-24 10:57               ` [PATCH] bugfix and scalability changes in net/ipv4/route.c Eric Dumazet
@ 2005-06-28 20:14                 ` David S. Miller
  0 siblings, 0 replies; 17+ messages in thread
From: David S. Miller @ 2005-06-28 20:14 UTC (permalink / raw)
  To: dada1; +Cc: netdev, linux-net, herbert

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 24 Jun 2005 12:57:47 +0200

> reminder of the bugfix :
> 
> rt_check_expire() has a serious problem on machines with large
> route caches and a standard HZ value of 1000.
> 
> With the default values, i.e. ip_rt_gc_interval = 60*HZ = 60000,
> the loop count:
> 
>      for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
> 
> overflows (t is a 31-bit value) as soon as rt_hash_log is >= 16
> (65536 slots in the route cache hash table).
> 
> In this case, rt_check_expire() does nothing at all.

I'd like this bug fix as a separate patch.

Also, coding style problems in the spinlock part of the patch:

+	static spinlock_t	*rt_hash_locks;

That's a file static variable, no need to tab it at all.

+		if (*rthp == 0) continue ;

Please put the continue on a separate line, properly
tabbed, and without a space between the continue and the
closing semicolon.
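
I.e., something along these lines (a sketch of the requested style):

	if (*rthp == 0)
		continue;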

Please scan the rest of your patch for problems like this.

So, again, please submit a separate patch for the overflow
bug, then one for the locking changes, so they may be evaluated
and applied separately.

Thanks a lot.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2005-06-28 20:14 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-06-24  3:04 [PATCH] dst_entry structure use,lastuse and refcnt abstraction Christoph Lameter
2005-06-24  3:10 ` [PATCH] dst numa: Avoid dst counter cacheline bouncing Christoph Lameter
2005-06-24  3:32   ` David S. Miller
2005-06-24  4:58   ` Dipankar Sarma
2005-06-24  5:05     ` Christoph Lameter
2005-06-24  7:29       ` Dipankar Sarma
2005-06-24  3:36 ` [PATCH] dst_entry structure use,lastuse and refcnt abstraction David S. Miller
2005-06-24  3:40   ` Christoph Lameter
2005-06-24  3:47     ` David S. Miller
2005-06-24  3:49       ` Christoph Lameter
2005-06-24  3:54         ` David S. Miller
2005-06-24  4:06           ` Christoph Lameter
2005-06-24  4:11             ` David S. Miller
2005-06-24  6:03               ` Christoph Lameter
2005-06-24  6:16                 ` David S. Miller
2005-06-24 10:57               ` [PATCH] bugfix and scalability changes in net/ipv4/route.c Eric Dumazet
2005-06-28 20:14                 ` David S. Miller
