All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] net: add real socket cookies
@ 2015-03-10 22:45 Eric Dumazet
  2015-03-11 21:53 ` David Miller
  0 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-10 22:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Eric Salo

From: Eric Dumazet <edumazet@google.com>

A long standing problem in netlink socket dumps is the use
of kernel socket addresses as cookies.

1) It is a security concern.

2) Sockets can be reused quite quickly, so there is
   no guarantee a cookie is used once and identify
   a flow.

3) request sock, establish sock, and timewait socks
   for a given flow have different cookies.


Part of our effort to bring better TCP statistics requires
to switch to a different allocator.

In this patch, I chose to use a per network namespace 64bit generator,
and to use it only in the case a socket needs to be dumped to netlink.

Note that I tried to carry cookies from request sock, to establish sock,
then timewait sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Salo <salo@google.com>
---
 include/linux/sock_diag.h        |    4 +--
 include/net/inet_sock.h          |    2 +
 include/net/inet_timewait_sock.h |    1 
 include/net/net_namespace.h      |    2 +
 include/net/sock.h               |    3 ++
 net/core/sock.c                  |    1 
 net/core/sock_diag.c             |   37 +++++++++++++++++++++--------
 net/dccp/ipv4.c                  |    2 +
 net/ipv4/inet_connection_sock.c  |    3 +-
 net/ipv4/inet_diag.c             |   14 +++++++---
 net/ipv4/inet_timewait_sock.c    |    1 
 net/ipv4/syncookies.c            |    1 
 net/ipv4/tcp_input.c             |    2 +
 13 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index b5ad7d35a636..083ac388098e 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -19,8 +19,8 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie);
-void sock_diag_save_cookie(void *sk, __u32 *cookie);
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
 
 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index eb16c7beed1e..e565afdc14ad 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -77,6 +77,8 @@ struct inet_request_sock {
 #define ir_v6_rmt_addr		req.__req_common.skc_v6_daddr
 #define ir_v6_loc_addr		req.__req_common.skc_v6_rcv_saddr
 #define ir_iif			req.__req_common.skc_bound_dev_if
+#define ir_cookie		req.__req_common.skc_cookie
+#define ireq_net		req.__req_common.skc_net
 
 	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6c566034e26d..b7ce1003c429 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -122,6 +122,7 @@ struct inet_timewait_sock {
 #define tw_v6_rcv_saddr    	__tw_common.skc_v6_rcv_saddr
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
+#define tw_cookie		__tw_common.skc_cookie
 
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 2cb9acb618e9..e086f4030dd2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,8 @@ struct net {
 #endif
 	spinlock_t		rules_mod_lock;
 
+	atomic64_t		cookie_gen;
+
 	struct list_head	list;		/* list of network namespaces */
 	struct list_head	cleanup_list;	/* namespaces on death row */
 	struct list_head	exit_list;	/* Use only net_mutex */
diff --git a/include/net/sock.h b/include/net/sock.h
index 250822cc1e02..d996c633bec2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -199,6 +199,8 @@ struct sock_common {
 	struct in6_addr		skc_v6_rcv_saddr;
 #endif
 
+	atomic64_t		skc_cookie;
+
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -329,6 +331,7 @@ struct sock {
 #define sk_net			__sk_common.skc_net
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
+#define sk_cookie		__sk_common.skc_cookie
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
diff --git a/net/core/sock.c b/net/core/sock.c
index 726e1f99aa8d..a9a9c2ff9260 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1538,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
+		atomic64_set(&newsk->sk_cookie, 0);
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 96e70ee05a8d..74dddf84adcd 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -13,22 +13,39 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
 static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
 static DEFINE_MUTEX(sock_diag_table_mutex);
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie)
+static u64 sock_gen_cookie(struct sock *sk)
 {
-	if ((cookie[0] != INET_DIAG_NOCOOKIE ||
-	     cookie[1] != INET_DIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
-		return -ESTALE;
-	else
+	while (1) {
+		u64 res = atomic64_read(&sk->sk_cookie);
+
+		if (res)
+			return res;
+		res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+		atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+	}
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+	u64 res;
+
+	if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
 		return 0;
+
+	res = sock_gen_cookie(sk);
+	if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+		return -ESTALE;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
 
-void sock_diag_save_cookie(void *sk, __u32 *cookie)
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
 {
-	cookie[0] = (u32)(unsigned long)sk;
-	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+	u64 res = sock_gen_cookie(sk);
+
+	cookie[0] = (u32)res;
+	cookie[1] = (u32)(res >> 32);
 }
 EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e45b968613a4..207281ae3536 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -641,6 +641,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+	ireq->ireq_net = sock_net(sk);
+	atomic64_set(&ireq->ir_cookie, 0);
 
 	/*
 	 * Step 3: Process LISTEN state
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 14d02ea905b6..a097ae38b639 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -678,7 +678,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newsk->sk_mark = inet_rsk(req)->ir_mark;
-
+		atomic64_cmpxchg(&newsk->sk_cookie, 0,
+				 atomic64_read(&inet_rsk(req)->ir_cookie));
 		newicsk->icsk_retransmits = 0;
 		newicsk->icsk_backoff	  = 0;
 		newicsk->icsk_probes_out  = 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ac3bfb458afd..29317ff4a007 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -221,12 +221,13 @@ static int inet_csk_diag_fill(struct sock *sk,
 				 user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_fill(struct sock *sk,
 			       struct sk_buff *skb,
 			       const struct inet_diag_req_v2 *req,
 			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
+	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	s32 tmo;
@@ -247,7 +248,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_retrans      = 0;
 
 	r->id.idiag_if	      = tw->tw_bound_dev_if;
-	sock_diag_save_cookie(tw, r->id.idiag_cookie);
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 
 	r->id.idiag_sport     = tw->tw_sport;
 	r->id.idiag_dport     = tw->tw_dport;
@@ -283,7 +284,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
-		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+		return inet_twsk_diag_fill(sk, skb, r, portid, seq,
 					   nlmsg_flags, unlh);
 
 	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
@@ -675,7 +676,7 @@ static int inet_twsk_diag_dump(struct sock *sk,
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
 
-	return inet_twsk_diag_fill(inet_twsk(sk), skb, r,
+	return inet_twsk_diag_fill(sk, skb, r,
 				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -734,7 +735,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_retrans = req->num_retrans;
 
 	r->id.idiag_if = sk->sk_bound_dev_if;
-	sock_diag_save_cookie(req, r->id.idiag_cookie);
+
+	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+		     offsetof(struct sock, sk_cookie));
+	sock_diag_save_cookie((struct sock *)ireq, r->id.idiag_cookie);
 
 	tmo = req->expires - jiffies;
 	if (tmo < 0)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8555fb..2bd980526631 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -195,6 +195,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_ipv6only	    = 0;
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
+		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c5238e..ece31b426013 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,6 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->listener		= NULL;
+	ireq->ireq_net		= sock_net(sk);
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb4cf8b8e121..d7045f5f6ebf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5965,6 +5965,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
+	inet_rsk(req)->ireq_net = sock_net(sk);
+	atomic64_set(&inet_rsk(req)->ir_cookie, 0);
 
 	af_ops->init_req(req, sk, skb);
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: add real socket cookies
  2015-03-10 22:45 [PATCH net-next] net: add real socket cookies Eric Dumazet
@ 2015-03-11 21:53 ` David Miller
  2015-03-11 22:30   ` Eric Dumazet
  0 siblings, 1 reply; 22+ messages in thread
From: David Miller @ 2015-03-11 21:53 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, salo

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 10 Mar 2015 15:45:33 -0700

> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 14d02ea905b6..a097ae38b639 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -678,7 +678,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
>  		newsk->sk_write_space = sk_stream_write_space;
>  
>  		newsk->sk_mark = inet_rsk(req)->ir_mark;
> -
> +		atomic64_cmpxchg(&newsk->sk_cookie, 0,
> +				 atomic64_read(&inet_rsk(req)->ir_cookie));
>  		newicsk->icsk_retransmits = 0;
>  		newicsk->icsk_backoff	  = 0;
>  		newicsk->icsk_probes_out  = 0;

I think you have to be more careful here.

sk_clone_lock() is not going to zero out sk_cookie for you, it only
does so if the priority has __GFP_ZERO in it and some callsites use
just plain GFP_ATOMIC.

Therefore, you can't just assume sk_cookie is zero.

Just use atomic64_set() here, you have exclusive access to this piece
of memory at this point in time, and you'll save an unnecessary atomic
operation as well.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: add real socket cookies
  2015-03-11 21:53 ` David Miller
@ 2015-03-11 22:30   ` Eric Dumazet
  2015-03-11 22:56     ` [PATCH v2 " Eric Dumazet
  2015-03-12  1:21     ` [PATCH " David Miller
  0 siblings, 2 replies; 22+ messages in thread
From: Eric Dumazet @ 2015-03-11 22:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo

On Wed, 2015-03-11 at 17:53 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 10 Mar 2015 15:45:33 -0700
> 
> > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> > index 14d02ea905b6..a097ae38b639 100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -678,7 +678,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
> >  		newsk->sk_write_space = sk_stream_write_space;
> >  
> >  		newsk->sk_mark = inet_rsk(req)->ir_mark;
> > -
> > +		atomic64_cmpxchg(&newsk->sk_cookie, 0,
> > +				 atomic64_read(&inet_rsk(req)->ir_cookie));
> >  		newicsk->icsk_retransmits = 0;
> >  		newicsk->icsk_backoff	  = 0;
> >  		newicsk->icsk_probes_out  = 0;
> 
> I think you have to be more careful here.
> 
> sk_clone_lock() is not going to zero out sk_cookie for you, it only
> does so if the priority has __GFP_ZERO in it and some callsites use
> just plain GFP_ATOMIC.
> 
> Therefore, you can't just assume sk_cookie is zero.
> 
> Just use atomic64_set() here, you have exclusive access to this piece
> of memory at this point in time, and you'll save an unnecessary atomic
> operation as well.

Note that sk_clone_lock() really does the clear for us :

net/core/sock.c:1541:		atomic64_set(&newsk->sk_cookie, 0);

atomic_cmpxchg() is needed here in case a concurrent dumper already
caught the new socket.

I thought of adding a 'u64 cookie' parameter to sk_clone(), but it
probably can be done in a followup patch, not sure it is worth the
complexity.

This would avoid the atomic64_cmpxchg(), but we also simply can avoid
this atomic64_cmpxchg() if inet_rsk(req)->ir_cookie is 0, as it is
probably 0 anyway.

I can send a v2 with :

u64 cookie = atomic64_read(&inet_rsk(req)->ir_cookie);
if (cookie)
	atomic64_cmpxchg(&newsk->sk_cookie, 0, cooke);

Thanks !

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 net-next] net: add real socket cookies
  2015-03-11 22:30   ` Eric Dumazet
@ 2015-03-11 22:56     ` Eric Dumazet
  2015-03-12  1:53       ` [PATCH v3 " Eric Dumazet
  2015-03-12  1:21     ` [PATCH " David Miller
  1 sibling, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-11 22:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo

From: Eric Dumazet <edumazet@google.com>

A long standing problem in netlink socket dumps is the use
of kernel socket addresses as cookies.

1) It is a security concern.

2) Sockets can be reused quite quickly, so there is
   no guarantee a cookie is used once and identify
   a flow.

3) request sock, establish sock, and timewait socks
   for a given flow have different cookies.


Part of our effort to bring better TCP statistics requires
to switch to a different allocator.

In this patch, I chose to use a per network namespace 64bit generator,
and to use it only in the case a socket needs to be dumped to netlink.
(This might be refined later if needed)

Note that I tried to carry cookies from request sock, to establish sock,
then timewait sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Salo <salo@google.com>
---
v2: added a comment in inet_csk_clone_lock() and try to avoid
atomic64_cmpxchg() after David feedback.

 include/linux/sock_diag.h        |    4 +--
 include/net/inet_sock.h          |    2 +
 include/net/inet_timewait_sock.h |    1 
 include/net/net_namespace.h      |    2 +
 include/net/sock.h               |    3 ++
 net/core/sock.c                  |    1 
 net/core/sock_diag.c             |   37 +++++++++++++++++++++--------
 net/dccp/ipv4.c                  |    2 +
 net/ipv4/inet_connection_sock.c  |   11 +++++++-
 net/ipv4/inet_diag.c             |   14 +++++++---
 net/ipv4/inet_timewait_sock.c    |    1 
 net/ipv4/syncookies.c            |    1 
 net/ipv4/tcp_input.c             |    2 +
 13 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index b5ad7d35a636..083ac388098e 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -19,8 +19,8 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie);
-void sock_diag_save_cookie(void *sk, __u32 *cookie);
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
 
 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index eb16c7beed1e..e565afdc14ad 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -77,6 +77,8 @@ struct inet_request_sock {
 #define ir_v6_rmt_addr		req.__req_common.skc_v6_daddr
 #define ir_v6_loc_addr		req.__req_common.skc_v6_rcv_saddr
 #define ir_iif			req.__req_common.skc_bound_dev_if
+#define ir_cookie		req.__req_common.skc_cookie
+#define ireq_net		req.__req_common.skc_net
 
 	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6c566034e26d..b7ce1003c429 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -122,6 +122,7 @@ struct inet_timewait_sock {
 #define tw_v6_rcv_saddr    	__tw_common.skc_v6_rcv_saddr
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
+#define tw_cookie		__tw_common.skc_cookie
 
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 2cb9acb618e9..e086f4030dd2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,8 @@ struct net {
 #endif
 	spinlock_t		rules_mod_lock;
 
+	atomic64_t		cookie_gen;
+
 	struct list_head	list;		/* list of network namespaces */
 	struct list_head	cleanup_list;	/* namespaces on death row */
 	struct list_head	exit_list;	/* Use only net_mutex */
diff --git a/include/net/sock.h b/include/net/sock.h
index 250822cc1e02..d996c633bec2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -199,6 +199,8 @@ struct sock_common {
 	struct in6_addr		skc_v6_rcv_saddr;
 #endif
 
+	atomic64_t		skc_cookie;
+
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -329,6 +331,7 @@ struct sock {
 #define sk_net			__sk_common.skc_net
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
+#define sk_cookie		__sk_common.skc_cookie
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
diff --git a/net/core/sock.c b/net/core/sock.c
index 726e1f99aa8d..a9a9c2ff9260 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1538,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
+		atomic64_set(&newsk->sk_cookie, 0);
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 96e70ee05a8d..74dddf84adcd 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -13,22 +13,39 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
 static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
 static DEFINE_MUTEX(sock_diag_table_mutex);
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie)
+static u64 sock_gen_cookie(struct sock *sk)
 {
-	if ((cookie[0] != INET_DIAG_NOCOOKIE ||
-	     cookie[1] != INET_DIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
-		return -ESTALE;
-	else
+	while (1) {
+		u64 res = atomic64_read(&sk->sk_cookie);
+
+		if (res)
+			return res;
+		res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+		atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+	}
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+	u64 res;
+
+	if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
 		return 0;
+
+	res = sock_gen_cookie(sk);
+	if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+		return -ESTALE;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
 
-void sock_diag_save_cookie(void *sk, __u32 *cookie)
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
 {
-	cookie[0] = (u32)(unsigned long)sk;
-	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+	u64 res = sock_gen_cookie(sk);
+
+	cookie[0] = (u32)res;
+	cookie[1] = (u32)(res >> 32);
 }
 EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e45b968613a4..207281ae3536 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -641,6 +641,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+	ireq->ireq_net = sock_net(sk);
+	atomic64_set(&ireq->ir_cookie, 0);
 
 	/*
 	 * Step 3: Process LISTEN state
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 14d02ea905b6..0873885ca43b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -666,8 +666,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 {
 	struct sock *newsk = sk_clone_lock(sk, priority);
 
-	if (newsk != NULL) {
+	if (newsk) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
+		u64 cookie;
 
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
@@ -679,6 +680,14 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 
 		newsk->sk_mark = inet_rsk(req)->ir_mark;
 
+		/* sk_clone_lock() cleared newsk->sk_cookie, but maybe we
+		 * had a cookie on the request sock. Avoid the expensive
+		 * atomic operation if possible.
+		 */
+		cookie = atomic64_read(&inet_rsk(req)->ir_cookie);
+		if (unlikely(cookie))
+			atomic64_cmpxchg(&newsk->sk_cookie, 0, cookie);
+
 		newicsk->icsk_retransmits = 0;
 		newicsk->icsk_backoff	  = 0;
 		newicsk->icsk_probes_out  = 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ac3bfb458afd..29317ff4a007 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -221,12 +221,13 @@ static int inet_csk_diag_fill(struct sock *sk,
 				 user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_fill(struct sock *sk,
 			       struct sk_buff *skb,
 			       const struct inet_diag_req_v2 *req,
 			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
+	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	s32 tmo;
@@ -247,7 +248,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_retrans      = 0;
 
 	r->id.idiag_if	      = tw->tw_bound_dev_if;
-	sock_diag_save_cookie(tw, r->id.idiag_cookie);
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 
 	r->id.idiag_sport     = tw->tw_sport;
 	r->id.idiag_dport     = tw->tw_dport;
@@ -283,7 +284,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
-		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+		return inet_twsk_diag_fill(sk, skb, r, portid, seq,
 					   nlmsg_flags, unlh);
 
 	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
@@ -675,7 +676,7 @@ static int inet_twsk_diag_dump(struct sock *sk,
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
 
-	return inet_twsk_diag_fill(inet_twsk(sk), skb, r,
+	return inet_twsk_diag_fill(sk, skb, r,
 				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -734,7 +735,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_retrans = req->num_retrans;
 
 	r->id.idiag_if = sk->sk_bound_dev_if;
-	sock_diag_save_cookie(req, r->id.idiag_cookie);
+
+	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+		     offsetof(struct sock, sk_cookie));
+	sock_diag_save_cookie((struct sock *)ireq, r->id.idiag_cookie);
 
 	tmo = req->expires - jiffies;
 	if (tmo < 0)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8555fb..2bd980526631 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -195,6 +195,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_ipv6only	    = 0;
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
+		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c5238e..ece31b426013 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,6 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->listener		= NULL;
+	ireq->ireq_net		= sock_net(sk);
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb4cf8b8e121..d7045f5f6ebf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5965,6 +5965,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
+	inet_rsk(req)->ireq_net = sock_net(sk);
+	atomic64_set(&inet_rsk(req)->ir_cookie, 0);
 
 	af_ops->init_req(req, sk, skb);
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: add real socket cookies
  2015-03-11 22:30   ` Eric Dumazet
  2015-03-11 22:56     ` [PATCH v2 " Eric Dumazet
@ 2015-03-12  1:21     ` David Miller
  2015-03-12  1:31       ` Eric Dumazet
  1 sibling, 1 reply; 22+ messages in thread
From: David Miller @ 2015-03-12  1:21 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, salo

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Mar 2015 15:30:35 -0700

> Note that sk_clone_lock() really does the clear for us :
> 
> net/core/sock.c:1541:		atomic64_set(&newsk->sk_cookie, 0);
> 
> atomic_cmpxchg() is needed here in case a concurrent dumper already
> caught the new socket.

How is newsk visible at this moment?

It is freshly allocated, and not present in any tables.  Nothing even
indirectly links to it at this point.

Visibility only happens after we return from this function, in
routines such as tcp_v4_syn_recv_sock() which do the hash table
insertion.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: add real socket cookies
  2015-03-12  1:21     ` [PATCH " David Miller
@ 2015-03-12  1:31       ` Eric Dumazet
  2015-03-12  1:54         ` David Miller
  0 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  1:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo

On Wed, 2015-03-11 at 21:21 -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 11 Mar 2015 15:30:35 -0700
> 
> > Note that sk_clone_lock() really does the clear for us :
> > 
> > net/core/sock.c:1541:		atomic64_set(&newsk->sk_cookie, 0);
> > 
> > atomic_cmpxchg() is needed here in case a concurrent dumper already
> > caught the new socket.
> 
> How is newsk visible at this moment?
> 
> It is freshly allocated, and not present in any tables.  Nothing even
> indirectly links to it at this point.
> 
> Visibility only happens after we return from this function, in
> routines such as tcp_v4_syn_recv_sock() which do the hash table
> insertion.

Yes, but remember we use RCU with SLAB_DESTROY_BY_RCU thing,
and inet_diag_dump_one_icsk() uses RCU as well.

I wanted to be extra careful here. Note that with v2, this
atomic_cmpxchg() wont happen unless one dump operation previously
revealed the SYN_RECV to the world.

I have no strong opinion and definitely it is currently safe to use
atomic64_set() here.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v3 net-next] net: add real socket cookies
  2015-03-11 22:56     ` [PATCH v2 " Eric Dumazet
@ 2015-03-12  1:53       ` Eric Dumazet
  2015-03-12  1:55         ` David Miller
  0 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  1:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo

From: Eric Dumazet <edumazet@google.com>

A long standing problem in netlink socket dumps is the use
of kernel socket addresses as cookies.

1) It is a security concern.

2) Sockets can be reused quite quickly, so there is
   no guarantee a cookie is used once and identify
   a flow.

3) request sock, establish sock, and timewait socks
   for a given flow have different cookies.


Part of our effort to bring better TCP statistics requires
to switch to a different allocator.

In this patch, I chose to use a per network namespace 64bit generator,
and to use it only in the case a socket needs to be dumped to netlink.
(This might be refined later if needed)

Note that I tried to carry cookies from request sock, to establish sock,
then timewait sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Salo <salo@google.com>
---
v3:  use atomic64_set() in inet_csk_clone_lock() as David requested.

 include/linux/sock_diag.h        |    4 +--
 include/net/inet_sock.h          |    2 +
 include/net/inet_timewait_sock.h |    1 
 include/net/net_namespace.h      |    2 +
 include/net/sock.h               |    3 ++
 net/core/sock.c                  |    1 
 net/core/sock_diag.c             |   37 +++++++++++++++++++++--------
 net/dccp/ipv4.c                  |    2 +
 net/ipv4/inet_connection_sock.c  |    2 +
 net/ipv4/inet_diag.c             |   14 +++++++---
 net/ipv4/inet_timewait_sock.c    |    1 
 net/ipv4/syncookies.c            |    1 
 net/ipv4/tcp_input.c             |    2 +
 13 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index b5ad7d35a636..083ac388098e 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -19,8 +19,8 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie);
-void sock_diag_save_cookie(void *sk, __u32 *cookie);
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
 
 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index eb16c7beed1e..e565afdc14ad 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -77,6 +77,8 @@ struct inet_request_sock {
 #define ir_v6_rmt_addr		req.__req_common.skc_v6_daddr
 #define ir_v6_loc_addr		req.__req_common.skc_v6_rcv_saddr
 #define ir_iif			req.__req_common.skc_bound_dev_if
+#define ir_cookie		req.__req_common.skc_cookie
+#define ireq_net		req.__req_common.skc_net
 
 	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6c566034e26d..b7ce1003c429 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -122,6 +122,7 @@ struct inet_timewait_sock {
 #define tw_v6_rcv_saddr    	__tw_common.skc_v6_rcv_saddr
 #define tw_dport		__tw_common.skc_dport
 #define tw_num			__tw_common.skc_num
+#define tw_cookie		__tw_common.skc_cookie
 
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 2cb9acb618e9..e086f4030dd2 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,8 @@ struct net {
 #endif
 	spinlock_t		rules_mod_lock;
 
+	atomic64_t		cookie_gen;
+
 	struct list_head	list;		/* list of network namespaces */
 	struct list_head	cleanup_list;	/* namespaces on death row */
 	struct list_head	exit_list;	/* Use only net_mutex */
diff --git a/include/net/sock.h b/include/net/sock.h
index 250822cc1e02..d996c633bec2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -199,6 +199,8 @@ struct sock_common {
 	struct in6_addr		skc_v6_rcv_saddr;
 #endif
 
+	atomic64_t		skc_cookie;
+
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -329,6 +331,7 @@ struct sock {
 #define sk_net			__sk_common.skc_net
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
+#define sk_cookie		__sk_common.skc_cookie
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
diff --git a/net/core/sock.c b/net/core/sock.c
index 726e1f99aa8d..a9a9c2ff9260 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1538,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
+		atomic64_set(&newsk->sk_cookie, 0);
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 96e70ee05a8d..74dddf84adcd 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -13,22 +13,39 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
 static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
 static DEFINE_MUTEX(sock_diag_table_mutex);
 
-int sock_diag_check_cookie(void *sk, const __u32 *cookie)
+static u64 sock_gen_cookie(struct sock *sk)
 {
-	if ((cookie[0] != INET_DIAG_NOCOOKIE ||
-	     cookie[1] != INET_DIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
-		return -ESTALE;
-	else
+	while (1) {
+		u64 res = atomic64_read(&sk->sk_cookie);
+
+		if (res)
+			return res;
+		res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+		atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+	}
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+	u64 res;
+
+	if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
 		return 0;
+
+	res = sock_gen_cookie(sk);
+	if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+		return -ESTALE;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
 
-void sock_diag_save_cookie(void *sk, __u32 *cookie)
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
 {
-	cookie[0] = (u32)(unsigned long)sk;
-	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+	u64 res = sock_gen_cookie(sk);
+
+	cookie[0] = (u32)res;
+	cookie[1] = (u32)(res >> 32);
 }
 EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e45b968613a4..207281ae3536 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -641,6 +641,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+	ireq->ireq_net = sock_net(sk);
+	atomic64_set(&ireq->ir_cookie, 0);
 
 	/*
 	 * Step 3: Process LISTEN state
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 14d02ea905b6..34581f928afa 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -678,6 +678,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newsk->sk_mark = inet_rsk(req)->ir_mark;
+		atomic64_set(&newsk->sk_cookie,
+			     atomic64_read(&inet_rsk(req)->ir_cookie));
 
 		newicsk->icsk_retransmits = 0;
 		newicsk->icsk_backoff	  = 0;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ac3bfb458afd..29317ff4a007 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -221,12 +221,13 @@ static int inet_csk_diag_fill(struct sock *sk,
 				 user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_fill(struct sock *sk,
 			       struct sk_buff *skb,
 			       const struct inet_diag_req_v2 *req,
 			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
+	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	s32 tmo;
@@ -247,7 +248,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_retrans      = 0;
 
 	r->id.idiag_if	      = tw->tw_bound_dev_if;
-	sock_diag_save_cookie(tw, r->id.idiag_cookie);
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 
 	r->id.idiag_sport     = tw->tw_sport;
 	r->id.idiag_dport     = tw->tw_dport;
@@ -283,7 +284,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
-		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+		return inet_twsk_diag_fill(sk, skb, r, portid, seq,
 					   nlmsg_flags, unlh);
 
 	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
@@ -675,7 +676,7 @@ static int inet_twsk_diag_dump(struct sock *sk,
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
 
-	return inet_twsk_diag_fill(inet_twsk(sk), skb, r,
+	return inet_twsk_diag_fill(sk, skb, r,
 				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -734,7 +735,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_retrans = req->num_retrans;
 
 	r->id.idiag_if = sk->sk_bound_dev_if;
-	sock_diag_save_cookie(req, r->id.idiag_cookie);
+
+	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+		     offsetof(struct sock, sk_cookie));
+	sock_diag_save_cookie((struct sock *)ireq, r->id.idiag_cookie);
 
 	tmo = req->expires - jiffies;
 	if (tmo < 0)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8555fb..2bd980526631 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -195,6 +195,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_ipv6only	    = 0;
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
+		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c5238e..ece31b426013 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,6 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->listener		= NULL;
+	ireq->ireq_net		= sock_net(sk);
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb4cf8b8e121..d7045f5f6ebf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5965,6 +5965,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
+	inet_rsk(req)->ireq_net = sock_net(sk);
+	atomic64_set(&inet_rsk(req)->ir_cookie, 0);
 
 	af_ops->init_req(req, sk, skb);
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: add real socket cookies
  2015-03-12  1:31       ` Eric Dumazet
@ 2015-03-12  1:54         ` David Miller
  0 siblings, 0 replies; 22+ messages in thread
From: David Miller @ 2015-03-12  1:54 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, salo

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Mar 2015 18:31:42 -0700

> Yes, but remember we use RCU with SLAB_DESTROY_BY_RCU thing,
> and inet_diag_dump_one_icsk() uses RCU as well.
> 
> I wanted to be extra careful here. Note that with v2, this
> atomic_cmpxchg() wont happen unless one dump operation previously
> revealed the SYN_RECV to the world.
> 
> I have no strong opinion and definitely it is currently safe to use
> atomic64_set() here.

Ok, it's just that we've worked so hard to minimize atomics in these
paths.

I'm ok with v2 of your patch and have applied it, thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 net-next] net: add real socket cookies
  2015-03-12  1:53       ` [PATCH v3 " Eric Dumazet
@ 2015-03-12  1:55         ` David Miller
  2015-03-12  2:23           ` Eric Dumazet
  2015-03-12  2:54           ` Eric Dumazet
  0 siblings, 2 replies; 22+ messages in thread
From: David Miller @ 2015-03-12  1:55 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, salo

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Mar 2015 18:53:14 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> A long standing problem in netlink socket dumps is the use
> of kernel socket addresses as cookies.
> 
> 1) It is a security concern.
> 
> 2) Sockets can be reused quite quickly, so there is
>    no guarantee a cookie is used once and identify
>    a flow.
> 
> 3) request sock, establish sock, and timewait socks
>    for a given flow have different cookies.
> 
> 
> Part of our effort to bring better TCP statistics requires
> to switch to a different allocator.
> 
> In this patch, I chose to use a per network namespace 64bit generator,
> and to use it only in the case a socket needs to be dumped to netlink.
> (This might be refined later if needed)
> 
> Note that I tried to carry cookies from request sock, to establish sock,
> then timewait sockets.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Eric Salo <salo@google.com>
> ---
> v3:  use atomic64_set() in inet_csk_clone_lock() as David requested.

Oh, even better, I'll push this one actually.

Thanks!

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 net-next] net: add real socket cookies
  2015-03-12  1:55         ` David Miller
@ 2015-03-12  2:23           ` Eric Dumazet
  2015-03-12  2:54           ` Eric Dumazet
  1 sibling, 0 replies; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  2:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo

On Wed, 2015-03-11 at 21:55 -0400, David Miller wrote:

> Oh, even better, I'll push this one actually.
> 
> Thanks!

Perfect, thanks David.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 net-next] net: add real socket cookies
  2015-03-12  1:55         ` David Miller
  2015-03-12  2:23           ` Eric Dumazet
@ 2015-03-12  2:54           ` Eric Dumazet
  2015-03-12  3:21             ` Eric Dumazet
                               ` (2 more replies)
  1 sibling, 3 replies; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  2:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo, Eric W. Biederman

On Wed, 2015-03-11 at 21:55 -0400, David Miller wrote:

> Oh, even better, I'll push this one actually.
> 

I'll send an update, because I broke the build when CONFIG_NET_NS=n

Or maybe we should adapt/push Eric patch 2/8 (net: Introduce
possible_net_t)

Eric, do you mind if I respin your patch, or do you prefer to do this
yourself ?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 net-next] net: add real socket cookies
  2015-03-12  2:54           ` Eric Dumazet
@ 2015-03-12  3:21             ` Eric Dumazet
  2015-03-12  3:27             ` [PATCH net-next] net: fix CONFIG_NET_NS=n compilation Eric Dumazet
  2015-03-12  3:29             ` [PATCH v3 net-next] net: add real socket cookies Eric W. Biederman
  2 siblings, 0 replies; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  3:21 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo, Eric W. Biederman

On Wed, 2015-03-11 at 19:54 -0700, Eric Dumazet wrote:
> On Wed, 2015-03-11 at 21:55 -0400, David Miller wrote:
> 
> > Oh, even better, I'll push this one actually.
> > 
> 
> I'll send an update, because I broke the build when CONFIG_NET_NS=n
> 
> Or maybe we should adapt/push Eric patch 2/8 (net: Introduce
> possible_net_t)
> 
> Eric, do you mind if I respin your patch, or do you prefer to do this
> yourself ?

Hmm.. no hurry, I'll send a small fix in a few minutes, I simply have to
use write_pnet() on few places.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net-next] net: fix CONFIG_NET_NS=n compilation
  2015-03-12  2:54           ` Eric Dumazet
  2015-03-12  3:21             ` Eric Dumazet
@ 2015-03-12  3:27             ` Eric Dumazet
  2015-03-12  3:29               ` David Miller
  2015-03-12  3:29             ` [PATCH v3 net-next] net: add real socket cookies Eric W. Biederman
  2 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  3:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, salo, Eric W. Biederman

From: Eric Dumazet <edumazet@google.com>

I forgot to use write_pnet() in three locations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 33cf7c90fe2f9 ("net: add real socket cookies")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
---
 net/dccp/ipv4.c       |    2 +-
 net/ipv4/syncookies.c |    2 +-
 net/ipv4/tcp_input.c  |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 207281ae3536..a78e0b999f96 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -641,7 +641,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
-	ireq->ireq_net = sock_net(sk);
+	write_pnet(&ireq->ireq_net, sock_net(sk));
 	atomic64_set(&ireq->ir_cookie, 0);
 
 	/*
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index ece31b426013..18e5a67fda81 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -346,7 +346,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->listener		= NULL;
-	ireq->ireq_net		= sock_net(sk);
+	write_pnet(&ireq->ireq_net, sock_net(sk));
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d7045f5f6ebf..26f24995bd3d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5965,7 +5965,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
-	inet_rsk(req)->ireq_net = sock_net(sk);
+	write_pnet(&inet_rsk(req)->ireq_net, sock_net(sk));
 	atomic64_set(&inet_rsk(req)->ir_cookie, 0);
 
 	af_ops->init_req(req, sk, skb);

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 net-next] net: add real socket cookies
  2015-03-12  2:54           ` Eric Dumazet
  2015-03-12  3:21             ` Eric Dumazet
  2015-03-12  3:27             ` [PATCH net-next] net: fix CONFIG_NET_NS=n compilation Eric Dumazet
@ 2015-03-12  3:29             ` Eric W. Biederman
  2 siblings, 0 replies; 22+ messages in thread
From: Eric W. Biederman @ 2015-03-12  3:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, salo

Eric Dumazet <eric.dumazet@gmail.com> writes:

> On Wed, 2015-03-11 at 21:55 -0400, David Miller wrote:
>
>> Oh, even better, I'll push this one actually.
>> 
>
> I'll send an update, because I broke the build when CONFIG_NET_NS=n
>
> Or maybe we should adapt/push Eric patch 2/8 (net: Introduce
> possible_net_t)

Yeah that would make the kind of mistake that broke the build difficult.

> Eric, do you mind if I respin your patch, or do you prefer to do this
> yourself ?

Give me a minute and I will send an updated version of my first two
patches.

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next] net: fix CONFIG_NET_NS=n compilation
  2015-03-12  3:27             ` [PATCH net-next] net: fix CONFIG_NET_NS=n compilation Eric Dumazet
@ 2015-03-12  3:29               ` David Miller
  2015-03-12  4:03                 ` [PATCH net-next 0/2] Introduce possible_net_t Eric W. Biederman
  0 siblings, 1 reply; 22+ messages in thread
From: David Miller @ 2015-03-12  3:29 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, salo, ebiederm

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Mar 2015 20:27:52 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> I forgot to use write_pnet() in three locations.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Fixes: 33cf7c90fe2f9 ("net: add real socket cookies")
> Reported-by: kbuild test robot <fengguang.wu@intel.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net-next 0/2] Introduce possible_net_t
  2015-03-12  3:29               ` David Miller
@ 2015-03-12  4:03                 ` Eric W. Biederman
  2015-03-12  4:04                   ` [PATCH net-next 1/2] net: Kill hold_net release_net Eric W. Biederman
                                     ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Eric W. Biederman @ 2015-03-12  4:03 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, salo


The current usage of write_pnet and read_pnet is a little laborious and
error prone as you only notice if you failed to include them if are
compiling with network namespaces enabled.

possible_net_t remedies that by using a type that is 0 bytes when
network namespaces are disabled and can only be read and written to with
read_pnet and write_pnet.

Aka less work and safer for the same effect.

I kill hold_net and release_net first as are they are haven't been used
since 2008 and are noise at the points where write_pnet and read_pnet
are used.

I have folded in Eric Dumazets suggestions to improve the killing of
hold_net and release net.  And respon.  I had to respin anyway as
there was enough changes elsewhere in the tree the previous version
of these patches did not quite apply cleanly.

Eric W. Biederman (2):
      net: Kill hold_net release_net
      net: Introduce possible_net_t

 include/linux/netdevice.h            |  9 ++-----
 include/net/cfg80211.h               |  4 +--
 include/net/fib_rules.h              |  9 +------
 include/net/genetlink.h              |  4 +--
 include/net/inet_hashtables.h        |  4 +--
 include/net/ip_vs.h                  |  8 +++---
 include/net/neighbour.h              |  8 ++----
 include/net/net_namespace.h          | 52 +++++++++---------------------------
 include/net/netfilter/nf_conntrack.h |  5 ++--
 include/net/sock.h                   |  6 ++---
 include/net/xfrm.h                   |  8 ++----
 net/9p/trans_fd.c                    |  4 +--
 net/core/dev.c                       |  2 --
 net/core/fib_rules.c                 | 17 +++---------
 net/core/neighbour.c                 |  9 ++-----
 net/core/net_namespace.c             | 11 --------
 net/core/sock.c                      |  1 -
 net/ipv4/fib_semantics.c             |  3 +--
 net/ipv4/inet_hashtables.c           |  3 +--
 net/ipv4/inet_timewait_sock.c        |  3 +--
 net/ipv4/ipmr.c                      |  4 +--
 net/ipv6/addrlabel.c                 | 11 ++------
 net/ipv6/ip6_flowlabel.c             |  3 +--
 net/ipv6/ip6mr.c                     |  4 +--
 net/openvswitch/datapath.c           |  4 +--
 net/openvswitch/datapath.h           |  4 +--
 net/packet/internal.h                |  4 +--
 27 files changed, 49 insertions(+), 155 deletions(-)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net-next 1/2] net: Kill hold_net release_net
  2015-03-12  4:03                 ` [PATCH net-next 0/2] Introduce possible_net_t Eric W. Biederman
@ 2015-03-12  4:04                   ` Eric W. Biederman
  2015-03-12  4:35                     ` Eric Dumazet
  2015-03-12  4:06                   ` [PATCH net-next 2/2] net: Introduce possible_net_t Eric W. Biederman
  2015-03-12 18:40                   ` [PATCH net-next 0/2] " David Miller
  2 siblings, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2015-03-12  4:04 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, salo


hold_net and release_net were an idea that turned out to be useless.
The code has been disabled since 2008.  Kill the code it is long past due.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netdevice.h     |  3 +--
 include/net/fib_rules.h       |  9 +--------
 include/net/net_namespace.h   | 29 -----------------------------
 include/net/sock.h            |  2 +-
 net/core/dev.c                |  2 --
 net/core/fib_rules.c          | 17 +++--------------
 net/core/neighbour.c          |  9 ++-------
 net/core/net_namespace.c      | 11 -----------
 net/core/sock.c               |  1 -
 net/ipv4/fib_semantics.c      |  3 +--
 net/ipv4/inet_hashtables.c    |  3 +--
 net/ipv4/inet_timewait_sock.c |  3 +--
 net/ipv6/addrlabel.c          |  5 +----
 net/ipv6/ip6_flowlabel.c      |  3 +--
 net/openvswitch/datapath.c    |  4 +---
 15 files changed, 14 insertions(+), 90 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1354ae83efc8..cede40d9cac9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1864,8 +1864,7 @@ static inline
 void dev_net_set(struct net_device *dev, struct net *net)
 {
 #ifdef CONFIG_NET_NS
-	release_net(dev->nd_net);
-	dev->nd_net = hold_net(net);
+	dev->nd_net = net;
 #endif
 }
 
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 88d2ae526961..6d67383a5114 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -95,17 +95,10 @@ static inline void fib_rule_get(struct fib_rule *rule)
 	atomic_inc(&rule->refcnt);
 }
 
-static inline void fib_rule_put_rcu(struct rcu_head *head)
-{
-	struct fib_rule *rule = container_of(head, struct fib_rule, rcu);
-	release_net(rule->fr_net);
-	kfree(rule);
-}
-
 static inline void fib_rule_put(struct fib_rule *rule)
 {
 	if (atomic_dec_and_test(&rule->refcnt))
-		call_rcu(&rule->rcu, fib_rule_put_rcu);
+		kfree_rcu(rule, rcu);
 }
 
 static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e086f4030dd2..fab51ceeabf3 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -49,11 +49,6 @@ struct net {
 	atomic_t		count;		/* To decided when the network
 						 *  namespace should be shut down.
 						 */
-#ifdef NETNS_REFCNT_DEBUG
-	atomic_t		use_count;	/* To track references we
-						 * destroy on demand
-						 */
-#endif
 	spinlock_t		rules_mod_lock;
 
 	atomic64_t		cookie_gen;
@@ -236,30 +231,6 @@ int net_eq(const struct net *net1, const struct net *net2)
 #endif
 
 
-#ifdef NETNS_REFCNT_DEBUG
-static inline struct net *hold_net(struct net *net)
-{
-	if (net)
-		atomic_inc(&net->use_count);
-	return net;
-}
-
-static inline void release_net(struct net *net)
-{
-	if (net)
-		atomic_dec(&net->use_count);
-}
-#else
-static inline struct net *hold_net(struct net *net)
-{
-	return net;
-}
-
-static inline void release_net(struct net *net)
-{
-}
-#endif
-
 #ifdef CONFIG_NET_NS
 
 static inline void write_pnet(struct net **pnet, struct net *net)
diff --git a/include/net/sock.h b/include/net/sock.h
index d996c633bec2..95b2c1c220f9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2204,7 +2204,7 @@ static inline void sk_change_net(struct sock *sk, struct net *net)
 
 	if (!net_eq(current_net, net)) {
 		put_net(current_net);
-		sock_net_set(sk, hold_net(net));
+		sock_net_set(sk, net);
 	}
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 962ee9d71964..39fe369b46ad 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6841,8 +6841,6 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
-	release_net(dev_net(dev));
-
 	netif_free_tx_queues(dev);
 #ifdef CONFIG_SYSFS
 	kvfree(dev->_rx);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index b55677fed1c8..68ea6950cad1 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -31,7 +31,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 	r->pref = pref;
 	r->table = table;
 	r->flags = flags;
-	r->fr_net = hold_net(ops->fro_net);
+	r->fr_net = ops->fro_net;
 
 	r->suppress_prefixlen = -1;
 	r->suppress_ifgroup = -1;
@@ -116,7 +116,6 @@ static int __fib_rules_register(struct fib_rules_ops *ops)
 		if (ops->family == o->family)
 			goto errout;
 
-	hold_net(net);
 	list_add_tail_rcu(&ops->list, &net->rules_ops);
 	err = 0;
 errout:
@@ -160,15 +159,6 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 	}
 }
 
-static void fib_rules_put_rcu(struct rcu_head *head)
-{
-	struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
-	struct net *net = ops->fro_net;
-
-	release_net(net);
-	kfree(ops);
-}
-
 void fib_rules_unregister(struct fib_rules_ops *ops)
 {
 	struct net *net = ops->fro_net;
@@ -178,7 +168,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
 	fib_rules_cleanup_ops(ops);
 	spin_unlock(&net->rules_mod_lock);
 
-	call_rcu(&ops->rcu, fib_rules_put_rcu);
+	kfree_rcu(ops, rcu);
 }
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
 
@@ -303,7 +293,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 		err = -ENOMEM;
 		goto errout;
 	}
-	rule->fr_net = hold_net(net);
+	rule->fr_net = net;
 
 	if (tb[FRA_PRIORITY])
 		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
@@ -423,7 +413,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	return 0;
 
 errout_free:
-	release_net(rule->fr_net);
 	kfree(rule);
 errout:
 	rules_ops_put(ops);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ad07990e943d..0e8b32efc031 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -591,7 +591,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
 	if (!n)
 		goto out;
 
-	write_pnet(&n->net, hold_net(net));
+	write_pnet(&n->net, net);
 	memcpy(n->key, pkey, key_len);
 	n->dev = dev;
 	if (dev)
@@ -600,7 +600,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
 	if (tbl->pconstructor && tbl->pconstructor(n)) {
 		if (dev)
 			dev_put(dev);
-		release_net(net);
 		kfree(n);
 		n = NULL;
 		goto out;
@@ -634,7 +633,6 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 				tbl->pdestructor(n);
 			if (n->dev)
 				dev_put(n->dev);
-			release_net(pneigh_net(n));
 			kfree(n);
 			return 0;
 		}
@@ -657,7 +655,6 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 					tbl->pdestructor(n);
 				if (n->dev)
 					dev_put(n->dev);
-				release_net(pneigh_net(n));
 				kfree(n);
 				continue;
 			}
@@ -1428,11 +1425,10 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
 		dev_hold(dev);
 		p->dev = dev;
-		write_pnet(&p->net, hold_net(net));
+		write_pnet(&p->net, net);
 		p->sysctl_table = NULL;
 
 		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
-			release_net(net);
 			dev_put(dev);
 			kfree(p);
 			return NULL;
@@ -1472,7 +1468,6 @@ EXPORT_SYMBOL(neigh_parms_release);
 
 static void neigh_parms_destroy(struct neigh_parms *parms)
 {
-	release_net(neigh_parms_net(parms));
 	kfree(parms);
 }
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index cb5290b8c428..e5e96b0f6717 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -236,10 +236,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
 
-#ifdef NETNS_REFCNT_DEBUG
-	atomic_set(&net->use_count, 0);
-#endif
-
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
 		if (error < 0)
@@ -294,13 +290,6 @@ out_free:
 
 static void net_free(struct net *net)
 {
-#ifdef NETNS_REFCNT_DEBUG
-	if (unlikely(atomic_read(&net->use_count) != 0)) {
-		pr_emerg("network namespace not free! Usage: %d\n",
-			 atomic_read(&net->use_count));
-		return;
-	}
-#endif
 	kfree(rcu_access_pointer(net->gen));
 	kmem_cache_free(net_cachep, net);
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index a9a9c2ff9260..c8842f279f7a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1455,7 +1455,6 @@ void sk_release_kernel(struct sock *sk)
 
 	sock_hold(sk);
 	sock_release(sk->sk_socket);
-	release_net(sock_net(sk));
 	sock_net_set(sk, get_net(&init_net));
 	sock_put(sk);
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c6d267442dac..66c1e4fbf884 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
-	release_net(fi->fib_net);
 	if (fi->fib_metrics != (u32 *) dst_default_metrics)
 		kfree(fi->fib_metrics);
 	kfree(fi);
@@ -814,7 +813,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	} else
 		fi->fib_metrics = (u32 *) dst_default_metrics;
 
-	fi->fib_net = hold_net(net);
+	fi->fib_net = net;
 	fi->fib_protocol = cfg->fc_protocol;
 	fi->fib_scope = cfg->fc_scope;
 	fi->fib_flags = cfg->fc_flags;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9111a4e22155..f6a12b97d12b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -61,7 +61,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
-		write_pnet(&tb->ib_net, hold_net(net));
+		write_pnet(&tb->ib_net, net);
 		tb->port      = snum;
 		tb->fastreuse = 0;
 		tb->fastreuseport = 0;
@@ -79,7 +79,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
-		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
 }
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2bd980526631..86ebf020925b 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -98,7 +98,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw)
 #ifdef SOCK_REFCNT_DEBUG
 	pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
 #endif
-	release_net(twsk_net(tw));
 	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
 	module_put(owner);
 }
@@ -196,7 +195,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
-		twsk_net_set(tw, hold_net(sock_net(sk)));
+		twsk_net_set(tw, sock_net(sk));
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
 		 * to a non null value before everything is setup for this
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index e43e79d0a612..59c793040498 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -129,9 +129,6 @@ static const __net_initconst struct ip6addrlbl_init_table
 /* Object management */
 static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p)
 {
-#ifdef CONFIG_NET_NS
-	release_net(p->lbl_net);
-#endif
 	kfree(p);
 }
 
@@ -241,7 +238,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 	newp->label = label;
 	INIT_HLIST_NODE(&newp->list);
 #ifdef CONFIG_NET_NS
-	newp->lbl_net = hold_net(net);
+	newp->lbl_net = net;
 #endif
 	atomic_set(&newp->refcnt, 1);
 	return newp;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index f45d6db50a45..457303886fd4 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -100,7 +100,6 @@ static void fl_free(struct ip6_flowlabel *fl)
 	if (fl) {
 		if (fl->share == IPV6_FL_S_PROCESS)
 			put_pid(fl->owner.pid);
-		release_net(fl->fl_net);
 		kfree(fl->opt);
 		kfree_rcu(fl, rcu);
 	}
@@ -403,7 +402,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 		}
 	}
 
-	fl->fl_net = hold_net(net);
+	fl->fl_net = net;
 	fl->expires = jiffies;
 	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
 	if (err)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5bae7243c577..096c6276e6b9 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -203,7 +203,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 
 	ovs_flow_tbl_destroy(&dp->table);
 	free_percpu(dp->stats_percpu);
-	release_net(ovs_dp_get_net(dp));
 	kfree(dp->ports);
 	kfree(dp);
 }
@@ -1501,7 +1500,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	if (dp == NULL)
 		goto err_free_reply;
 
-	ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
+	ovs_dp_set_net(dp, sock_net(skb->sk));
 
 	/* Allocate table. */
 	err = ovs_flow_tbl_init(&dp->table);
@@ -1575,7 +1574,6 @@ err_destroy_percpu:
 err_destroy_table:
 	ovs_flow_tbl_destroy(&dp->table);
 err_free_dp:
-	release_net(ovs_dp_get_net(dp));
 	kfree(dp);
 err_free_reply:
 	kfree_skb(reply);
-- 
2.2.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH net-next 2/2] net: Introduce possible_net_t
  2015-03-12  4:03                 ` [PATCH net-next 0/2] Introduce possible_net_t Eric W. Biederman
  2015-03-12  4:04                   ` [PATCH net-next 1/2] net: Kill hold_net release_net Eric W. Biederman
@ 2015-03-12  4:06                   ` Eric W. Biederman
  2015-03-12  4:37                     ` Eric Dumazet
  2015-03-12 18:40                   ` [PATCH net-next 0/2] " David Miller
  2 siblings, 1 reply; 22+ messages in thread
From: Eric W. Biederman @ 2015-03-12  4:06 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, salo


Having to say
> #ifdef CONFIG_NET_NS
> 	struct net *net;
> #endif

in structures is a little bit wordy and a little bit error prone.

Instead it is possible to say:
> typedef struct {
> #ifdef CONFIG_NET_NS
>       struct net *net;
> #endif
> } possible_net_t;

And then in a header say:

> 	possible_net_t net;

Which is cleaner and easier to use and easier to test, as the
possible_net_t is always there no matter what the compile options.

Further this allows read_pnet and write_pnet to be functions in all
cases which is better at catching typos.

This change adds possible_net_t, updates the definitions of read_pnet
and write_pnet, updates optional struct net * variables that
write_pnet uses on to have the type possible_net_t, and finally fixes
up the b0rked users of read_pnet and write_pnet.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

When I was testing this caught all three locations Eric Dumazet needed
to add write_pnet as compile errors with network namespaces enabled.

 include/linux/netdevice.h            |  8 ++------
 include/net/cfg80211.h               |  4 +---
 include/net/genetlink.h              |  4 +---
 include/net/inet_hashtables.h        |  4 +---
 include/net/ip_vs.h                  |  8 ++++----
 include/net/neighbour.h              |  8 ++------
 include/net/net_namespace.h          | 23 +++++++++++++----------
 include/net/netfilter/nf_conntrack.h |  5 ++---
 include/net/sock.h                   |  4 +---
 include/net/xfrm.h                   |  8 ++------
 net/9p/trans_fd.c                    |  4 ++--
 net/ipv4/ipmr.c                      |  4 +---
 net/ipv6/addrlabel.c                 |  8 ++------
 net/ipv6/ip6mr.c                     |  4 +---
 net/openvswitch/datapath.h           |  4 +---
 net/packet/internal.h                |  4 +---
 16 files changed, 37 insertions(+), 67 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cede40d9cac9..ddab1a2a07a0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1721,9 +1721,7 @@ struct net_device {
 	struct netpoll_info __rcu	*npinfo;
 #endif
 
-#ifdef CONFIG_NET_NS
-	struct net		*nd_net;
-#endif
+	possible_net_t			nd_net;
 
 	/* mid-layer private */
 	union {
@@ -1863,9 +1861,7 @@ struct net *dev_net(const struct net_device *dev)
 static inline
 void dev_net_set(struct net_device *dev, struct net *net)
 {
-#ifdef CONFIG_NET_NS
-	dev->nd_net = net;
-#endif
+	write_pnet(&dev->nd_net, net);
 }
 
 static inline bool netdev_uses_dsa(struct net_device *dev)
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 64e09e1e8099..f977abec07f6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3183,10 +3183,8 @@ struct wiphy {
 	const struct ieee80211_ht_cap *ht_capa_mod_mask;
 	const struct ieee80211_vht_cap *vht_capa_mod_mask;
 
-#ifdef CONFIG_NET_NS
 	/* the network namespace this phy lives in currently */
-	struct net *_net;
-#endif
+	possible_net_t _net;
 
 #ifdef CONFIG_CFG80211_WEXT
 	const struct iw_handler_def *wext;
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 0574abd3db86..a9af1cc8c1bc 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -92,9 +92,7 @@ struct genl_info {
 	struct genlmsghdr *	genlhdr;
 	void *			userhdr;
 	struct nlattr **	attrs;
-#ifdef CONFIG_NET_NS
-	struct net *		_net;
-#endif
+	possible_net_t		_net;
 	void *			user_ptr[2];
 	struct sock *		dst_sk;
 };
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index dd1950a7e273..bcd64756e5fe 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -76,9 +76,7 @@ struct inet_ehash_bucket {
  * ports are created in O(1) time?  I thought so. ;-)	-DaveM
  */
 struct inet_bind_bucket {
-#ifdef CONFIG_NET_NS
-	struct net		*ib_net;
-#endif
+	possible_net_t		ib_net;
 	unsigned short		port;
 	signed char		fastreuse;
 	signed char		fastreuseport;
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 20fd23398537..4e3731ee4eac 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -47,13 +47,13 @@ static inline struct net *skb_net(const struct sk_buff *skb)
 	 * Start with the most likely hit
 	 * End with BUG
 	 */
-	if (likely(skb->dev && skb->dev->nd_net))
+	if (likely(skb->dev && dev_net(skb->dev)))
 		return dev_net(skb->dev);
 	if (skb_dst(skb) && skb_dst(skb)->dev)
 		return dev_net(skb_dst(skb)->dev);
 	WARN(skb->sk, "Maybe skb_sknet should be used in %s() at line:%d\n",
 		      __func__, __LINE__);
-	if (likely(skb->sk && skb->sk->sk_net))
+	if (likely(skb->sk && sock_net(skb->sk)))
 		return sock_net(skb->sk);
 	pr_err("There is no net ptr to find in the skb in %s() line:%d\n",
 		__func__, __LINE__);
@@ -71,11 +71,11 @@ static inline struct net *skb_sknet(const struct sk_buff *skb)
 #ifdef CONFIG_NET_NS
 #ifdef CONFIG_IP_VS_DEBUG
 	/* Start with the most likely hit */
-	if (likely(skb->sk && skb->sk->sk_net))
+	if (likely(skb->sk && sock_net(skb->sk)))
 		return sock_net(skb->sk);
 	WARN(skb->dev, "Maybe skb_net should be used instead in %s() line:%d\n",
 		       __func__, __LINE__);
-	if (likely(skb->dev && skb->dev->nd_net))
+	if (likely(skb->dev && dev_net(skb->dev)))
 		return dev_net(skb->dev);
 	pr_err("There is no net ptr to find in the skb in %s() line:%d\n",
 		__func__, __LINE__);
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index d48b8ec8b5f4..e7bdf5170802 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -65,9 +65,7 @@ enum {
 };
 
 struct neigh_parms {
-#ifdef CONFIG_NET_NS
-	struct net *net;
-#endif
+	possible_net_t net;
 	struct net_device *dev;
 	struct list_head list;
 	int	(*neigh_setup)(struct neighbour *);
@@ -167,9 +165,7 @@ struct neigh_ops {
 
 struct pneigh_entry {
 	struct pneigh_entry	*next;
-#ifdef CONFIG_NET_NS
-	struct net		*net;
-#endif
+	possible_net_t		net;
 	struct net_device	*dev;
 	u8			flags;
 	u8			key[0];
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fab51ceeabf3..f733656404de 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -231,24 +231,27 @@ int net_eq(const struct net *net1, const struct net *net2)
 #endif
 
 
+typedef struct {
 #ifdef CONFIG_NET_NS
+	struct net *net;
+#endif
+} possible_net_t;
 
-static inline void write_pnet(struct net **pnet, struct net *net)
+static inline void write_pnet(possible_net_t *pnet, struct net *net)
 {
-	*pnet = net;
+#ifdef CONFIG_NET_NS
+	pnet->net = net;
+#endif
 }
 
-static inline struct net *read_pnet(struct net * const *pnet)
+static inline struct net *read_pnet(const possible_net_t *pnet)
 {
-	return *pnet;
-}
-
+#ifdef CONFIG_NET_NS
+	return pnet->net;
 #else
-
-#define write_pnet(pnet, net)	do { (void)(net);} while (0)
-#define read_pnet(pnet)		(&init_net)
-
+	return &init_net;
 #endif
+}
 
 #define for_each_net(VAR)				\
 	list_for_each_entry(VAR, &net_namespace_list, list)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 74f271a172dd..095433b8a8b0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -95,9 +95,8 @@ struct nf_conn {
 	/* Timer function; drops refcnt when it goes off. */
 	struct timer_list timeout;
 
-#ifdef CONFIG_NET_NS
-	struct net *ct_net;
-#endif
+	possible_net_t ct_net;
+
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 95b2c1c220f9..9411c3421dd3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -190,9 +190,7 @@ struct sock_common {
 		struct hlist_nulls_node skc_portaddr_node;
 	};
 	struct proto		*skc_prot;
-#ifdef CONFIG_NET_NS
-	struct net	 	*skc_net;
-#endif
+	possible_net_t		skc_net;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	struct in6_addr		skc_v6_daddr;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dc4865e90fe4..d0ac7d7be8a7 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -126,9 +126,7 @@ struct xfrm_state_walk {
 
 /* Full description of state of transformer. */
 struct xfrm_state {
-#ifdef CONFIG_NET_NS
-	struct net		*xs_net;
-#endif
+	possible_net_t		xs_net;
 	union {
 		struct hlist_node	gclist;
 		struct hlist_node	bydst;
@@ -522,9 +520,7 @@ struct xfrm_policy_queue {
 };
 
 struct xfrm_policy {
-#ifdef CONFIG_NET_NS
-	struct net		*xp_net;
-#endif
+	possible_net_t		xp_net;
 	struct hlist_node	bydst;
 	struct hlist_node	byidx;
 
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 80d08f6664cb..3e3d82d8ff70 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -940,7 +940,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
 	sin_server.sin_family = AF_INET;
 	sin_server.sin_addr.s_addr = in_aton(addr);
 	sin_server.sin_port = htons(opts.port);
-	err = __sock_create(read_pnet(&current->nsproxy->net_ns), PF_INET,
+	err = __sock_create(current->nsproxy->net_ns, PF_INET,
 			    SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
 	if (err) {
 		pr_err("%s (%d): problem creating socket\n",
@@ -988,7 +988,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
 
 	sun_server.sun_family = PF_UNIX;
 	strcpy(sun_server.sun_path, addr);
-	err = __sock_create(read_pnet(&current->nsproxy->net_ns), PF_UNIX,
+	err = __sock_create(current->nsproxy->net_ns, PF_UNIX,
 			    SOCK_STREAM, 0, &csocket, 1);
 	if (err < 0) {
 		pr_err("%s (%d): problem creating socket\n",
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9d78427652d2..5b188832800f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -73,9 +73,7 @@
 
 struct mr_table {
 	struct list_head	list;
-#ifdef CONFIG_NET_NS
-	struct net		*net;
-#endif
+	possible_net_t		net;
 	u32			id;
 	struct sock __rcu	*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 59c793040498..3cc50e2d3bf5 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -29,9 +29,7 @@
  * Policy Table
  */
 struct ip6addrlbl_entry {
-#ifdef CONFIG_NET_NS
-	struct net *lbl_net;
-#endif
+	possible_net_t lbl_net;
 	struct in6_addr prefix;
 	int prefixlen;
 	int ifindex;
@@ -237,9 +235,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 	newp->addrtype = addrtype;
 	newp->label = label;
 	INIT_HLIST_NODE(&newp->list);
-#ifdef CONFIG_NET_NS
-	newp->lbl_net = net;
-#endif
+	write_pnet(&newp->lbl_net, net);
 	atomic_set(&newp->refcnt, 1);
 	return newp;
 }
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 34b682617f50..4b9315aa273e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -56,9 +56,7 @@
 
 struct mr6_table {
 	struct list_head	list;
-#ifdef CONFIG_NET_NS
-	struct net		*net;
-#endif
+	possible_net_t		net;
 	u32			id;
 	struct sock		*mroute6_sk;
 	struct timer_list	ipmr_expire_timer;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 3ece94563079..4ec4a480b147 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -84,10 +84,8 @@ struct datapath {
 	/* Stats. */
 	struct dp_stats_percpu __percpu *stats_percpu;
 
-#ifdef CONFIG_NET_NS
 	/* Network namespace ref. */
-	struct net *net;
-#endif
+	possible_net_t net;
 
 	u32 user_features;
 };
diff --git a/net/packet/internal.h b/net/packet/internal.h
index cdddf6a30399..fe6e20caea1d 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -74,9 +74,7 @@ extern struct mutex fanout_mutex;
 #define PACKET_FANOUT_MAX	256
 
 struct packet_fanout {
-#ifdef CONFIG_NET_NS
-	struct net		*net;
-#endif
+	possible_net_t		net;
 	unsigned int		num_members;
 	u16			id;
 	u8			type;
-- 
2.2.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 1/2] net: Kill hold_net release_net
  2015-03-12  4:04                   ` [PATCH net-next 1/2] net: Kill hold_net release_net Eric W. Biederman
@ 2015-03-12  4:35                     ` Eric Dumazet
  0 siblings, 0 replies; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  4:35 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: David Miller, netdev, salo

On Wed, 2015-03-11 at 23:04 -0500, Eric W. Biederman wrote:
> hold_net and release_net were an idea that turned out to be useless.
> The code has been disabled since 2008.  Kill the code it is long past due.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: Introduce possible_net_t
  2015-03-12  4:06                   ` [PATCH net-next 2/2] net: Introduce possible_net_t Eric W. Biederman
@ 2015-03-12  4:37                     ` Eric Dumazet
  2015-03-12 16:33                       ` Eric W. Biederman
  0 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2015-03-12  4:37 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: David Miller, netdev, salo

On Wed, 2015-03-11 at 23:06 -0500, Eric W. Biederman wrote:
> Having to say
> > #ifdef CONFIG_NET_NS
> > 	struct net *net;
> > #endif
> 
> in structures is a little bit wordy and a little bit error prone.
> 
> Instead it is possible to say:
> > typedef struct {
> > #ifdef CONFIG_NET_NS
> >       struct net *net;
> > #endif
> > } possible_net_t;
> 
> And then in a header say:
> 
> > 	possible_net_t net;
> 
> Which is cleaner and easier to use and easier to test, as the
> possible_net_t is always there no matter what the compile options.
> 
> Further this allows read_pnet and write_pnet to be functions in all
> cases which is better at catching typos.
> 
> This change adds possible_net_t, updates the definitions of read_pnet
> and write_pnet, updates optional struct net * variables that
> write_pnet uses on to have the type possible_net_t, and finally fixes
> up the b0rked users of read_pnet and write_pnet.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> When I was testing this caught all three locations Eric Dumazet needed
> to add write_pnet as compile errors with network namespaces enabled.


Acked-by: Eric Dumazet <edumazet@google.com>

Only concern is the typedef possible_net_t : It seems number of uses
should be rather small, and we probably could use the "struct
possible_net" instead, to please Linus ;)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 2/2] net: Introduce possible_net_t
  2015-03-12  4:37                     ` Eric Dumazet
@ 2015-03-12 16:33                       ` Eric W. Biederman
  0 siblings, 0 replies; 22+ messages in thread
From: Eric W. Biederman @ 2015-03-12 16:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, salo

Eric Dumazet <eric.dumazet@gmail.com> writes:

> On Wed, 2015-03-11 at 23:06 -0500, Eric W. Biederman wrote:
>> Having to say
>> > #ifdef CONFIG_NET_NS
>> > 	struct net *net;
>> > #endif
>> 
>> in structures is a little bit wordy and a little bit error prone.
>> 
>> Instead it is possible to say:
>> > typedef struct {
>> > #ifdef CONFIG_NET_NS
>> >       struct net *net;
>> > #endif
>> > } possible_net_t;
>> 
>> And then in a header say:
>> 
>> > 	possible_net_t net;
>> 
>> Which is cleaner and easier to use and easier to test, as the
>> possible_net_t is always there no matter what the compile options.
>> 
>> Further this allows read_pnet and write_pnet to be functions in all
>> cases which is better at catching typos.
>> 
>> This change adds possible_net_t, updates the definitions of read_pnet
>> and write_pnet, updates optional struct net * variables that
>> write_pnet uses on to have the type possible_net_t, and finally fixes
>> up the b0rked users of read_pnet and write_pnet.
>> 
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>> 
>> When I was testing this caught all three locations Eric Dumazet needed
>> to add write_pnet as compile errors with network namespaces enabled.
>
>
> Acked-by: Eric Dumazet <edumazet@google.com>
>
> Only concern is the typedef possible_net_t : It seems number of uses
> should be rather small, and we probably could use the "struct
> possible_net" instead, to please Linus ;)

possible_net_t is a machine word size type so it falls into the
exceptions for typedefs in those cases.   It won't fool you about
how big it is, or how costly it is to use.

Years ago I remember passing a similar type to functions and watching
how even the function arguments boiled away at compile time, when
network namespaces were disabled.

Eric

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next 0/2] Introduce possible_net_t
  2015-03-12  4:03                 ` [PATCH net-next 0/2] Introduce possible_net_t Eric W. Biederman
  2015-03-12  4:04                   ` [PATCH net-next 1/2] net: Kill hold_net release_net Eric W. Biederman
  2015-03-12  4:06                   ` [PATCH net-next 2/2] net: Introduce possible_net_t Eric W. Biederman
@ 2015-03-12 18:40                   ` David Miller
  2 siblings, 0 replies; 22+ messages in thread
From: David Miller @ 2015-03-12 18:40 UTC (permalink / raw)
  To: ebiederm; +Cc: eric.dumazet, netdev, salo

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 11 Mar 2015 23:03:13 -0500

> The current usage of write_pnet and read_pnet is a little laborious and
> error prone as you only notice if you failed to include them if are
> compiling with network namespaces enabled.
> 
> possible_net_t remedies that by using a type that is 0 bytes when
> network namespaces are disabled and can only be read and written to with
> read_pnet and write_pnet.
> 
> Aka less work and safer for the same effect.
> 
> I kill hold_net and release_net first as are they are haven't been used
> since 2008 and are noise at the points where write_pnet and read_pnet
> are used.
> 
> I have folded in Eric Dumazets suggestions to improve the killing of
> hold_net and release net.  And respon.  I had to respin anyway as
> there was enough changes elsewhere in the tree the previous version
> of these patches did not quite apply cleanly.

Series applied, thanks!

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2015-03-12 18:40 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-10 22:45 [PATCH net-next] net: add real socket cookies Eric Dumazet
2015-03-11 21:53 ` David Miller
2015-03-11 22:30   ` Eric Dumazet
2015-03-11 22:56     ` [PATCH v2 " Eric Dumazet
2015-03-12  1:53       ` [PATCH v3 " Eric Dumazet
2015-03-12  1:55         ` David Miller
2015-03-12  2:23           ` Eric Dumazet
2015-03-12  2:54           ` Eric Dumazet
2015-03-12  3:21             ` Eric Dumazet
2015-03-12  3:27             ` [PATCH net-next] net: fix CONFIG_NET_NS=n compilation Eric Dumazet
2015-03-12  3:29               ` David Miller
2015-03-12  4:03                 ` [PATCH net-next 0/2] Introduce possible_net_t Eric W. Biederman
2015-03-12  4:04                   ` [PATCH net-next 1/2] net: Kill hold_net release_net Eric W. Biederman
2015-03-12  4:35                     ` Eric Dumazet
2015-03-12  4:06                   ` [PATCH net-next 2/2] net: Introduce possible_net_t Eric W. Biederman
2015-03-12  4:37                     ` Eric Dumazet
2015-03-12 16:33                       ` Eric W. Biederman
2015-03-12 18:40                   ` [PATCH net-next 0/2] " David Miller
2015-03-12  3:29             ` [PATCH v3 net-next] net: add real socket cookies Eric W. Biederman
2015-03-12  1:21     ` [PATCH " David Miller
2015-03-12  1:31       ` Eric Dumazet
2015-03-12  1:54         ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.