stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [GIT] Networking
@ 2019-11-05  1:51 David Miller
  2019-11-06 15:43 ` Greg KH
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2019-11-05  1:51 UTC (permalink / raw)
  To: stable

[-- Attachment #1: Type: Text/Plain, Size: 103 bytes --]


Please queue up the following networking bug fixes for v4.19 and
v5.3 -stable, respectively.

Thanks!

[-- Attachment #2: net_419.mbox --]
[-- Type: Application/Octet-Stream, Size: 131466 bytes --]

From 0e1aa9d0b36ab7200081249f4464023329eec7ae Mon Sep 17 00:00:00 2001
From: Vishal Kulkarni <vishal@chelsio.com>
Date: Wed, 30 Oct 2019 20:17:57 +0530
Subject: [PATCH 01/36] cxgb4: fix panic when attaching to ULD fail

[ Upstream commit fc89cc358fb64e2429aeae0f37906126636507ec ]

Release resources when attaching to ULD fail. Otherwise, data
mismatch is seen between LLD and ULD later on, which lead to
kernel panic when accessing resources that should not even
exist in the first place.

Fixes: 94cdb8bb993a ("cxgb4: Add support for dynamic allocation of resources for ULD")
Signed-off-by: Shahjada Abul Husain <shahjada@chelsio.com>
Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/chelsio/cxgb4/cxgb4_uld.c    | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index dba8a0c1eda3..3d834d40e81e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -673,10 +673,10 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld)
 	lld->write_cmpl_support = adap->params.write_cmpl_support;
 }
 
-static void uld_attach(struct adapter *adap, unsigned int uld)
+static int uld_attach(struct adapter *adap, unsigned int uld)
 {
-	void *handle;
 	struct cxgb4_lld_info lli;
+	void *handle;
 
 	uld_init(adap, &lli);
 	uld_queue_init(adap, uld, &lli);
@@ -686,7 +686,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 		dev_warn(adap->pdev_dev,
 			 "could not attach to the %s driver, error %ld\n",
 			 adap->uld[uld].name, PTR_ERR(handle));
-		return;
+		return PTR_ERR(handle);
 	}
 
 	adap->uld[uld].handle = handle;
@@ -694,23 +694,24 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 
 	if (adap->flags & FULL_INIT_DONE)
 		adap->uld[uld].state_change(handle, CXGB4_STATE_UP);
+
+	return 0;
 }
 
-/**
- *	cxgb4_register_uld - register an upper-layer driver
- *	@type: the ULD type
- *	@p: the ULD methods
+/* cxgb4_register_uld - register an upper-layer driver
+ * @type: the ULD type
+ * @p: the ULD methods
  *
- *	Registers an upper-layer driver with this driver and notifies the ULD
- *	about any presently available devices that support its type.  Returns
- *	%-EBUSY if a ULD of the same type is already registered.
+ * Registers an upper-layer driver with this driver and notifies the ULD
+ * about any presently available devices that support its type.  Returns
+ * %-EBUSY if a ULD of the same type is already registered.
  */
 int cxgb4_register_uld(enum cxgb4_uld type,
 		       const struct cxgb4_uld_info *p)
 {
-	int ret = 0;
 	unsigned int adap_idx = 0;
 	struct adapter *adap;
+	int ret = 0;
 
 	if (type >= CXGB4_ULD_MAX)
 		return -EINVAL;
@@ -744,12 +745,16 @@ int cxgb4_register_uld(enum cxgb4_uld type,
 		if (ret)
 			goto free_irq;
 		adap->uld[type] = *p;
-		uld_attach(adap, type);
+		ret = uld_attach(adap, type);
+		if (ret)
+			goto free_txq;
 		adap_idx++;
 	}
 	mutex_unlock(&uld_mutex);
 	return 0;
 
+free_txq:
+	release_sge_txq_uld(adap, type);
 free_irq:
 	if (adap->flags & FULL_INIT_DONE)
 		quiesce_rx_uld(adap, type);
-- 
2.20.1


From 6f7b1c378abbae73524d231e2789ee02e5bc6d96 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Nov 2019 07:57:55 -0800
Subject: [PATCH 02/36] dccp: do not leak jiffies on the wire

[ Upstream commit 3d1e5039f5f87a8731202ceca08764ee7cb010d3 ]

For some reason I missed the case of DCCP passive
flows in my previous patch.

Fixes: a904a0693c18 ("inet: stop leaking jiffies on the wire")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Thiemo Nagel <tnagel@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8e08cea6f178..17afa03cab3a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -417,7 +417,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 	RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
 	newinet->mc_index  = inet_iif(skb);
 	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
-	newinet->inet_id   = jiffies;
+	newinet->inet_id   = prandom_u32();
 
 	if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
 		goto put_and_exit;
-- 
2.20.1


From 61dc1783c98cf6496688ebe006ed8fbc8ba89bec Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 28 Oct 2019 23:19:35 +0800
Subject: [PATCH 03/36] erspan: fix the tun_info options_len check for erspan

[ Upstream commit 2eb8d6d2910cfe3dc67dc056f26f3dd9c63d47cd ]

The check for !md doens't really work for ip_tunnel_info_opts(info) which
only does info + 1. Also to avoid out-of-bounds access on info, it should
ensure options_len is not less than erspan_metadata in both erspan_xmit()
and ip6erspan_tunnel_xmit().

Fixes: 1a66a836da ("gre: add collect_md mode to ERSPAN tunnel")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 4 ++--
 net/ipv6/ip6_gre.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0b87558f265e..758a0f86d499 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -589,9 +589,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	key = &tun_info->key;
 	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 		goto err_free_rt;
+	if (tun_info->options_len < sizeof(*md))
+ 		goto err_free_rt;
 	md = ip_tunnel_info_opts(tun_info);
-	if (!md)
-		goto err_free_rt;
 
 	/* ERSPAN has fixed 8 byte GRE header */
 	version = md->version;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index a23516e22056..dee4113f21a9 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1000,9 +1000,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		dsfield = key->tos;
 		if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 			goto tx_err;
-		md = ip_tunnel_info_opts(tun_info);
-		if (!md)
+		if (tun_info->options_len < sizeof(*md))
 			goto tx_err;
+		md = ip_tunnel_info_opts(tun_info);
 
 		tun_id = tunnel_id_to_key32(key->tun_id);
 		if (md->version == 1) {
-- 
2.20.1


From e877abb18a2a2b5021fded38622b248581e0dc07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Nov 2019 10:32:19 -0700
Subject: [PATCH 04/36] inet: stop leaking jiffies on the wire

[ Upstream commit a904a0693c189691eeee64f6c6b188bd7dc244e9 ]

Historically linux tried to stick to RFC 791, 1122, 2003
for IPv4 ID field generation.

RFC 6864 made clear that no matter how hard we try,
we can not ensure unicity of IP ID within maximum
lifetime for all datagrams with a given source
address/destination address/protocol tuple.

Linux uses a per socket inet generator (inet_id), initialized
at connection startup with a XOR of 'jiffies' and other
fields that appear clear on the wire.

Thiemo Nagel pointed that this strategy is a privacy
concern as this provides 16 bits of entropy to fingerprint
devices.

Let's switch to a random starting point, this is just as
good as far as RFC 6864 is concerned and does not leak
anything critical.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Thiemo Nagel <tnagel@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +-
 net/dccp/ipv4.c                         | 2 +-
 net/ipv4/datagram.c                     | 2 +-
 net/ipv4/tcp_ipv4.c                     | 4 ++--
 net/sctp/socket.c                       | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
index 0997e166ea57..8b749c721c87 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.c
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c
@@ -1276,7 +1276,7 @@ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
 	tp->write_seq = snd_isn;
 	tp->snd_nxt = snd_isn;
 	tp->snd_una = snd_isn;
-	inet_sk(sk)->inet_id = tp->write_seq ^ jiffies;
+	inet_sk(sk)->inet_id = prandom_u32();
 	assign_rxopt(sk, opt);
 
 	if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10))
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 17afa03cab3a..176bddacc16e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -121,7 +121,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 						    inet->inet_daddr,
 						    inet->inet_sport,
 						    inet->inet_dport);
-	inet->inet_id = dp->dccps_iss ^ jiffies;
+	inet->inet_id = prandom_u32();
 
 	err = dccp_connect(sk);
 	rt = NULL;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 80107a6a2c4a..dc8517c2cdc7 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -77,7 +77,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	reuseport_has_conns(sk, true);
 	sk->sk_state = TCP_ESTABLISHED;
 	sk_set_txhash(sk);
-	inet->inet_id = jiffies;
+	inet->inet_id = prandom_u32();
 
 	sk_dst_set(sk, &rt->dst);
 	err = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b76cf96d5cfe..bfec48849735 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -305,7 +305,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 						 inet->inet_daddr);
 	}
 
-	inet->inet_id = tp->write_seq ^ jiffies;
+	inet->inet_id = prandom_u32();
 
 	if (tcp_fastopen_defer_connect(sk, &err))
 		return err;
@@ -1436,7 +1436,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (inet_opt)
 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
-	newinet->inet_id = newtp->write_seq ^ jiffies;
+	newinet->inet_id = prandom_u32();
 
 	if (!dst) {
 		dst = inet_csk_route_child_sock(sk, newsk, req);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 227b050cfe45..0ab339a42f99 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8777,7 +8777,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
 	newinet->inet_dport = htons(asoc->peer.port);
 	newinet->pmtudisc = inet->pmtudisc;
-	newinet->inet_id = asoc->next_tsn ^ jiffies;
+	newinet->inet_id = prandom_u32();
 
 	newinet->uc_ttl = inet->uc_ttl;
 	newinet->mc_loop = 1;
-- 
2.20.1


From 21d42af6f0abd5ca0e6f5e9148d8b4cb9ec5aa50 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 30 Oct 2019 13:00:04 -0700
Subject: [PATCH 05/36] net: annotate accesses to sk->sk_incoming_cpu

[ Upstream commit 7170a977743b72cf3eb46ef6ef89885dc7ad3621 ]

This socket field can be read and written by concurrent cpus.

Use READ_ONCE() and WRITE_ONCE() annotations to document this,
and avoid some compiler 'optimizations'.

KCSAN reported :

BUG: KCSAN: data-race in tcp_v4_rcv / tcp_v4_rcv

write to 0xffff88812220763c of 4 bytes by interrupt on cpu 0:
 sk_incoming_cpu_update include/net/sock.h:953 [inline]
 tcp_v4_rcv+0x1b3c/0x1bb0 net/ipv4/tcp_ipv4.c:1934
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082
 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337
 do_softirq kernel/softirq.c:329 [inline]
 __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189

read to 0xffff88812220763c of 4 bytes by interrupt on cpu 1:
 sk_incoming_cpu_update include/net/sock.h:952 [inline]
 tcp_v4_rcv+0x181a/0x1bb0 net/ipv4/tcp_ipv4.c:1934
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603
 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h          | 4 ++--
 net/core/sock.c             | 4 ++--
 net/ipv4/inet_hashtables.c  | 2 +-
 net/ipv4/udp.c              | 2 +-
 net/ipv6/inet6_hashtables.c | 2 +-
 net/ipv6/udp.c              | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 1ece7736c49c..51a79443388b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -945,8 +945,8 @@ static inline void sk_incoming_cpu_update(struct sock *sk)
 {
 	int cpu = raw_smp_processor_id();
 
-	if (unlikely(sk->sk_incoming_cpu != cpu))
-		sk->sk_incoming_cpu = cpu;
+	if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu))
+		WRITE_ONCE(sk->sk_incoming_cpu, cpu);
 }
 
 static inline void sock_rps_record_flow_hash(__u32 hash)
diff --git a/net/core/sock.c b/net/core/sock.c
index f881eea1c4a4..5f652dbe66a2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1005,7 +1005,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_INCOMING_CPU:
-		sk->sk_incoming_cpu = val;
+		WRITE_ONCE(sk->sk_incoming_cpu, val);
 		break;
 
 	case SO_CNX_ADVICE:
@@ -1341,7 +1341,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_INCOMING_CPU:
-		v.val = sk->sk_incoming_cpu;
+		v.val = READ_ONCE(sk->sk_incoming_cpu);
 		break;
 
 	case SO_MEMINFO:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f5c9ef2586de..7be966a60801 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -248,7 +248,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			if (sk->sk_bound_dev_if)
 				score += 4;
 		}
-		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 			score++;
 	}
 	return score;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa59acc8ee0e..ec2ab82d0d4b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -408,7 +408,7 @@ static int compute_score(struct sock *sk, struct net *net,
 			score += 4;
 	}
 
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 		score++;
 	return score;
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..91d6ea937ffb 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -118,7 +118,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			if (sk->sk_bound_dev_if)
 				score++;
 		}
-		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 			score++;
 	}
 	return score;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d1c59cb6dceb..1979922bcf67 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -154,7 +154,7 @@ static int compute_score(struct sock *sk, struct net *net,
 			score++;
 	}
 
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 		score++;
 
 	return score;
-- 
2.20.1


From 759029210a48f1901ea670a8c5c00f40899699c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Oct 2019 10:54:44 -0700
Subject: [PATCH 06/36] net: annotate lockless accesses to sk->sk_napi_id

[ Upstream commit ee8d153d46a3b98c064ee15c0c0a3bbf1450e5a1 ]

We already annotated most accesses to sk->sk_napi_id

We missed sk_mark_napi_id() and sk_mark_napi_id_once()
which might be called without socket lock held in UDP stack.

KCSAN reported :
BUG: KCSAN: data-race in udpv6_queue_rcv_one_skb / udpv6_queue_rcv_one_skb

write to 0xffff888121c6d108 of 4 bytes by interrupt on cpu 0:
 sk_mark_napi_id include/net/busy_poll.h:125 [inline]
 __udpv6_queue_rcv_skb net/ipv6/udp.c:571 [inline]
 udpv6_queue_rcv_one_skb+0x70c/0xb40 net/ipv6/udp.c:672
 udpv6_queue_rcv_skb+0xb5/0x400 net/ipv6/udp.c:689
 udp6_unicast_rcv_skb.isra.0+0xd7/0x180 net/ipv6/udp.c:832
 __udp6_lib_rcv+0x69c/0x1770 net/ipv6/udp.c:913
 udpv6_rcv+0x2b/0x40 net/ipv6/udp.c:1015
 ip6_protocol_deliver_rcu+0x22a/0xbe0 net/ipv6/ip6_input.c:409
 ip6_input_finish+0x30/0x50 net/ipv6/ip6_input.c:450
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip6_input+0x177/0x190 net/ipv6/ip6_input.c:459
 dst_input include/net/dst.h:442 [inline]
 ip6_rcv_finish+0x110/0x140 net/ipv6/ip6_input.c:76
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ipv6_rcv+0x1a1/0x1b0 net/ipv6/ip6_input.c:284
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460

write to 0xffff888121c6d108 of 4 bytes by interrupt on cpu 1:
 sk_mark_napi_id include/net/busy_poll.h:125 [inline]
 __udpv6_queue_rcv_skb net/ipv6/udp.c:571 [inline]
 udpv6_queue_rcv_one_skb+0x70c/0xb40 net/ipv6/udp.c:672
 udpv6_queue_rcv_skb+0xb5/0x400 net/ipv6/udp.c:689
 udp6_unicast_rcv_skb.isra.0+0xd7/0x180 net/ipv6/udp.c:832
 __udp6_lib_rcv+0x69c/0x1770 net/ipv6/udp.c:913
 udpv6_rcv+0x2b/0x40 net/ipv6/udp.c:1015
 ip6_protocol_deliver_rcu+0x22a/0xbe0 net/ipv6/ip6_input.c:409
 ip6_input_finish+0x30/0x50 net/ipv6/ip6_input.c:450
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip6_input+0x177/0x190 net/ipv6/ip6_input.c:459
 dst_input include/net/dst.h:442 [inline]
 ip6_rcv_finish+0x110/0x140 net/ipv6/ip6_input.c:76
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ipv6_rcv+0x1a1/0x1b0 net/ipv6/ip6_input.c:284
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 10890 Comm: syz-executor.0 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: e68b6e50fa35 ("udp: enable busy polling for all sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index ba61cdd09eaa..cf8f792743ec 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -134,7 +134,7 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	sk->sk_napi_id = skb->napi_id;
+	WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
 	sk_rx_queue_set(sk, skb);
 }
@@ -144,8 +144,8 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	if (!sk->sk_napi_id)
-		sk->sk_napi_id = skb->napi_id;
+	if (!READ_ONCE(sk->sk_napi_id))
+		WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
 }
 
-- 
2.20.1


From e8678b051d8138fdd87ca6063132a7b093d09112 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 31 Oct 2019 15:54:05 -0700
Subject: [PATCH 07/36] net: dsa: bcm_sf2: Fix IMP setup for port different
 than 8

[ Upstream commit 5fc0f21246e50afdf318b5a3a941f7f4f57b8947 ]

Since it became possible for the DSA core to use a CPU port different
than 8, our bcm_sf2_imp_setup() function was broken because it assumes
that registers are applicable to port 8. In particular, the port's MAC
is going to stay disabled, so make sure we clear the RX_DIS and TX_DIS
bits if we are not configured for port 8.

Fixes: 9f91484f6fcc ("net: dsa: make "label" property optional for dsa2")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 2fa2caf7a746..ca3655d28e00 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -41,22 +41,11 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 	unsigned int i;
 	u32 reg, offset;
 
-	if (priv->type == BCM7445_DEVICE_ID)
-		offset = CORE_STS_OVERRIDE_IMP;
-	else
-		offset = CORE_STS_OVERRIDE_IMP2;
-
 	/* Enable the port memories */
 	reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL);
 	reg &= ~P_TXQ_PSM_VDD(port);
 	core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL);
 
-	/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
-	reg = core_readl(priv, CORE_IMP_CTL);
-	reg |= (RX_BCST_EN | RX_MCST_EN | RX_UCST_EN);
-	reg &= ~(RX_DIS | TX_DIS);
-	core_writel(priv, reg, CORE_IMP_CTL);
-
 	/* Enable forwarding */
 	core_writel(priv, SW_FWDG_EN, CORE_SWMODE);
 
@@ -75,10 +64,27 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 
 	b53_brcm_hdr_setup(ds, port);
 
-	/* Force link status for IMP port */
-	reg = core_readl(priv, offset);
-	reg |= (MII_SW_OR | LINK_STS);
-	core_writel(priv, reg, offset);
+	if (port == 8) {
+		if (priv->type == BCM7445_DEVICE_ID)
+			offset = CORE_STS_OVERRIDE_IMP;
+		else
+			offset = CORE_STS_OVERRIDE_IMP2;
+
+		/* Force link status for IMP port */
+		reg = core_readl(priv, offset);
+		reg |= (MII_SW_OR | LINK_STS);
+		core_writel(priv, reg, offset);
+
+		/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
+		reg = core_readl(priv, CORE_IMP_CTL);
+		reg |= (RX_BCST_EN | RX_MCST_EN | RX_UCST_EN);
+		reg &= ~(RX_DIS | TX_DIS);
+		core_writel(priv, reg, CORE_IMP_CTL);
+	} else {
+		reg = core_readl(priv, CORE_G_PCTL_PORT(port));
+		reg &= ~(RX_DIS | TX_DIS);
+		core_writel(priv, reg, CORE_G_PCTL_PORT(port));
+	}
 }
 
 static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
-- 
2.20.1


From 66d112a28b59056e02736de6304846664196b624 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 25 Oct 2019 13:47:24 +1100
Subject: [PATCH 08/36] net: ethernet: ftgmac100: Fix DMA coherency issue with
 SW checksum

[ Upstream commit 88824e3bf29a2fcacfd9ebbfe03063649f0f3254 ]

We are calling the checksum helper after the dma_map_single()
call to map the packet. This is incorrect as the checksumming
code will touch the packet from the CPU. This means the cache
won't be properly flushes (or the bounce buffering will leave
us with the unmodified packet to DMA).

This moves the calculation of the checksum & vlan tags to
before the DMA mapping.

This also has the side effect of fixing another bug: If the
checksum helper fails, we goto "drop" to drop the packet, which
will not unmap the DMA mapping.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Fixes: 05690d633f30 ("ftgmac100: Upgrade to NETIF_F_HW_CSUM")
Reviewed-by: Vijay Khemka <vijaykhemka@fb.com>
Tested-by: Vijay Khemka <vijaykhemka@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/faraday/ftgmac100.c | 25 ++++++++++++------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index ed6c76d20b45..f6ed889bc36a 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -739,6 +739,18 @@ static int ftgmac100_hard_start_xmit(struct sk_buff *skb,
 	 */
 	nfrags = skb_shinfo(skb)->nr_frags;
 
+	/* Setup HW checksumming */
+	csum_vlan = 0;
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    !ftgmac100_prep_tx_csum(skb, &csum_vlan))
+		goto drop;
+
+	/* Add VLAN tag */
+	if (skb_vlan_tag_present(skb)) {
+		csum_vlan |= FTGMAC100_TXDES1_INS_VLANTAG;
+		csum_vlan |= skb_vlan_tag_get(skb) & 0xffff;
+	}
+
 	/* Get header len */
 	len = skb_headlen(skb);
 
@@ -765,19 +777,6 @@ static int ftgmac100_hard_start_xmit(struct sk_buff *skb,
 	if (nfrags == 0)
 		f_ctl_stat |= FTGMAC100_TXDES0_LTS;
 	txdes->txdes3 = cpu_to_le32(map);
-
-	/* Setup HW checksumming */
-	csum_vlan = 0;
-	if (skb->ip_summed == CHECKSUM_PARTIAL &&
-	    !ftgmac100_prep_tx_csum(skb, &csum_vlan))
-		goto drop;
-
-	/* Add VLAN tag */
-	if (skb_vlan_tag_present(skb)) {
-		csum_vlan |= FTGMAC100_TXDES1_INS_VLANTAG;
-		csum_vlan |= skb_vlan_tag_get(skb) & 0xffff;
-	}
-
 	txdes->txdes1 = cpu_to_le32(csum_vlan);
 
 	/* Next descriptor */
-- 
2.20.1


From e5aeeab9055d117c0c9f7766076191af846169b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2019 13:50:27 -0700
Subject: [PATCH 09/36] net: fix sk_page_frag() recursion from memory reclaim

[ Upstream commit 20eb4f29b60286e0d6dc01d9c260b4bd383c58fb ]

sk_page_frag() optimizes skb_frag allocations by using per-task
skb_frag cache when it knows it's the only user.  The condition is
determined by seeing whether the socket allocation mask allows
blocking - if the allocation may block, it obviously owns the task's
context and ergo exclusively owns current->task_frag.

Unfortunately, this misses recursion through memory reclaim path.
Please take a look at the following backtrace.

 [2] RIP: 0010:tcp_sendmsg_locked+0xccf/0xe10
     ...
     tcp_sendmsg+0x27/0x40
     sock_sendmsg+0x30/0x40
     sock_xmit.isra.24+0xa1/0x170 [nbd]
     nbd_send_cmd+0x1d2/0x690 [nbd]
     nbd_queue_rq+0x1b5/0x3b0 [nbd]
     __blk_mq_try_issue_directly+0x108/0x1b0
     blk_mq_request_issue_directly+0xbd/0xe0
     blk_mq_try_issue_list_directly+0x41/0xb0
     blk_mq_sched_insert_requests+0xa2/0xe0
     blk_mq_flush_plug_list+0x205/0x2a0
     blk_flush_plug_list+0xc3/0xf0
 [1] blk_finish_plug+0x21/0x2e
     _xfs_buf_ioapply+0x313/0x460
     __xfs_buf_submit+0x67/0x220
     xfs_buf_read_map+0x113/0x1a0
     xfs_trans_read_buf_map+0xbf/0x330
     xfs_btree_read_buf_block.constprop.42+0x95/0xd0
     xfs_btree_lookup_get_block+0x95/0x170
     xfs_btree_lookup+0xcc/0x470
     xfs_bmap_del_extent_real+0x254/0x9a0
     __xfs_bunmapi+0x45c/0xab0
     xfs_bunmapi+0x15/0x30
     xfs_itruncate_extents_flags+0xca/0x250
     xfs_free_eofblocks+0x181/0x1e0
     xfs_fs_destroy_inode+0xa8/0x1b0
     destroy_inode+0x38/0x70
     dispose_list+0x35/0x50
     prune_icache_sb+0x52/0x70
     super_cache_scan+0x120/0x1a0
     do_shrink_slab+0x120/0x290
     shrink_slab+0x216/0x2b0
     shrink_node+0x1b6/0x4a0
     do_try_to_free_pages+0xc6/0x370
     try_to_free_mem_cgroup_pages+0xe3/0x1e0
     try_charge+0x29e/0x790
     mem_cgroup_charge_skmem+0x6a/0x100
     __sk_mem_raise_allocated+0x18e/0x390
     __sk_mem_schedule+0x2a/0x40
 [0] tcp_sendmsg_locked+0x8eb/0xe10
     tcp_sendmsg+0x27/0x40
     sock_sendmsg+0x30/0x40
     ___sys_sendmsg+0x26d/0x2b0
     __sys_sendmsg+0x57/0xa0
     do_syscall_64+0x42/0x100
     entry_SYSCALL_64_after_hwframe+0x44/0xa9

In [0], tcp_send_msg_locked() was using current->page_frag when it
called sk_wmem_schedule().  It already calculated how many bytes can
be fit into current->page_frag.  Due to memory pressure,
sk_wmem_schedule() called into memory reclaim path which called into
xfs and then IO issue path.  Because the filesystem in question is
backed by nbd, the control goes back into the tcp layer - back into
tcp_sendmsg_locked().

nbd sets sk_allocation to (GFP_NOIO | __GFP_MEMALLOC) which makes
sense - it's in the process of freeing memory and wants to be able to,
e.g., drop clean pages to make forward progress.  However, this
confused sk_page_frag() called from [2].  Because it only tests
whether the allocation allows blocking which it does, it now thinks
current->page_frag can be used again although it already was being
used in [0].

After [2] used current->page_frag, the offset would be increased by
the used amount.  When the control returns to [0],
current->page_frag's offset is increased and the previously calculated
number of bytes now may overrun the end of allocated memory leading to
silent memory corruptions.

Fix it by adding gfpflags_normal_context() which tests sleepable &&
!reclaim and use it to determine whether to use current->task_frag.

v2: Eric didn't like gfp flags being tested twice.  Introduce a new
    helper gfpflags_normal_context() and combine the two tests.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/gfp.h | 23 +++++++++++++++++++++++
 include/net/sock.h  | 11 ++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 24bcc5eec6b4..f78d1e89593f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 	return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
 }
 
+/**
+ * gfpflags_normal_context - is gfp_flags a normal sleepable context?
+ * @gfp_flags: gfp_flags to test
+ *
+ * Test whether @gfp_flags indicates that the allocation is from the
+ * %current context and allowed to sleep.
+ *
+ * An allocation being allowed to block doesn't mean it owns the %current
+ * context.  When direct reclaim path tries to allocate memory, the
+ * allocation context is nested inside whatever %current was doing at the
+ * time of the original allocation.  The nested allocation may be allowed
+ * to block but modifying anything %current owns can corrupt the outer
+ * context's expectations.
+ *
+ * %true result from this function indicates that the allocation context
+ * can sleep and use anything that's associated with %current.
+ */
+static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
+{
+	return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) ==
+		__GFP_DIRECT_RECLAIM;
+}
+
 #ifdef CONFIG_HIGHMEM
 #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
 #else
diff --git a/include/net/sock.h b/include/net/sock.h
index 51a79443388b..05e8faa84717 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2216,12 +2216,17 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
  * sk_page_frag - return an appropriate page_frag
  * @sk: socket
  *
- * If socket allocation mode allows current thread to sleep, it means its
- * safe to use the per task page_frag instead of the per socket one.
+ * Use the per task page_frag instead of the per socket one for
+ * optimization when we know that we're in the normal context and owns
+ * everything that's associated with %current.
+ *
+ * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
+ * inside other socket operations and end up recursing into sk_page_frag()
+ * while it's already in use.
  */
 static inline struct page_frag *sk_page_frag(struct sock *sk)
 {
-	if (gfpflags_allow_blocking(sk->sk_allocation))
+	if (gfpflags_normal_context(sk->sk_allocation))
 		return &current->task_frag;
 
 	return &sk->sk_frag;
-- 
2.20.1


From 131e7ca5ab3d62a6a94bfb208b3858450e37d914 Mon Sep 17 00:00:00 2001
From: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
Date: Mon, 28 Oct 2019 13:09:46 +0800
Subject: [PATCH 10/36] net: hisilicon: Fix ping latency when deal with high
 throughput

[ Upstream commit e56bd641ca61beb92b135298d5046905f920b734 ]

This is due to error in over budget processing.
When dealing with high throughput, the used buffers
that exceeds the budget is not cleaned up. In addition,
it takes a lot of cycles to clean up the used buffer,
and then the buffer where the valid data is located can take effect.

Signed-off-by: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hip04_eth.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index a91d49dd92ea..2f8f03e0db81 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -174,6 +174,7 @@ struct hip04_priv {
 	dma_addr_t rx_phys[RX_DESC_NUM];
 	unsigned int rx_head;
 	unsigned int rx_buf_size;
+	unsigned int rx_cnt_remaining;
 
 	struct device_node *phy_node;
 	struct phy_device *phy;
@@ -487,7 +488,6 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 	struct hip04_priv *priv = container_of(napi, struct hip04_priv, napi);
 	struct net_device *ndev = priv->ndev;
 	struct net_device_stats *stats = &ndev->stats;
-	unsigned int cnt = hip04_recv_cnt(priv);
 	struct rx_desc *desc;
 	struct sk_buff *skb;
 	unsigned char *buf;
@@ -500,8 +500,8 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 
 	/* clean up tx descriptors */
 	tx_remaining = hip04_tx_reclaim(ndev, false);
-
-	while (cnt && !last) {
+	priv->rx_cnt_remaining += hip04_recv_cnt(priv);
+	while (priv->rx_cnt_remaining && !last) {
 		buf = priv->rx_buf[priv->rx_head];
 		skb = build_skb(buf, priv->rx_buf_size);
 		if (unlikely(!skb)) {
@@ -547,11 +547,13 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 		hip04_set_recv_desc(priv, phys);
 
 		priv->rx_head = RX_NEXT(priv->rx_head);
-		if (rx >= budget)
+		if (rx >= budget) {
+			--priv->rx_cnt_remaining;
 			goto done;
+		}
 
-		if (--cnt == 0)
-			cnt = hip04_recv_cnt(priv);
+		if (--priv->rx_cnt_remaining == 0)
+			priv->rx_cnt_remaining += hip04_recv_cnt(priv);
 	}
 
 	if (!(priv->reg_inten & RCV_INT)) {
@@ -636,6 +638,7 @@ static int hip04_mac_open(struct net_device *ndev)
 	int i;
 
 	priv->rx_head = 0;
+	priv->rx_cnt_remaining = 0;
 	priv->tx_head = 0;
 	priv->tx_tail = 0;
 	hip04_reset_ppe(priv);
-- 
2.20.1


From e5759126f1326d550cd77acdcb1ec4e1b7e86873 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Sun, 27 Oct 2019 16:39:15 +0200
Subject: [PATCH 11/36] net/mlx4_core: Dynamically set guaranteed amount of
 counters per VF

[ Upstream commit e19868efea0c103f23b4b7e986fd0a703822111f ]

Prior to this patch, the amount of counters guaranteed per VF in the
resource tracker was MLX4_VF_COUNTERS_PER_PORT * MLX4_MAX_PORTS. It was
set regardless if the VF was single or dual port.
This caused several VFs to have no guaranteed counters although the
system could satisfy their request.

The fix is to dynamically guarantee counters, based on each VF
specification.

Fixes: 9de92c60beaa ("net/mlx4_core: Adjust counter grant policy in the resource tracker")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlx4/resource_tracker.c | 42 ++++++++++++-------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 676428a57662..a4c1ed65f620 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -471,12 +471,31 @@ void mlx4_init_quotas(struct mlx4_dev *dev)
 		priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf];
 }
 
-static int get_max_gauranteed_vfs_counter(struct mlx4_dev *dev)
+static int
+mlx4_calc_res_counter_guaranteed(struct mlx4_dev *dev,
+				 struct resource_allocator *res_alloc,
+				 int vf)
 {
-	/* reduce the sink counter */
-	return (dev->caps.max_counters - 1 -
-		(MLX4_PF_COUNTERS_PER_PORT * MLX4_MAX_PORTS))
-		/ MLX4_MAX_PORTS;
+	struct mlx4_active_ports actv_ports;
+	int ports, counters_guaranteed;
+
+	/* For master, only allocate according to the number of phys ports */
+	if (vf == mlx4_master_func_num(dev))
+		return MLX4_PF_COUNTERS_PER_PORT * dev->caps.num_ports;
+
+	/* calculate real number of ports for the VF */
+	actv_ports = mlx4_get_active_ports(dev, vf);
+	ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	counters_guaranteed = ports * MLX4_VF_COUNTERS_PER_PORT;
+
+	/* If we do not have enough counters for this VF, do not
+	 * allocate any for it. '-1' to reduce the sink counter.
+	 */
+	if ((res_alloc->res_reserved + counters_guaranteed) >
+	    (dev->caps.max_counters - 1))
+		return 0;
+
+	return counters_guaranteed;
 }
 
 int mlx4_init_resource_tracker(struct mlx4_dev *dev)
@@ -484,7 +503,6 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int i, j;
 	int t;
-	int max_vfs_guarantee_counter = get_max_gauranteed_vfs_counter(dev);
 
 	priv->mfunc.master.res_tracker.slave_list =
 		kcalloc(dev->num_slaves, sizeof(struct slave_list),
@@ -603,16 +621,8 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 				break;
 			case RES_COUNTER:
 				res_alloc->quota[t] = dev->caps.max_counters;
-				if (t == mlx4_master_func_num(dev))
-					res_alloc->guaranteed[t] =
-						MLX4_PF_COUNTERS_PER_PORT *
-						MLX4_MAX_PORTS;
-				else if (t <= max_vfs_guarantee_counter)
-					res_alloc->guaranteed[t] =
-						MLX4_VF_COUNTERS_PER_PORT *
-						MLX4_MAX_PORTS;
-				else
-					res_alloc->guaranteed[t] = 0;
+				res_alloc->guaranteed[t] =
+					mlx4_calc_res_counter_guaranteed(dev, res_alloc, t);
 				break;
 			default:
 				break;
-- 
2.20.1


From 8db8d42eb4bc4f6228905a4c51f869793ac01815 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Wed, 23 Oct 2019 18:39:04 +0200
Subject: [PATCH 12/36] netns: fix GFP flags in rtnl_net_notifyid()

[ Upstream commit d4e4fdf9e4a27c87edb79b1478955075be141f67 ]

In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.

Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.

In most cases, GFP_KERNEL is fine. The exceptions are:
  * openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
    indirectly call rtnl_net_notifyid() from RCU critical section,

  * rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
    parameter.

Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.

Found by code inspection.

Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  2 +-
 net/core/dev.c              |  2 +-
 net/core/net_namespace.c    | 17 +++++++++--------
 net/core/rtnetlink.c        | 14 +++++++-------
 net/openvswitch/datapath.c  | 20 +++++++++++---------
 5 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 3f7b166262d7..5007eaba207d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -322,7 +322,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #define __net_initconst	__initconst
 #endif
 
-int peernet2id_alloc(struct net *net, struct net *peer);
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
 int peernet2id(struct net *net, struct net *peer);
 bool peernet_has_id(struct net *net, struct net *peer);
 struct net *get_net_ns_by_id(struct net *net, int id);
diff --git a/net/core/dev.c b/net/core/dev.c
index ddd8aab20adf..4a2ee1ce6c02 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9211,7 +9211,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 
-	new_nsid = peernet2id_alloc(dev_net(dev), net);
+	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
 	/* If there is an ifindex conflict assign a new one */
 	if (__dev_get_by_index(net, dev->ifindex))
 		new_ifindex = dev_new_index(net);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7320f0844a50..6dab186d4b8f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -226,11 +226,11 @@ static int __peernet2id(struct net *net, struct net *peer)
 	return __peernet2id_alloc(net, peer, &no);
 }
 
-static void rtnl_net_notifyid(struct net *net, int cmd, int id);
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp);
 /* This function returns the id of a peer netns. If no id is assigned, one will
  * be allocated and returned.
  */
-int peernet2id_alloc(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
 {
 	bool alloc = false, alive = false;
 	int id;
@@ -249,7 +249,7 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
-		rtnl_net_notifyid(net, RTM_NEWNSID, id);
+		rtnl_net_notifyid(net, RTM_NEWNSID, id, gfp);
 	if (alive)
 		put_net(peer);
 	return id;
@@ -495,7 +495,8 @@ static void unhash_nsid(struct net *net, struct net *last)
 			idr_remove(&tmp->netns_ids, id);
 		spin_unlock_bh(&tmp->nsid_lock);
 		if (id >= 0)
-			rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+			rtnl_net_notifyid(tmp, RTM_DELNSID, id,
+					  GFP_KERNEL);
 		if (tmp == last)
 			break;
 	}
@@ -720,7 +721,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = alloc_netid(net, peer, nsid);
 	spin_unlock_bh(&net->nsid_lock);
 	if (err >= 0) {
-		rtnl_net_notifyid(net, RTM_NEWNSID, err);
+		rtnl_net_notifyid(net, RTM_NEWNSID, err, GFP_KERNEL);
 		err = 0;
 	} else if (err == -ENOSPC && nsid >= 0) {
 		err = -EEXIST;
@@ -862,12 +863,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp)
 {
 	struct sk_buff *msg;
 	int err = -ENOMEM;
 
-	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+	msg = nlmsg_new(rtnl_net_get_size(), gfp);
 	if (!msg)
 		goto out;
 
@@ -875,7 +876,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
 	if (err < 0)
 		goto err_out;
 
-	rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+	rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, gfp);
 	return;
 
 err_out:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3932eed379a4..95768a9fca06 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1519,7 +1519,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
 
 static int rtnl_fill_link_netnsid(struct sk_buff *skb,
 				  const struct net_device *dev,
-				  struct net *src_net)
+				  struct net *src_net, gfp_t gfp)
 {
 	bool put_iflink = false;
 
@@ -1527,7 +1527,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb,
 		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
 
 		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id_alloc(src_net, link_net);
+			int id = peernet2id_alloc(src_net, link_net, gfp);
 
 			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
 				return -EMSGSIZE;
@@ -1585,7 +1585,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
 			    u32 event, int *new_nsid, int new_ifindex,
-			    int tgt_netnsid)
+			    int tgt_netnsid, gfp_t gfp)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1677,7 +1677,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rtnl_fill_link_netnsid(skb, dev, src_net))
+	if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
 		goto nla_put_failure;
 
 	if (new_nsid &&
@@ -1933,7 +1933,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
 					       ext_filter_mask, 0, NULL, 0,
-					       netnsid);
+					       netnsid, GFP_KERNEL);
 
 			if (err < 0) {
 				if (likely(skb->len))
@@ -3215,7 +3215,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
-			       0, NULL, 0, netnsid);
+			       0, NULL, 0, netnsid, GFP_KERNEL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -3325,7 +3325,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, 0, 0, change, 0, 0, event,
-			       new_nsid, new_ifindex, -1);
+			       new_nsid, new_ifindex, -1, flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 8e396c7c8389..c297fe8afdfe 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1843,7 +1843,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
 /* Called with ovs_mutex or RCU read lock. */
 static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 				   struct net *net, u32 portid, u32 seq,
-				   u32 flags, u8 cmd)
+				   u32 flags, u8 cmd, gfp_t gfp)
 {
 	struct ovs_header *ovs_header;
 	struct ovs_vport_stats vport_stats;
@@ -1864,7 +1864,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 		goto nla_put_failure;
 
 	if (!net_eq(net, dev_net(vport->dev))) {
-		int id = peernet2id_alloc(net, dev_net(vport->dev));
+		int id = peernet2id_alloc(net, dev_net(vport->dev), gfp);
 
 		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
 			goto nla_put_failure;
@@ -1905,11 +1905,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
 	struct sk_buff *skb;
 	int retval;
 
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd);
+	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
+					 GFP_KERNEL);
 	BUG_ON(retval < 0);
 
 	return skb;
@@ -2042,7 +2043,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_NEW);
+				      OVS_VPORT_CMD_NEW, GFP_KERNEL);
 
 	if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
 		update_headroom(dp);
@@ -2101,7 +2102,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_NEW);
+				      OVS_VPORT_CMD_NEW, GFP_ATOMIC);
 	BUG_ON(err < 0);
 
 	ovs_unlock();
@@ -2140,7 +2141,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_DEL);
+				      OVS_VPORT_CMD_DEL, GFP_KERNEL);
 	BUG_ON(err < 0);
 
 	/* the vport deletion may trigger dp headroom update */
@@ -2182,7 +2183,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		goto exit_unlock_free;
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_NEW);
+				      OVS_VPORT_CMD_NEW, GFP_ATOMIC);
 	BUG_ON(err < 0);
 	rcu_read_unlock();
 
@@ -2218,7 +2219,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
 						    NLM_F_MULTI,
-						    OVS_VPORT_CMD_NEW) < 0)
+						    OVS_VPORT_CMD_NEW,
+						    GFP_ATOMIC) < 0)
 				goto out;
 
 			j++;
-- 
2.20.1


From b8161aaf625849529ad38663c29527f2783225fb Mon Sep 17 00:00:00 2001
From: Daniel Wagner <dwagner@suse.de>
Date: Fri, 25 Oct 2019 10:04:13 +0200
Subject: [PATCH 13/36] net: usb: lan78xx: Disable interrupts before calling
 generic_handle_irq()

[ Upstream commit 0a29ac5bd3a988dc151c8d26910dec2557421f64 ]

lan78xx_status() will run with interrupts enabled due to the change in
ed194d136769 ("usb: core: remove local_irq_save() around ->complete()
handler"). generic_handle_irq() expects to be run with IRQs disabled.

[    4.886203] 000: irq 79 handler irq_default_primary_handler+0x0/0x8 enabled interrupts
[    4.886243] 000: WARNING: CPU: 0 PID: 0 at kernel/irq/handle.c:152 __handle_irq_event_percpu+0x154/0x168
[    4.896294] 000: Modules linked in:
[    4.896301] 000: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.6 #39
[    4.896310] 000: Hardware name: Raspberry Pi 3 Model B+ (DT)
[    4.896315] 000: pstate: 60000005 (nZCv daif -PAN -UAO)
[    4.896321] 000: pc : __handle_irq_event_percpu+0x154/0x168
[    4.896331] 000: lr : __handle_irq_event_percpu+0x154/0x168
[    4.896339] 000: sp : ffff000010003cc0
[    4.896346] 000: x29: ffff000010003cc0 x28: 0000000000000060
[    4.896355] 000: x27: ffff000011021980 x26: ffff00001189c72b
[    4.896364] 000: x25: ffff000011702bc0 x24: ffff800036d6e400
[    4.896373] 000: x23: 000000000000004f x22: ffff000010003d64
[    4.896381] 000: x21: 0000000000000000 x20: 0000000000000002
[    4.896390] 000: x19: ffff8000371c8480 x18: 0000000000000060
[    4.896398] 000: x17: 0000000000000000 x16: 00000000000000eb
[    4.896406] 000: x15: ffff000011712d18 x14: 7265746e69206465
[    4.896414] 000: x13: ffff000010003ba0 x12: ffff000011712df0
[    4.896422] 000: x11: 0000000000000001 x10: ffff000011712e08
[    4.896430] 000: x9 : 0000000000000001 x8 : 000000000003c920
[    4.896437] 000: x7 : ffff0000118cc410 x6 : ffff0000118c7f00
[    4.896445] 000: x5 : 000000000003c920 x4 : 0000000000004510
[    4.896453] 000: x3 : ffff000011712dc8 x2 : 0000000000000000
[    4.896461] 000: x1 : 73a3f67df94c1500 x0 : 0000000000000000
[    4.896466] 000: Call trace:
[    4.896471] 000:  __handle_irq_event_percpu+0x154/0x168
[    4.896481] 000:  handle_irq_event_percpu+0x50/0xb0
[    4.896489] 000:  handle_irq_event+0x40/0x98
[    4.896497] 000:  handle_simple_irq+0xa4/0xf0
[    4.896505] 000:  generic_handle_irq+0x24/0x38
[    4.896513] 000:  intr_complete+0xb0/0xe0
[    4.896525] 000:  __usb_hcd_giveback_urb+0x58/0xd8
[    4.896533] 000:  usb_giveback_urb_bh+0xd0/0x170
[    4.896539] 000:  tasklet_action_common.isra.0+0x9c/0x128
[    4.896549] 000:  tasklet_hi_action+0x24/0x30
[    4.896556] 000:  __do_softirq+0x120/0x23c
[    4.896564] 000:  irq_exit+0xb8/0xd8
[    4.896571] 000:  __handle_domain_irq+0x64/0xb8
[    4.896579] 000:  bcm2836_arm_irqchip_handle_irq+0x60/0xc0
[    4.896586] 000:  el1_irq+0xb8/0x140
[    4.896592] 000:  arch_cpu_idle+0x10/0x18
[    4.896601] 000:  do_idle+0x200/0x280
[    4.896608] 000:  cpu_startup_entry+0x20/0x28
[    4.896615] 000:  rest_init+0xb4/0xc0
[    4.896623] 000:  arch_call_rest_init+0xc/0x14
[    4.896632] 000:  start_kernel+0x454/0x480

Fixes: ed194d136769 ("usb: core: remove local_irq_save() around ->complete() handler")
Cc: Woojung Huh <woojung.huh@microchip.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Stefan Wahren <wahrenst@gmx.net>
Cc: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
Tested-by: Stefan Wahren <wahrenst@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index e20266bd209e..64b6a769e781 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1278,8 +1278,11 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb)
 		netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata);
 		lan78xx_defer_kevent(dev, EVENT_LINK_RESET);
 
-		if (dev->domain_data.phyirq > 0)
+		if (dev->domain_data.phyirq > 0) {
+			local_irq_disable();
 			generic_handle_irq(dev->domain_data.phyirq);
+			local_irq_enable();
+		}
 	} else
 		netdev_warn(dev->net,
 			    "unexpected interrupt: 0x%08x\n", intdata);
-- 
2.20.1


From 50c7bb0cd99eda90946e4f2a30ab57998ae1cbb6 Mon Sep 17 00:00:00 2001
From: zhanglin <zhang.lin16@zte.com.cn>
Date: Sat, 26 Oct 2019 15:54:16 +0800
Subject: [PATCH 14/36] net: Zeroing the structure ethtool_wolinfo in
 ethtool_get_wol()

[ Upstream commit 5ff223e86f5addbfae26419cbb5d61d98f6fbf7d ]

memset() the structure ethtool_wolinfo that has padded bytes
but the padded bytes have not been zeroed out.

Signed-off-by: zhanglin <zhang.lin16@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 996813f345d5..09d828a6a173 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1482,11 +1482,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 
 static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
 {
-	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+	struct ethtool_wolinfo wol;
 
 	if (!dev->ethtool_ops->get_wol)
 		return -EOPNOTSUPP;
 
+	memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+	wol.cmd = ETHTOOL_GWOL;
 	dev->ethtool_ops->get_wol(dev, &wol);
 
 	if (copy_to_user(useraddr, &wol, sizeof(wol)))
-- 
2.20.1


From 6111091c3a06c10c69bf07e5489099b8e881544f Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 31 Oct 2019 16:24:36 -0700
Subject: [PATCH 15/36] selftests: net: reuseport_dualstack: fix uninitalized
 parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit d64479a3e3f9924074ca7b50bd72fa5211dca9c1 ]

This test reports EINVAL for getsockopt(SOL_SOCKET, SO_DOMAIN)
occasionally due to the uninitialized length parameter.
Initialize it to fix this, and also use int for "test_family" to comply
with the API standard.

Fixes: d6a61f80b871 ("soreuseport: test mixed v4/v6 sockets")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Wei Wang <weiwan@google.com>
Cc: Craig Gallek <cgallek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/reuseport_dualstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/reuseport_dualstack.c b/tools/testing/selftests/net/reuseport_dualstack.c
index fe3230c55986..fb7a59ed759e 100644
--- a/tools/testing/selftests/net/reuseport_dualstack.c
+++ b/tools/testing/selftests/net/reuseport_dualstack.c
@@ -129,7 +129,7 @@ static void test(int *rcv_fds, int count, int proto)
 {
 	struct epoll_event ev;
 	int epfd, i, test_fd;
-	uint16_t test_family;
+	int test_family;
 	socklen_t len;
 
 	epfd = epoll_create(1);
@@ -146,6 +146,7 @@ static void test(int *rcv_fds, int count, int proto)
 	send_from_v4(proto);
 
 	test_fd = receive_once(epfd, proto);
+	len = sizeof(test_family);
 	if (getsockopt(test_fd, SOL_SOCKET, SO_DOMAIN, &test_family, &len))
 		error(1, errno, "failed to read socket domain");
 	if (test_family != AF_INET)
-- 
2.20.1


From ffccf124416b64b2fec8a99ab43f2fce1028a9da Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 24 Oct 2019 11:43:31 -0700
Subject: [PATCH 16/36] udp: fix data-race in udp_set_dev_scratch()

[ Upstream commit a793183caa9afae907a0d7ddd2ffd57329369bf5 ]

KCSAN reported a data-race in udp_set_dev_scratch() [1]

The issue here is that we must not write over skb fields
if skb is shared. A similar issue has been fixed in commit
89c22d8c3b27 ("net: Fix skb csum races when peeking")

While we are at it, use a helper only dealing with
udp_skb_scratch(skb)->csum_unnecessary, as this allows
udp_set_dev_scratch() to be called once and thus inlined.

[1]
BUG: KCSAN: data-race in udp_set_dev_scratch / udpv6_recvmsg

write to 0xffff888120278317 of 1 bytes by task 10411 on cpu 1:
 udp_set_dev_scratch+0xea/0x200 net/ipv4/udp.c:1308
 __first_packet_length+0x147/0x420 net/ipv4/udp.c:1556
 first_packet_length+0x68/0x2a0 net/ipv4/udp.c:1579
 udp_poll+0xea/0x110 net/ipv4/udp.c:2720
 sock_poll+0xed/0x250 net/socket.c:1256
 vfs_poll include/linux/poll.h:90 [inline]
 do_select+0x7d0/0x1020 fs/select.c:534
 core_sys_select+0x381/0x550 fs/select.c:677
 do_pselect.constprop.0+0x11d/0x160 fs/select.c:759
 __do_sys_pselect6 fs/select.c:784 [inline]
 __se_sys_pselect6 fs/select.c:769 [inline]
 __x64_sys_pselect6+0x12e/0x170 fs/select.c:769
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

read to 0xffff888120278317 of 1 bytes by task 10413 on cpu 0:
 udp_skb_csum_unnecessary include/net/udp.h:358 [inline]
 udpv6_recvmsg+0x43e/0xe90 net/ipv6/udp.c:310
 inet6_recvmsg+0xbb/0x240 net/ipv6/af_inet6.c:592
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 10413 Comm: syz-executor.0 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ec2ab82d0d4b..42f6027e30b4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1270,6 +1270,20 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
 		scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
 }
 
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+	/* We come here after udp_lib_checksum_complete() returned 0.
+	 * This means that __skb_checksum_complete() might have
+	 * set skb->csum_valid to 1.
+	 * On 64bit platforms, we can set csum_unnecessary
+	 * to true, but only if the skb is not shared.
+	 */
+#if BITS_PER_LONG == 64
+	if (!skb_shared(skb))
+		udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
 static int udp_skb_truesize(struct sk_buff *skb)
 {
 	return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
@@ -1504,10 +1518,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
 			*total += skb->truesize;
 			kfree_skb(skb);
 		} else {
-			/* the csum related bits could be changed, refresh
-			 * the scratch area
-			 */
-			udp_set_dev_scratch(skb);
+			udp_skb_csum_unnecessary_set(skb);
 			break;
 		}
 	}
-- 
2.20.1


From 7da26dc1dc0eee7b30810b640e49674fdad35a9c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 29 Oct 2019 01:24:32 +0800
Subject: [PATCH 17/36] vxlan: check tun_info options_len properly

[ Upstream commit eadf52cf1852196a1363044dcda22fa5d7f296f7 ]

This patch is to improve the tun_info options_len by dropping
the skb when TUNNEL_VXLAN_OPT is set but options_len is less
than vxlan_metadata. This can void a potential out-of-bounds
access on ip_tun_info.

Fixes: ee122c79d422 ("vxlan: Flow based tunneling")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0b1ec44acbf9..2a536f84d5f6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2174,9 +2174,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		vni = tunnel_id_to_key32(info->key.tun_id);
 		ifindex = 0;
 		dst_cache = &info->dst_cache;
-		if (info->options_len &&
-		    info->key.tun_flags & TUNNEL_VXLAN_OPT)
+		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+			if (info->options_len < sizeof(*md))
+				goto drop;
 			md = ip_tunnel_info_opts(info);
+		}
 		ttl = info->key.ttl;
 		tos = info->key.tos;
 		label = info->key.label;
-- 
2.20.1


From 0c11c1f5af0ec5f58bea057e37d76c7f5c5e17ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:48 -0700
Subject: [PATCH 18/36] net: add skb_queue_empty_lockless()

[ Upstream commit d7d16a89350ab263484c0aa2b523dd3a234e4a80 ]

Some paths call skb_queue_empty() without holding
the queue lock. We must use a barrier in order
to not let the compiler do strange things, and avoid
KCSAN splats.

Adding a barrier in skb_queue_empty() might be overkill,
I prefer adding a new helper to clearly identify
points where the callers might be lockless. This might
help us finding real bugs.

The corresponding WRITE_ONCE() should add zero cost
for current compilers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28baccb1efd5..7a683ea409fe 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1379,6 +1379,19 @@ static inline int skb_queue_empty(const struct sk_buff_head *list)
 	return list->next == (const struct sk_buff *) list;
 }
 
+/**
+ *	skb_queue_empty_lockless - check if a queue is empty
+ *	@list: queue head
+ *
+ *	Returns true if the queue is empty, false otherwise.
+ *	This variant can be used in lockless contexts.
+ */
+static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
+{
+	return READ_ONCE(list->next) == (const struct sk_buff *) list;
+}
+
+
 /**
  *	skb_queue_is_last - check if skb is the last entry in the queue
  *	@list: queue head
@@ -1723,9 +1736,11 @@ static inline void __skb_insert(struct sk_buff *newsk,
 				struct sk_buff *prev, struct sk_buff *next,
 				struct sk_buff_head *list)
 {
-	newsk->next = next;
-	newsk->prev = prev;
-	next->prev  = prev->next = newsk;
+	/* see skb_queue_empty_lockless() for the opposite READ_ONCE() */
+	WRITE_ONCE(newsk->next, next);
+	WRITE_ONCE(newsk->prev, prev);
+	WRITE_ONCE(next->prev, newsk);
+	WRITE_ONCE(prev->next, newsk);
 	list->qlen++;
 }
 
@@ -1736,11 +1751,11 @@ static inline void __skb_queue_splice(const struct sk_buff_head *list,
 	struct sk_buff *first = list->next;
 	struct sk_buff *last = list->prev;
 
-	first->prev = prev;
-	prev->next = first;
+	WRITE_ONCE(first->prev, prev);
+	WRITE_ONCE(prev->next, first);
 
-	last->next = next;
-	next->prev = last;
+	WRITE_ONCE(last->next, next);
+	WRITE_ONCE(next->prev, last);
 }
 
 /**
@@ -1881,8 +1896,8 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 	next	   = skb->next;
 	prev	   = skb->prev;
 	skb->next  = skb->prev = NULL;
-	next->prev = prev;
-	prev->next = next;
+	WRITE_ONCE(next->prev, prev);
+	WRITE_ONCE(prev->next, next);
 }
 
 /**
-- 
2.20.1


From 4aea278175a64ae5d43d2eae6495e8631e2eb4fa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:49 -0700
Subject: [PATCH 19/36] udp: use skb_queue_empty_lockless()

[ Upstream commit 137a0dbe3426fd7bcfe3f8117b36a87b3590e4eb ]

syzbot reported a data-race [1].

We should use skb_queue_empty_lockless() to document that we are
not ensuring a mutual exclusion and silence KCSAN.

[1]
BUG: KCSAN: data-race in __skb_recv_udp / __udp_enqueue_schedule_skb

write to 0xffff888122474b50 of 8 bytes by interrupt on cpu 0:
 __skb_insert include/linux/skbuff.h:1852 [inline]
 __skb_queue_before include/linux/skbuff.h:1958 [inline]
 __skb_queue_tail include/linux/skbuff.h:1991 [inline]
 __udp_enqueue_schedule_skb+0x2c1/0x410 net/ipv4/udp.c:1470
 __udp_queue_rcv_skb net/ipv4/udp.c:1940 [inline]
 udp_queue_rcv_one_skb+0x7bd/0xc70 net/ipv4/udp.c:2057
 udp_queue_rcv_skb+0xb5/0x400 net/ipv4/udp.c:2074
 udp_unicast_rcv_skb.isra.0+0x7e/0x1c0 net/ipv4/udp.c:2233
 __udp4_lib_rcv+0xa44/0x17c0 net/ipv4/udp.c:2300
 udp_rcv+0x2b/0x40 net/ipv4/udp.c:2470
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

read to 0xffff888122474b50 of 8 bytes by task 8921 on cpu 1:
 skb_queue_empty include/linux/skbuff.h:1494 [inline]
 __skb_recv_udp+0x18d/0x500 net/ipv4/udp.c:1653
 udp_recvmsg+0xe1/0xb10 net/ipv4/udp.c:1712
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 8921 Comm: syz-executor.4 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 42f6027e30b4..c11eb6c9b4ff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1542,7 +1542,7 @@ static int first_packet_length(struct sock *sk)
 
 	spin_lock_bh(&rcvq->lock);
 	skb = __first_packet_length(sk, rcvq, &total);
-	if (!skb && !skb_queue_empty(sk_queue)) {
+	if (!skb && !skb_queue_empty_lockless(sk_queue)) {
 		spin_lock(&sk_queue->lock);
 		skb_queue_splice_tail_init(sk_queue, rcvq);
 		spin_unlock(&sk_queue->lock);
@@ -1617,7 +1617,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 				return skb;
 			}
 
-			if (skb_queue_empty(sk_queue)) {
+			if (skb_queue_empty_lockless(sk_queue)) {
 				spin_unlock_bh(&queue->lock);
 				goto busy_check;
 			}
@@ -1644,7 +1644,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 				break;
 
 			sk_busy_loop(sk, flags & MSG_DONTWAIT);
-		} while (!skb_queue_empty(sk_queue));
+		} while (!skb_queue_empty_lockless(sk_queue));
 
 		/* sk_queue is empty, reader_queue may contain peeked packets */
 	} while (timeo &&
-- 
2.20.1


From 9a10fef002ff0238fa5df4d085db095590c12ca2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:50 -0700
Subject: [PATCH 20/36] net: use skb_queue_empty_lockless() in poll() handlers

[ Upstream commit 3ef7cf57c72f32f61e97f8fa401bc39ea1f1a5d4 ]

Many poll() handlers are lockless. Using skb_queue_empty_lockless()
instead of skb_queue_empty() is more appropriate.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/capi/capi.c     | 2 +-
 net/atm/common.c             | 2 +-
 net/bluetooth/af_bluetooth.c | 4 ++--
 net/caif/caif_socket.c       | 2 +-
 net/core/datagram.c          | 4 ++--
 net/decnet/af_decnet.c       | 2 +-
 net/ipv4/tcp.c               | 2 +-
 net/ipv4/udp.c               | 2 +-
 net/nfc/llcp_sock.c          | 4 ++--
 net/phonet/socket.c          | 4 ++--
 net/sctp/socket.c            | 4 ++--
 net/tipc/socket.c            | 4 ++--
 net/unix/af_unix.c           | 6 +++---
 net/vmw_vsock/af_vsock.c     | 2 +-
 14 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 21786a442368..c67fd2fb333c 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -744,7 +744,7 @@ capi_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &(cdev->recvwait), wait);
 	mask = EPOLLOUT | EPOLLWRNORM;
-	if (!skb_queue_empty(&cdev->recvqueue))
+	if (!skb_queue_empty_lockless(&cdev->recvqueue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 	return mask;
 }
diff --git a/net/atm/common.c b/net/atm/common.c
index a38c174fc766..6772eddf6ec0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -667,7 +667,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* writable? */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 8d12198eaa94..ee60c30f3be2 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
 
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= EPOLLHUP;
 
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == BT_CLOSED)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 416717c57cd1..4b31f0aaa96d 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file,
 		mask |= EPOLLRDHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 		(sk->sk_shutdown & RCV_SHUTDOWN))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a487df53a453..b69e0e28b826 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -842,7 +842,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 	mask = 0;
 
 	/* exceptional events? */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -852,7 +852,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 7d6ff983ba2c..0e6f32defd67 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1213,7 +1213,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wai
 	struct dn_scp *scp = DN_SK(sk);
 	__poll_t mask = datagram_poll(file, sock, wait);
 
-	if (!skb_queue_empty(&scp->other_receive_queue))
+	if (!skb_queue_empty_lockless(&scp->other_receive_queue))
 		mask |= EPOLLRDBAND;
 
 	return mask;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 611ba174265c..0e7bcc0b3640 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -595,7 +595,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR;
 
 	return mask;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c11eb6c9b4ff..8877bd140a0d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2651,7 +2651,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	__poll_t mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 
-	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index e0a2cb8a029f..7d766350c08e 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -566,11 +566,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
 
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == LLCP_CLOSED)
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 30187990257f..1ae629544444 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -351,9 +351,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
 
 	if (sk->sk_state == TCP_CLOSE)
 		return EPOLLERR;
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	if (!skb_queue_empty(&pn->ctrlreq_queue))
+	if (!skb_queue_empty_lockless(&pn->ctrlreq_queue))
 		mask |= EPOLLPRI;
 	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
 		return EPOLLHUP;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0ab339a42f99..d922c78b6c29 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7939,7 +7939,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	mask = 0;
 
 	/* Is there any exceptional events?  */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
@@ -7948,7 +7948,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		mask |= EPOLLHUP;
 
 	/* Is it readable?  Reconsider this code with TCP-style support.  */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* The association is either gone or not ready.  */
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 6c91f1217dcf..5841d62ff580 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -731,7 +731,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 		/* fall thru' */
 	case TIPC_LISTEN:
 	case TIPC_CONNECTING:
-		if (!skb_queue_empty(&sk->sk_receive_queue))
+		if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 			revents |= EPOLLIN | EPOLLRDNORM;
 		break;
 	case TIPC_OPEN:
@@ -739,7 +739,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 			revents |= EPOLLOUT;
 		if (!tipc_sk_type_connectionless(sk))
 			break;
-		if (skb_queue_empty(&sk->sk_receive_queue))
+		if (skb_queue_empty_lockless(&sk->sk_receive_queue))
 			break;
 		revents |= EPOLLIN | EPOLLRDNORM;
 		break;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f601933ad728..231b6c032d2c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2661,7 +2661,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
@@ -2690,7 +2690,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	mask = 0;
 
 	/* exceptional events? */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -2700,7 +2700,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 2a4613b239e0..b2724fb06bbb 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -873,7 +873,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 		 * the queue and write as long as the socket isn't shutdown for
 		 * sending.
 		 */
-		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
 			mask |= EPOLLIN | EPOLLRDNORM;
 		}
-- 
2.20.1


From f745252a8ea33dbe2ecb4495e729fd5145d98052 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:51 -0700
Subject: [PATCH 21/36] net: use skb_queue_empty_lockless() in busy poll
 contexts

[ Upstream commit 3f926af3f4d688e2e11e7f8ed04e277a14d4d4a4 ]

Busy polling usually runs without locks.
Let's use skb_queue_empty_lockless() instead of skb_queue_empty()

Also uses READ_ONCE() in __skb_try_recv_datagram() to address
a similar potential problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 2 +-
 net/core/datagram.c                     | 2 +-
 net/core/sock.c                         | 2 +-
 net/ipv4/tcp.c                          | 2 +-
 net/sctp/socket.c                       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index afebbd87c4aa..1587f4ac6821 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1716,7 +1716,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return peekmsg(sk, msg, len, nonblock, flags);
 
 	if (sk_can_busy_loop(sk) &&
-	    skb_queue_empty(&sk->sk_receive_queue) &&
+	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
 	    sk->sk_state == TCP_ESTABLISHED)
 		sk_busy_loop(sk, nonblock);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b69e0e28b826..d3b99616a6c5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -279,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 			break;
 
 		sk_busy_loop(sk, flags & MSG_DONTWAIT);
-	} while (sk->sk_receive_queue.prev != *last);
+	} while (READ_ONCE(sk->sk_receive_queue.prev) != *last);
 
 	error = -EAGAIN;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 5f652dbe66a2..6c1107821776 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3483,7 +3483,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
 {
 	struct sock *sk = p;
 
-	return !skb_queue_empty(&sk->sk_receive_queue) ||
+	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 	       sk_busy_loop_timeout(sk, start_time);
 }
 EXPORT_SYMBOL(sk_busy_loop_end);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e7bcc0b3640..647ba447bf1a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1948,7 +1948,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
 
-	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
 	    (sk->sk_state == TCP_ESTABLISHED))
 		sk_busy_loop(sk, nonblock);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d922c78b6c29..c76631552722 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8334,7 +8334,7 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		if (sk_can_busy_loop(sk)) {
 			sk_busy_loop(sk, noblock);
 
-			if (!skb_queue_empty(&sk->sk_receive_queue))
+			if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 				continue;
 		}
 
-- 
2.20.1


From c482c99bbacd8c8e2cc0d9b0e251c7edcdd0141b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:52 -0700
Subject: [PATCH 22/36] net: add READ_ONCE() annotation in
 __skb_wait_for_more_packets()

[ Upstream commit 7c422d0ce97552dde4a97e6290de70ec6efb0fc6 ]

__skb_wait_for_more_packets() can be called while other cpus
can feed packets to the socket receive queue.

KCSAN reported :

BUG: KCSAN: data-race in __skb_wait_for_more_packets / __udp_enqueue_schedule_skb

write to 0xffff888102e40b58 of 8 bytes by interrupt on cpu 0:
 __skb_insert include/linux/skbuff.h:1852 [inline]
 __skb_queue_before include/linux/skbuff.h:1958 [inline]
 __skb_queue_tail include/linux/skbuff.h:1991 [inline]
 __udp_enqueue_schedule_skb+0x2d7/0x410 net/ipv4/udp.c:1470
 __udp_queue_rcv_skb net/ipv4/udp.c:1940 [inline]
 udp_queue_rcv_one_skb+0x7bd/0xc70 net/ipv4/udp.c:2057
 udp_queue_rcv_skb+0xb5/0x400 net/ipv4/udp.c:2074
 udp_unicast_rcv_skb.isra.0+0x7e/0x1c0 net/ipv4/udp.c:2233
 __udp4_lib_rcv+0xa44/0x17c0 net/ipv4/udp.c:2300
 udp_rcv+0x2b/0x40 net/ipv4/udp.c:2470
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

read to 0xffff888102e40b58 of 8 bytes by task 13035 on cpu 1:
 __skb_wait_for_more_packets+0xfa/0x320 net/core/datagram.c:100
 __skb_recv_udp+0x374/0x500 net/ipv4/udp.c:1683
 udp_recvmsg+0xe1/0xb10 net/ipv4/udp.c:1712
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 13035 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index d3b99616a6c5..865a8cb7b0bd 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -95,7 +95,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 	if (error)
 		goto out_err;
 
-	if (sk->sk_receive_queue.prev != skb)
+	if (READ_ONCE(sk->sk_receive_queue.prev) != skb)
 		goto out;
 
 	/* Socket shut down? */
-- 
2.20.1


From 1c045f22a642cbc151bbfccf579d55188e76d908 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sat, 26 Oct 2019 11:53:39 +0200
Subject: [PATCH 23/36] ipv4: fix route update on metric change.

[ Upstream commit 0b834ba00ab5337e938c727e216e1f5249794717 ]

Since commit af4d768ad28c ("net/ipv4: Add support for specifying metric
of connected routes"), when updating an IP address with a different metric,
the associated connected route is updated, too.

Still, the mentioned commit doesn't handle properly some corner cases:

$ ip addr add dev eth0 192.168.1.0/24
$ ip addr add dev eth0 192.168.2.1/32 peer 192.168.2.2
$ ip addr add dev eth0 192.168.3.1/24
$ ip addr change dev eth0 192.168.1.0/24 metric 10
$ ip addr change dev eth0 192.168.2.1/32 peer 192.168.2.2 metric 10
$ ip addr change dev eth0 192.168.3.1/24 metric 10
$ ip -4 route
192.168.1.0/24 dev eth0 proto kernel scope link src 192.168.1.0
192.168.2.2 dev eth0 proto kernel scope link src 192.168.2.1
192.168.3.0/24 dev eth0 proto kernel scope link src 192.168.2.1 metric 10

Only the last route is correctly updated.

The problem is the current test in fib_modify_prefix_metric():

	if (!(dev->flags & IFF_UP) ||
	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
	    ipv4_is_zeronet(prefix) ||
	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)

Which should be the logical 'not' of the pre-existing test in
fib_add_ifaddr():

	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
	    (prefix != addr || ifa->ifa_prefixlen < 32))

To properly negate the original expression, we need to change the last
logical 'or' to a logical 'and'.

Fixes: af4d768ad28c ("net/ipv4: Add support for specifying metric of connected routes")
Reported-and-suggested-by: Beniamino Galvani <bgalvani@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index dae743b649c1..7f4ec36e5f70 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -946,7 +946,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
 	if (!(dev->flags & IFF_UP) ||
 	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
 	    ipv4_is_zeronet(prefix) ||
-	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+	    (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
 		return;
 
 	/* add the new */
-- 
2.20.1


From 689395bedcdcd11d0ce8ca0f3dd944e4c002faef Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sat, 26 Oct 2019 11:53:40 +0200
Subject: [PATCH 24/36] selftests: fib_tests: add more tests for metric update

[ Upstream commit 37de3b354150450ba12275397155e68113e99901 ]

This patch adds two more tests to ipv4_addr_metric_test() to
explicitly cover the scenarios fixed by the previous patch.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 2f190aa8fc5f..c0885fb65767 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -1301,6 +1301,27 @@ ipv4_addr_metric_test()
 	fi
 	log_test $rc 0 "Prefix route with metric on link up"
 
+	# explicitly check for metric changes on edge scenarios
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.0/24 metric 259"
+	run_cmd "$IP addr change dev dummy2 172.16.104.0/24 metric 260"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.0 metric 260"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of .0/24 address"
+
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260"
+	run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 261"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address with peer route"
+
 	$IP li del dummy1
 	$IP li del dummy2
 	cleanup
-- 
2.20.1


From 6084bc17d4fc8d28a1cc279286f78d0064f14c91 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Mon, 16 Sep 2019 14:54:20 +0300
Subject: [PATCH 25/36] net/mlx5e: Fix handling of compressed CQEs in case of
 low NAPI budget

[ Upstream commit 9df86bdb6746d7fcfc2fda715f7a7c3d0ddb2654 ]

When CQE compression is enabled, compressed CQEs use the following
structure: a title is followed by one or many blocks, each containing 8
mini CQEs (except the last, which may contain fewer mini CQEs).

Due to NAPI budget restriction, a complete structure is not always
parsed in one NAPI run, and some blocks with mini CQEs may be deferred
to the next NAPI poll call - we have the mlx5e_decompress_cqes_cont call
in the beginning of mlx5e_poll_rx_cq. However, if the budget is
extremely low, some blocks may be left even after that, but the code
that follows the mlx5e_decompress_cqes_cont call doesn't check it and
assumes that a new CQE begins, which may not be the case. In such cases,
random memory corruptions occur.

An extremely low NAPI budget of 8 is used when busy_poll or busy_read is
active.

This commit adds a check to make sure that the previous compressed CQE
has been completely parsed after mlx5e_decompress_cqes_cont, otherwise
it prevents a new CQE from being fetched in the middle of a compressed
CQE.

This commit fixes random crashes in __build_skb, __page_pool_put_page
and other not-related-directly places, that used to happen when both CQE
compression and busy_poll/busy_read were enabled.

Fixes: 7219ab34f184 ("net/mlx5e: CQE compression")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index df49dc143c47..9cbc4173973e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1267,8 +1267,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return 0;
 
-	if (cq->decmprs_left)
+	if (cq->decmprs_left) {
 		work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
+		if (cq->decmprs_left || work_done >= budget)
+			goto out;
+	}
 
 	cqe = mlx5_cqwq_get_cqe(&cq->wq);
 	if (!cqe) {
-- 
2.20.1


From 2d35bdc1a21834a6258c35b4160505400f436bad Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 1 Nov 2019 00:10:21 +0100
Subject: [PATCH 26/36] r8169: fix wrong PHY ID issue with RTL8168dp

[ Upstream commit 62bdc8fd1c21d4263ebd18bec57f82532d09249f ]

As reported in [0] at least one RTL8168dp version has problems
establishing a link. This chip version has an integrated RTL8211b PHY,
however the chip seems to report a wrong PHY ID, resulting in a wrong
PHY driver (for Generic Realtek PHY) being loaded.
Work around this issue by adding a hook to r8168dp_2_mdio_read()
for returning the correct PHY ID.

[0] https://bbs.archlinux.org/viewtopic.php?id=246508

Fixes: 242cd9b5866a ("r8169: use phy_resume/phy_suspend")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 0c8b7146637e..4ab87fe84542 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1010,6 +1010,10 @@ static int r8168dp_2_mdio_read(struct rtl8169_private *tp, int reg)
 {
 	int value;
 
+	/* Work around issue with chip reporting wrong PHY ID */
+	if (reg == MII_PHYSID2)
+		return 0xc912;
+
 	r8168dp_2_mdio_start(tp);
 
 	value = r8169_mdio_read(tp, reg);
-- 
2.20.1


From dc022f80c17485ff66745aa30be368060c98e794 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Wed, 2 Oct 2019 16:53:21 +0300
Subject: [PATCH 27/36] net/mlx5e: Fix ethtool self test: link speed

[ Upstream commit 534e7366f41b0c689b01af4375aefcd1462adedf ]

Ethtool self test contains a test for link speed. This test reads the
PTYS register and determines whether the current speed is valid or not.
Change current implementation to use the function mlx5e_port_linkspeed()
that does the same check and fails when speed is invalid. This code
redundancy lead to a bug when mlx5e_port_linkspeed() was updated with
expended speeds and the self test was not.

Fixes: 2c81bfd5ae56 ("net/mlx5e: Move port speed code from en_ethtool.c to en/port.c")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_selftest.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 4382ef85488c..5fb088b54e66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -35,6 +35,7 @@
 #include <linux/udp.h>
 #include <net/udp.h>
 #include "en.h"
+#include "en/port.h"
 
 enum {
 	MLX5E_ST_LINK_STATE,
@@ -80,22 +81,12 @@ static int mlx5e_test_link_state(struct mlx5e_priv *priv)
 
 static int mlx5e_test_link_speed(struct mlx5e_priv *priv)
 {
-	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
-	u32 eth_proto_oper;
-	int i;
+	u32 speed;
 
 	if (!netif_carrier_ok(priv->netdev))
 		return 1;
 
-	if (mlx5_query_port_ptys(priv->mdev, out, sizeof(out), MLX5_PTYS_EN, 1))
-		return 1;
-
-	eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
-	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; i++) {
-		if (eth_proto_oper & MLX5E_PROT_MASK(i))
-			return 0;
-	}
-	return 1;
+	return mlx5e_port_linkspeed(priv->mdev, &speed);
 }
 
 struct mlx5ehdr {
-- 
2.20.1


From 588c2e4606eef4a1b8780954ba633816cc9b281b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sat, 5 Oct 2019 15:05:18 -0700
Subject: [PATCH 28/36] net: dsa: b53: Do not clear existing mirrored port mask

[ Upstream commit c763ac436b668d7417f0979430ec0312ede4093d ]

Clearing the existing bitmask of mirrored ports essentially prevents us
from capturing more than one port at any given time. This is clearly
wrong, do not clear the bitmask prior to setting up the new port.

Reported-by: Hubert Feurstein <h.feurstein@gmail.com>
Fixes: ed3af5fd08eb ("net: dsa: b53: Add support for port mirroring")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index ad534b90ef21..2d3a2cb026d2 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1584,7 +1584,6 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
 		loc = B53_EG_MIR_CTL;
 
 	b53_read16(dev, B53_MGMT_PAGE, loc, &reg);
-	reg &= ~MIRROR_MASK;
 	reg |= BIT(port);
 	b53_write16(dev, B53_MGMT_PAGE, loc, reg);
 
-- 
2.20.1


From 8674664116bcd25825450d02e181dc24e818de20 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:29 -0700
Subject: [PATCH 29/36] net: bcmgenet: don't set phydev->link from MAC

[ Upstream commit 7de48402faa32298c3551ea32c76ccb4f9d3025d ]

When commit 28b2e0d2cd13 ("net: phy: remove parameter new_link from
phy_mac_interrupt()") removed the new_link parameter it set the
phydev->link state from the MAC before invoking phy_mac_interrupt().

However, once commit 88d6272acaaa ("net: phy: avoid unneeded MDIO
reads in genphy_read_status") was added this initialization prevents
the proper determination of the connection parameters by the function
genphy_read_status().

This commit removes that initialization to restore the proper
functionality.

Fixes: 88d6272acaaa ("net: phy: avoid unneeded MDIO reads in genphy_read_status")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index fd587bed32eb..0b39118749b7 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2619,10 +2619,8 @@ static void bcmgenet_irq_task(struct work_struct *work)
 	spin_unlock_irq(&priv->lock);
 
 	/* Link UP/DOWN event */
-	if (status & UMAC_IRQ_LINK_EVENT) {
-		priv->dev->phydev->link = !!(status & UMAC_IRQ_LINK_UP);
+	if (status & UMAC_IRQ_LINK_EVENT)
 		phy_mac_interrupt(priv->dev->phydev);
-	}
 }
 
 /* bcmgenet_isr1: handle Rx and Tx priority queues */
-- 
2.20.1


From 7768e6b4215dc7df3c74560ca5346781290c0524 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:30 -0700
Subject: [PATCH 30/36] net: phy: bcm7xxx: define soft_reset for 40nm EPHY

[ Upstream commit fe586b823372a9f43f90e2c6aa0573992ce7ccb7 ]

The internal 40nm EPHYs use a "Workaround for putting the PHY in
IDDQ mode." These PHYs require a soft reset to restore functionality
after they are powered back up.

This commit defines the soft_reset function to use genphy_soft_reset
during phy_init_hw to accommodate this.

Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm7xxx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index b2b6307d64a4..acaf072bb4b0 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -643,6 +643,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev)
 	.name           = _name,					\
 	.features       = PHY_BASIC_FEATURES,				\
 	.flags          = PHY_IS_INTERNAL,				\
+	.soft_reset	= genphy_soft_reset,				\
 	.config_init    = bcm7xxx_config_init,				\
 	.suspend        = bcm7xxx_suspend,				\
 	.resume         = bcm7xxx_config_init,				\
-- 
2.20.1


From 1f327ba7fe8ad5d1ed3e7b6f89f076d0ad3e7e20 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:31 -0700
Subject: [PATCH 31/36] net: bcmgenet: soft reset 40nm EPHYs before MAC init

[ Upstream commit 1f515486275a08a17a2c806b844cca18f7de5b34 ]

It turns out that the "Workaround for putting the PHY in IDDQ mode"
used by the internal EPHYs on 40nm Set-Top Box chips when powering
down puts the interface to the GENET MAC in a state that can cause
subsequent MAC resets to be incomplete.

Rather than restore the forced soft reset when powering up internal
PHYs, this commit moves the invocation of phy_init_hw earlier in
the MAC initialization sequence to just before the MAC reset in the
open and resume functions. This allows the interface to be stable
and allows the MAC resets to be successful.

The bcmgenet_mii_probe() function is split in two to accommodate
this. The new function bcmgenet_mii_connect() handles the first
half of the functionality before the MAC initialization, and the
bcmgenet_mii_config() function is extended to provide the remaining
PHY configuration following the MAC initialization.

Fixes: 484bfa1507bf ("Revert "net: bcmgenet: Software reset EPHY after power on"")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/broadcom/genet/bcmgenet.c    |  28 +++--
 .../net/ethernet/broadcom/genet/bcmgenet.h    |   2 +-
 drivers/net/ethernet/broadcom/genet/bcmmii.c  | 112 ++++++++----------
 3 files changed, 69 insertions(+), 73 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 0b39118749b7..13ae7676d7f7 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2879,6 +2879,12 @@ static int bcmgenet_open(struct net_device *dev)
 	if (priv->internal_phy)
 		bcmgenet_power_up(priv, GENET_POWER_PASSIVE);
 
+	ret = bcmgenet_mii_connect(dev);
+	if (ret) {
+		netdev_err(dev, "failed to connect to PHY\n");
+		goto err_clk_disable;
+	}
+
 	/* take MAC out of reset */
 	bcmgenet_umac_reset(priv);
 
@@ -2888,6 +2894,12 @@ static int bcmgenet_open(struct net_device *dev)
 	reg = bcmgenet_umac_readl(priv, UMAC_CMD);
 	priv->crc_fwd_en = !!(reg & CMD_CRC_FWD);
 
+	ret = bcmgenet_mii_config(dev, true);
+	if (ret) {
+		netdev_err(dev, "unsupported PHY\n");
+		goto err_disconnect_phy;
+	}
+
 	bcmgenet_set_hw_addr(priv, dev->dev_addr);
 
 	if (priv->internal_phy) {
@@ -2903,7 +2915,7 @@ static int bcmgenet_open(struct net_device *dev)
 	ret = bcmgenet_init_dma(priv);
 	if (ret) {
 		netdev_err(dev, "failed to initialize DMA\n");
-		goto err_clk_disable;
+		goto err_disconnect_phy;
 	}
 
 	/* Always enable ring 16 - descriptor ring */
@@ -2926,25 +2938,19 @@ static int bcmgenet_open(struct net_device *dev)
 		goto err_irq0;
 	}
 
-	ret = bcmgenet_mii_probe(dev);
-	if (ret) {
-		netdev_err(dev, "failed to connect to PHY\n");
-		goto err_irq1;
-	}
-
 	bcmgenet_netif_start(dev);
 
 	netif_tx_start_all_queues(dev);
 
 	return 0;
 
-err_irq1:
-	free_irq(priv->irq1, priv);
 err_irq0:
 	free_irq(priv->irq0, priv);
 err_fini_dma:
 	bcmgenet_dma_teardown(priv);
 	bcmgenet_fini_dma(priv);
+err_disconnect_phy:
+	phy_disconnect(dev->phydev);
 err_clk_disable:
 	if (priv->internal_phy)
 		bcmgenet_power_down(priv, GENET_POWER_PASSIVE);
@@ -3657,6 +3663,8 @@ static int bcmgenet_resume(struct device *d)
 	if (priv->internal_phy)
 		bcmgenet_power_up(priv, GENET_POWER_PASSIVE);
 
+	phy_init_hw(dev->phydev);
+
 	bcmgenet_umac_reset(priv);
 
 	init_umac(priv);
@@ -3665,8 +3673,6 @@ static int bcmgenet_resume(struct device *d)
 	if (priv->wolopts)
 		clk_disable_unprepare(priv->clk_wol);
 
-	phy_init_hw(dev->phydev);
-
 	/* Speed settings must be restored */
 	bcmgenet_mii_config(priv->dev, false);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 4dabf37319c8..654544618551 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -723,8 +723,8 @@ GENET_IO_MACRO(rbuf, GENET_RBUF_OFF);
 
 /* MDIO routines */
 int bcmgenet_mii_init(struct net_device *dev);
+int bcmgenet_mii_connect(struct net_device *dev);
 int bcmgenet_mii_config(struct net_device *dev, bool init);
-int bcmgenet_mii_probe(struct net_device *dev);
 void bcmgenet_mii_exit(struct net_device *dev);
 void bcmgenet_phy_power_set(struct net_device *dev, bool enable);
 void bcmgenet_mii_setup(struct net_device *dev);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 0d527fa5de61..ec4334639d95 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -176,6 +176,46 @@ static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv)
 					  bcmgenet_fixed_phy_link_update);
 }
 
+int bcmgenet_mii_connect(struct net_device *dev)
+{
+	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct device_node *dn = priv->pdev->dev.of_node;
+	struct phy_device *phydev;
+	u32 phy_flags = 0;
+	int ret;
+
+	/* Communicate the integrated PHY revision */
+	if (priv->internal_phy)
+		phy_flags = priv->gphy_rev;
+
+	/* Initialize link state variables that bcmgenet_mii_setup() uses */
+	priv->old_link = -1;
+	priv->old_speed = -1;
+	priv->old_duplex = -1;
+	priv->old_pause = -1;
+
+	if (dn) {
+		phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
+					phy_flags, priv->phy_interface);
+		if (!phydev) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
+	} else {
+		phydev = dev->phydev;
+		phydev->dev_flags = phy_flags;
+
+		ret = phy_connect_direct(dev, phydev, bcmgenet_mii_setup,
+					 priv->phy_interface);
+		if (ret) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
 int bcmgenet_mii_config(struct net_device *dev, bool init)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
@@ -269,71 +309,21 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
 		bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL);
 	}
 
-	if (init)
-		dev_info(kdev, "configuring instance for %s\n", phy_name);
-
-	return 0;
-}
+	if (init) {
+		phydev->advertising = phydev->supported;
 
-int bcmgenet_mii_probe(struct net_device *dev)
-{
-	struct bcmgenet_priv *priv = netdev_priv(dev);
-	struct device_node *dn = priv->pdev->dev.of_node;
-	struct phy_device *phydev;
-	u32 phy_flags = 0;
-	int ret;
-
-	/* Communicate the integrated PHY revision */
-	if (priv->internal_phy)
-		phy_flags = priv->gphy_rev;
-
-	/* Initialize link state variables that bcmgenet_mii_setup() uses */
-	priv->old_link = -1;
-	priv->old_speed = -1;
-	priv->old_duplex = -1;
-	priv->old_pause = -1;
-
-	if (dn) {
-		phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
-					phy_flags, priv->phy_interface);
-		if (!phydev) {
-			pr_err("could not attach to PHY\n");
-			return -ENODEV;
-		}
-	} else {
-		phydev = dev->phydev;
-		phydev->dev_flags = phy_flags;
-
-		ret = phy_connect_direct(dev, phydev, bcmgenet_mii_setup,
-					 priv->phy_interface);
-		if (ret) {
-			pr_err("could not attach to PHY\n");
-			return -ENODEV;
-		}
-	}
+		/* The internal PHY has its link interrupts routed to the
+		 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
+		 * that prevents the signaling of link UP interrupts when
+		 * the link operates at 10Mbps, so fallback to polling for
+		 * those versions of GENET.
+		 */
+		if (priv->internal_phy && !GENET_IS_V5(priv))
+			phydev->irq = PHY_IGNORE_INTERRUPT;
 
-	/* Configure port multiplexer based on what the probed PHY device since
-	 * reading the 'max-speed' property determines the maximum supported
-	 * PHY speed which is needed for bcmgenet_mii_config() to configure
-	 * things appropriately.
-	 */
-	ret = bcmgenet_mii_config(dev, true);
-	if (ret) {
-		phy_disconnect(dev->phydev);
-		return ret;
+		dev_info(kdev, "configuring instance for %s\n", phy_name);
 	}
 
-	phydev->advertising = phydev->supported;
-
-	/* The internal PHY has its link interrupts routed to the
-	 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
-	 * that prevents the signaling of link UP interrupts when
-	 * the link operates at 10Mbps, so fallback to polling for
-	 * those versions of GENET.
-	 */
-	if (priv->internal_phy && !GENET_IS_V5(priv))
-		dev->phydev->irq = PHY_IGNORE_INTERRUPT;
-
 	return 0;
 }
 
-- 
2.20.1


From 9f7f7d8ca963bf5db2290306c7fdfdc6dd390848 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:32 -0700
Subject: [PATCH 32/36] net: bcmgenet: reset 40nm EPHY on energy detect

[ Upstream commit 25382b991d252aed961cd434176240f9de6bb15f ]

The EPHY integrated into the 40nm Set-Top Box devices can falsely
detect energy when connected to a disabled peer interface. When the
peer interface is enabled the EPHY will detect and report the link
as active, but on occasion may get into a state where it is not
able to exchange data with the connected GENET MAC. This issue has
not been observed when the link parameters are auto-negotiated;
however, it has been observed with a manually configured link.

It has been empirically determined that issuing a soft reset to the
EPHY when energy is detected prevents it from getting into this bad
state.

Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 13ae7676d7f7..b030871121c6 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2020,6 +2020,8 @@ static void bcmgenet_link_intr_enable(struct bcmgenet_priv *priv)
 	 */
 	if (priv->internal_phy) {
 		int0_enable |= UMAC_IRQ_LINK_EVENT;
+		if (GENET_IS_V1(priv) || GENET_IS_V2(priv) || GENET_IS_V3(priv))
+			int0_enable |= UMAC_IRQ_PHY_DET_R;
 	} else if (priv->ext_phy) {
 		int0_enable |= UMAC_IRQ_LINK_EVENT;
 	} else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) {
@@ -2618,9 +2620,14 @@ static void bcmgenet_irq_task(struct work_struct *work)
 	priv->irq0_stat = 0;
 	spin_unlock_irq(&priv->lock);
 
+	if (status & UMAC_IRQ_PHY_DET_R &&
+	    priv->dev->phydev->autoneg != AUTONEG_ENABLE)
+		phy_init_hw(priv->dev->phydev);
+
 	/* Link UP/DOWN event */
 	if (status & UMAC_IRQ_LINK_EVENT)
 		phy_mac_interrupt(priv->dev->phydev);
+
 }
 
 /* bcmgenet_isr1: handle Rx and Tx priority queues */
@@ -2715,7 +2722,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 	}
 
 	/* all other interested interrupts handled in bottom half */
-	status &= UMAC_IRQ_LINK_EVENT;
+	status &= (UMAC_IRQ_LINK_EVENT | UMAC_IRQ_PHY_DET_R);
 	if (status) {
 		/* Save irq status for bottom-half processing. */
 		spin_lock_irqsave(&priv->lock, flags);
-- 
2.20.1


From 633f3416a0997588101a2f968d26a4b18c29fff4 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 17 Oct 2019 21:29:26 +0200
Subject: [PATCH 33/36] net: usb: lan78xx: Connect PHY before registering MAC

[ Upstream commit 38b4fe320119859c11b1dc06f6b4987a16344fa1 ]

As soon as the netdev is registers, the kernel can start using the
interface. If the driver connects the MAC to the PHY after the netdev
is registered, there is a race condition where the interface can be
opened without having the PHY connected.

Change the order to close this race condition.

Fixes: 92571a1aae40 ("lan78xx: Connect phy early")
Reported-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 64b6a769e781..3a172fcb06fe 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3799,10 +3799,14 @@ static int lan78xx_probe(struct usb_interface *intf,
 	/* driver requires remote-wakeup capability during autosuspend. */
 	intf->needs_remote_wakeup = 1;
 
+	ret = lan78xx_phy_init(dev);
+	if (ret < 0)
+		goto out4;
+
 	ret = register_netdev(netdev);
 	if (ret != 0) {
 		netif_err(dev, probe, netdev, "couldn't register the device\n");
-		goto out4;
+		goto out5;
 	}
 
 	usb_set_intfdata(intf, dev);
@@ -3815,14 +3819,10 @@ static int lan78xx_probe(struct usb_interface *intf,
 	pm_runtime_set_autosuspend_delay(&udev->dev,
 					 DEFAULT_AUTOSUSPEND_DELAY);
 
-	ret = lan78xx_phy_init(dev);
-	if (ret < 0)
-		goto out5;
-
 	return 0;
 
 out5:
-	unregister_netdev(netdev);
+	phy_disconnect(netdev->phydev);
 out4:
 	usb_free_urb(dev->urb_intr);
 out3:
-- 
2.20.1


From 10934c311db8176f371b64133df8f80bbcb899fa Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Fri, 18 Oct 2019 17:02:46 -0400
Subject: [PATCH 34/36] net: dsa: fix switch tree list

[ Upstream commit 50c7d2ba9de20f60a2d527ad6928209ef67e4cdd ]

If there are multiple switch trees on the device, only the last one
will be listed, because the arguments of list_add_tail are swapped.

Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation")
Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index a1917025e155..46ae4dee522a 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -49,7 +49,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
 	dst->index = index;
 
 	INIT_LIST_HEAD(&dst->list);
-	list_add_tail(&dsa_tree_list, &dst->list);
+	list_add_tail(&dst->list, &dsa_tree_list);
 
 	kref_init(&dst->refcount);
 
-- 
2.20.1


From f8e0e0bbd371bffb826bff8c53e5df3a583868da Mon Sep 17 00:00:00 2001
From: Kazutoshi Noguchi <noguchi.kazutosi@gmail.com>
Date: Mon, 21 Oct 2019 00:03:07 +0900
Subject: [PATCH 35/36] r8152: add device id for Lenovo ThinkPad USB-C Dock Gen
 2

[ Upstream commit b3060531979422d5bb18d80226f978910284dc70 ]

This device is sold as 'ThinkPad USB-C Dock Gen 2 (40AS)'.
Chipset is RTL8153 and works with r8152.
Without this, the generic cdc_ether grabs the device, and the device jam
connected networks up when the machine suspends.

Signed-off-by: Kazutoshi Noguchi <noguchi.kazutosi@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ether.c | 7 +++++++
 drivers/net/usb/r8152.c     | 1 +
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 85fba64c3fcf..c3cf9ae6d1df 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -800,6 +800,13 @@ static const struct usb_device_id	products[] = {
 	.driver_info = 0,
 },
 
+/* ThinkPad USB-C Dock Gen 2 (based on Realtek RTL8153) */
+{
+	USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0xa387, USB_CLASS_COMM,
+			USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+	.driver_info = 0,
+},
+
 /* NVIDIA Tegra USB 3.0 Ethernet Adapters (based on Realtek RTL8153) */
 {
 	USB_DEVICE_AND_INTERFACE_INFO(NVIDIA_VENDOR_ID, 0x09ff, USB_CLASS_COMM,
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index a291e5f2daef..91d47a714afd 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -5339,6 +5339,7 @@ static const struct usb_device_id rtl8152_table[] = {
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x7205)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x720c)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x7214)},
+	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0xa387)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LINKSYS, 0x0041)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_NVIDIA,  0x09ff)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_TPLINK,  0x0601)},
-- 
2.20.1


From ee725bd77baafe1c5029d915194ed01f0ddbce51 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 22 Oct 2019 07:57:46 -0700
Subject: [PATCH 36/36] net/flow_dissector: switch to siphash

[ Upstream commit 55667441c84fa5e0911a0aac44fb059c15ba6da2 ]

UDP IPv6 packets auto flowlabels are using a 32bit secret
(static u32 hashrnd in net/core/flow_dissector.c) and
apply jhash() over fields known by the receivers.

Attackers can easily infer the 32bit secret and use this information
to identify a device and/or user, since this 32bit secret is only
set at boot time.

Really, using jhash() to generate cookies sent on the wire
is a serious security concern.

Trying to change the rol32(hash, 16) in ip6_make_flowlabel() would be
a dead end. Trying to periodically change the secret (like in sch_sfq.c)
could change paths taken in the network for long lived flows.

Let's switch to siphash, as we did in commit df453700e8d8
("inet: switch IP ID generator to siphash")

Using a cryptographically strong pseudo random function will solve this
privacy issue and more generally remove other weak points in the stack.

Packet schedulers using skb_get_hash_perturb() benefit from this change.

Fixes: b56774163f99 ("ipv6: Enable auto flow labels by default")
Fixes: 42240901f7c4 ("ipv6: Implement different admin modes for automatic flow labels")
Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
Fixes: cb1ce2ef387b ("ipv6: Implement automatic flow label generation on transmit")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Jonathan Berger <jonathann1@walla.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       |  3 ++-
 include/net/flow_dissector.h |  3 ++-
 include/net/fq.h             |  2 +-
 include/net/fq_impl.h        |  4 ++--
 net/core/flow_dissector.c    | 38 +++++++++++++++---------------------
 net/sched/sch_hhf.c          |  8 ++++----
 net/sched/sch_sfb.c          | 13 ++++++------
 net/sched/sch_sfq.c          | 14 +++++++------
 8 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a683ea409fe..80c3da1aa8b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1251,7 +1251,8 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6
 	return skb->hash;
 }
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+			   const siphash_key_t *perturb);
 
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 {
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 6a4586dcdede..4618cbbe3632 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/in6.h>
+#include <linux/siphash.h>
 #include <uapi/linux/if_ether.h>
 
 /**
@@ -252,7 +253,7 @@ struct flow_keys_basic {
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
-	struct flow_dissector_key_basic basic;
+	struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT);
 	struct flow_dissector_key_tags tags;
 	struct flow_dissector_key_vlan vlan;
 	struct flow_dissector_key_vlan cvlan;
diff --git a/include/net/fq.h b/include/net/fq.h
index ac944a686840..c51be50e1349 100644
--- a/include/net/fq.h
+++ b/include/net/fq.h
@@ -70,7 +70,7 @@ struct fq {
 	struct list_head backlogs;
 	spinlock_t lock;
 	u32 flows_cnt;
-	u32 perturbation;
+	siphash_key_t	perturbation;
 	u32 limit;
 	u32 memory_limit;
 	u32 memory_usage;
diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index be7c0fab3478..89a012905ef0 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -118,7 +118,7 @@ static struct fq_flow *fq_flow_classify(struct fq *fq,
 
 	lockdep_assert_held(&fq->lock);
 
-	hash = skb_get_hash_perturb(skb, fq->perturbation);
+	hash = skb_get_hash_perturb(skb, &fq->perturbation);
 	idx = reciprocal_scale(hash, fq->flows_cnt);
 	flow = &fq->flows[idx];
 
@@ -307,7 +307,7 @@ static int fq_init(struct fq *fq, int flows_cnt)
 	INIT_LIST_HEAD(&fq->backlogs);
 	spin_lock_init(&fq->lock);
 	fq->flows_cnt = max_t(u32, flows_cnt, 1);
-	fq->perturbation = prandom_u32();
+	get_random_bytes(&fq->perturbation, sizeof(fq->perturbation));
 	fq->quantum = 300;
 	fq->limit = 8192;
 	fq->memory_limit = 16 << 20; /* 16 MBytes */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 415b95f76b66..4362ffe9b5ef 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1077,30 +1077,21 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(__skb_flow_dissect);
 
-static u32 hashrnd __read_mostly;
+static siphash_key_t hashrnd __read_mostly;
 static __always_inline void __flow_hash_secret_init(void)
 {
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
-					     u32 keyval)
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
 {
-	return jhash2(words, length, keyval);
-}
-
-static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
-{
-	const void *p = flow;
-
-	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
-	return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
+	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+	return &flow->FLOW_KEYS_HASH_START_FIELD;
 }
 
 static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 {
 	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
-	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
 	BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
 		     sizeof(*flow) - sizeof(flow->addrs));
 
@@ -1115,7 +1106,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 		diff -= sizeof(flow->addrs.tipckey);
 		break;
 	}
-	return (sizeof(*flow) - diff) / sizeof(u32);
+	return sizeof(*flow) - diff;
 }
 
 __be32 flow_get_u32_src(const struct flow_keys *flow)
@@ -1181,14 +1172,15 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
 	}
 }
 
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+					const siphash_key_t *keyval)
 {
 	u32 hash;
 
 	__flow_hash_consistentify(keys);
 
-	hash = __flow_hash_words(flow_keys_hash_start(keys),
-				 flow_keys_hash_length(keys), keyval);
+	hash = siphash(flow_keys_hash_start(keys),
+		       flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
 
@@ -1198,12 +1190,13 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 u32 flow_hash_from_keys(struct flow_keys *keys)
 {
 	__flow_hash_secret_init();
-	return __flow_hash_from_keys(keys, hashrnd);
+	return __flow_hash_from_keys(keys, &hashrnd);
 }
 EXPORT_SYMBOL(flow_hash_from_keys);
 
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
-				  struct flow_keys *keys, u32 keyval)
+				  struct flow_keys *keys,
+				  const siphash_key_t *keyval)
 {
 	skb_flow_dissect_flow_keys(skb, keys,
 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
@@ -1251,7 +1244,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
 			   NULL, 0, 0, 0,
 			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 
-	return __flow_hash_from_keys(&keys, hashrnd);
+	return __flow_hash_from_keys(&keys, &hashrnd);
 }
 EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
 
@@ -1271,13 +1264,14 @@ void __skb_get_hash(struct sk_buff *skb)
 
 	__flow_hash_secret_init();
 
-	hash = ___skb_get_hash(skb, &keys, hashrnd);
+	hash = ___skb_get_hash(skb, &keys, &hashrnd);
 
 	__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+			   const siphash_key_t *perturb)
 {
 	struct flow_keys keys;
 
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index a80fe8aa8527..c53b720674c6 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -4,11 +4,11 @@
  * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
  */
 
-#include <linux/jhash.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
+#include <linux/siphash.h>
 #include <net/pkt_sched.h>
 #include <net/sock.h>
 
@@ -125,7 +125,7 @@ struct wdrr_bucket {
 
 struct hhf_sched_data {
 	struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
-	u32		   perturbation;   /* hash perturbation */
+	siphash_key_t	   perturbation;   /* hash perturbation */
 	u32		   quantum;        /* psched_mtu(qdisc_dev(sch)); */
 	u32		   drop_overlimit; /* number of times max qdisc packet
 					    * limit was hit
@@ -263,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	/* Get hashed flow-id of the skb. */
-	hash = skb_get_hash_perturb(skb, q->perturbation);
+	hash = skb_get_hash_perturb(skb, &q->perturbation);
 
 	/* Check if this packet belongs to an already established HH flow. */
 	flow_pos = hash & HHF_BIT_MASK;
@@ -580,7 +580,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
 
 	sch->limit = 1000;
 	q->quantum = psched_mtu(qdisc_dev(sch));
-	q->perturbation = prandom_u32();
+	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 	INIT_LIST_HEAD(&q->new_buckets);
 	INIT_LIST_HEAD(&q->old_buckets);
 
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..1aa95e761671 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -22,7 +22,7 @@
 #include <linux/errno.h>
 #include <linux/skbuff.h>
 #include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
 #include <net/ip.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
@@ -49,7 +49,7 @@ struct sfb_bucket {
  * (Section 4.4 of SFB reference : moving hash functions)
  */
 struct sfb_bins {
-	u32		  perturbation; /* jhash perturbation */
+	siphash_key_t	  perturbation; /* siphash key */
 	struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
 };
 
@@ -221,7 +221,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
 
 static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
 {
-	q->bins[slot].perturbation = prandom_u32();
+	get_random_bytes(&q->bins[slot].perturbation,
+			 sizeof(q->bins[slot].perturbation));
 }
 
 static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -318,9 +319,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		/* If using external classifiers, get result and record it. */
 		if (!sfb_classify(skb, fl, &ret, &salt))
 			goto other_drop;
-		sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+		sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
 	} else {
-		sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
+		sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
 	}
 
 
@@ -356,7 +357,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		/* Inelastic flow */
 		if (q->double_buffering) {
 			sfbhash = skb_get_hash_perturb(skb,
-			    q->bins[slot].perturbation);
+			    &q->bins[slot].perturbation);
 			if (!sfbhash)
 				sfbhash = 1;
 			sfb_skb_cb(skb)->hashes[slot] = sfbhash;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 650f21463853..d483d6ba59b7 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -18,7 +18,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/skbuff.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <net/netlink.h>
@@ -121,7 +121,7 @@ struct sfq_sched_data {
 	u8		headdrop;
 	u8		maxdepth;	/* limit of packets per flow */
 
-	u32		perturbation;
+	siphash_key_t 	perturbation;
 	u8		cur_depth;	/* depth of longest slot */
 	u8		flags;
 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
@@ -161,7 +161,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 			     const struct sk_buff *skb)
 {
-	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
+	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
 }
 
 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -611,9 +611,11 @@ static void sfq_perturbation(struct timer_list *t)
 	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
 	struct Qdisc *sch = q->sch;
 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+	siphash_key_t nkey;
 
+	get_random_bytes(&nkey, sizeof(nkey));
 	spin_lock(root_lock);
-	q->perturbation = prandom_u32();
+	q->perturbation = nkey;
 	if (!q->filter_list && q->tail)
 		sfq_rehash(sch);
 	spin_unlock(root_lock);
@@ -692,7 +694,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	del_timer(&q->perturb_timer);
 	if (q->perturb_period) {
 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
-		q->perturbation = prandom_u32();
+		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 	}
 	sch_tree_unlock(sch);
 	kfree(p);
@@ -749,7 +751,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->quantum = psched_mtu(qdisc_dev(sch));
 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 	q->perturb_period = 0;
-	q->perturbation = prandom_u32();
+	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 
 	if (opt) {
 		int err = sfq_change(sch, opt);
-- 
2.20.1


[-- Attachment #3: net_53.mbox --]
[-- Type: Application/Octet-Stream, Size: 173040 bytes --]

From 040fa277f932fc2692b062ef95662308bca11cf4 Mon Sep 17 00:00:00 2001
From: Vishal Kulkarni <vishal@chelsio.com>
Date: Wed, 30 Oct 2019 20:17:57 +0530
Subject: [PATCH 01/55] cxgb4: fix panic when attaching to ULD fail

[ Upstream commit fc89cc358fb64e2429aeae0f37906126636507ec ]

Release resources when attaching to ULD fail. Otherwise, data
mismatch is seen between LLD and ULD later on, which lead to
kernel panic when accessing resources that should not even
exist in the first place.

Fixes: 94cdb8bb993a ("cxgb4: Add support for dynamic allocation of resources for ULD")
Signed-off-by: Shahjada Abul Husain <shahjada@chelsio.com>
Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/chelsio/cxgb4/cxgb4_uld.c    | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index a4dead4ab0ed..86b528d8364c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -695,10 +695,10 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld)
 	lld->write_cmpl_support = adap->params.write_cmpl_support;
 }
 
-static void uld_attach(struct adapter *adap, unsigned int uld)
+static int uld_attach(struct adapter *adap, unsigned int uld)
 {
-	void *handle;
 	struct cxgb4_lld_info lli;
+	void *handle;
 
 	uld_init(adap, &lli);
 	uld_queue_init(adap, uld, &lli);
@@ -708,7 +708,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 		dev_warn(adap->pdev_dev,
 			 "could not attach to the %s driver, error %ld\n",
 			 adap->uld[uld].name, PTR_ERR(handle));
-		return;
+		return PTR_ERR(handle);
 	}
 
 	adap->uld[uld].handle = handle;
@@ -716,22 +716,22 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 
 	if (adap->flags & CXGB4_FULL_INIT_DONE)
 		adap->uld[uld].state_change(handle, CXGB4_STATE_UP);
+
+	return 0;
 }
 
-/**
- *	cxgb4_register_uld - register an upper-layer driver
- *	@type: the ULD type
- *	@p: the ULD methods
+/* cxgb4_register_uld - register an upper-layer driver
+ * @type: the ULD type
+ * @p: the ULD methods
  *
- *	Registers an upper-layer driver with this driver and notifies the ULD
- *	about any presently available devices that support its type.  Returns
- *	%-EBUSY if a ULD of the same type is already registered.
+ * Registers an upper-layer driver with this driver and notifies the ULD
+ * about any presently available devices that support its type.
  */
 void cxgb4_register_uld(enum cxgb4_uld type,
 			const struct cxgb4_uld_info *p)
 {
-	int ret = 0;
 	struct adapter *adap;
+	int ret = 0;
 
 	if (type >= CXGB4_ULD_MAX)
 		return;
@@ -763,8 +763,12 @@ void cxgb4_register_uld(enum cxgb4_uld type,
 		if (ret)
 			goto free_irq;
 		adap->uld[type] = *p;
-		uld_attach(adap, type);
+		ret = uld_attach(adap, type);
+		if (ret)
+			goto free_txq;
 		continue;
+free_txq:
+		release_sge_txq_uld(adap, type);
 free_irq:
 		if (adap->flags & CXGB4_FULL_INIT_DONE)
 			quiesce_rx_uld(adap, type);
-- 
2.20.1


From 9ac81ca713fcabe8b9c3438ea99ce9d8ccf86687 Mon Sep 17 00:00:00 2001
From: Raju Rangoju <rajur@chelsio.com>
Date: Wed, 23 Oct 2019 23:03:55 +0530
Subject: [PATCH 02/55] cxgb4: request the TX CIDX updates to status page

[ Upstream commit 7c3bebc3d8688b84795c11848c314a2fbfe045e0 ]

For adapters which support the SGE Doorbell Queue Timer facility,
we configured the Ethernet TX Queues to send CIDX Updates to the
Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
messages to allow us to respond more quickly to the CIDX Updates.
But, this was adding load to PCIe Link RX bandwidth and,
potentially, resulting in higher CPU Interrupt load.

This patch requests the HW to deliver the CIDX updates to the TX
queue status page rather than generating an ingress queue message
(as an interrupt). With this patch, the load on RX bandwidth is
reduced and a substantial improvement in BW is noticed at lower
IO sizes.

Fixes: d429005fdf2c ("cxgb4/cxgb4vf: Add support for SGE doorbell queue timer")
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/sge.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index b3da81e90132..928bfea5457b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -3791,15 +3791,11 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 	 * write the CIDX Updates into the Status Page at the end of the
 	 * TX Queue.
 	 */
-	c.autoequiqe_to_viid = htonl((dbqt
-				      ? FW_EQ_ETH_CMD_AUTOEQUIQE_F
-				      : FW_EQ_ETH_CMD_AUTOEQUEQE_F) |
+	c.autoequiqe_to_viid = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
 				     FW_EQ_ETH_CMD_VIID_V(pi->viid));
 
 	c.fetchszm_to_iqid =
-		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(dbqt
-						 ? HOSTFCMODE_INGRESS_QUEUE_X
-						 : HOSTFCMODE_STATUS_PAGE_X) |
+		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
 		      FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
 
-- 
2.20.1


From 7df344d58f3c2c35be8ab6fabc363ae029447da2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Nov 2019 07:57:55 -0800
Subject: [PATCH 03/55] dccp: do not leak jiffies on the wire

[ Upstream commit 3d1e5039f5f87a8731202ceca08764ee7cb010d3 ]

For some reason I missed the case of DCCP passive
flows in my previous patch.

Fixes: a904a0693c18 ("inet: stop leaking jiffies on the wire")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Thiemo Nagel <tnagel@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b685bc82f8d0..6791eb9d4f16 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -416,7 +416,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 	RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
 	newinet->mc_index  = inet_iif(skb);
 	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
-	newinet->inet_id   = jiffies;
+	newinet->inet_id   = prandom_u32();
 
 	if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
 		goto put_and_exit;
-- 
2.20.1


From 7581c482a916613ce074117bfbcc26146ec16ff9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 28 Oct 2019 23:19:35 +0800
Subject: [PATCH 04/55] erspan: fix the tun_info options_len check for erspan

[ Upstream commit 2eb8d6d2910cfe3dc67dc056f26f3dd9c63d47cd ]

The check for !md doens't really work for ip_tunnel_info_opts(info) which
only does info + 1. Also to avoid out-of-bounds access on info, it should
ensure options_len is not less than erspan_metadata in both erspan_xmit()
and ip6erspan_tunnel_xmit().

Fixes: 1a66a836da ("gre: add collect_md mode to ERSPAN tunnel")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 4 ++--
 net/ipv6/ip6_gre.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 52690bb3e40f..10636fb6093e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -509,9 +509,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	key = &tun_info->key;
 	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 		goto err_free_skb;
-	md = ip_tunnel_info_opts(tun_info);
-	if (!md)
+	if (tun_info->options_len < sizeof(*md))
 		goto err_free_skb;
+	md = ip_tunnel_info_opts(tun_info);
 
 	/* ERSPAN has fixed 8 byte GRE header */
 	version = md->version;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d5779d6a6065..4efc272c6027 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -980,9 +980,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		dsfield = key->tos;
 		if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 			goto tx_err;
-		md = ip_tunnel_info_opts(tun_info);
-		if (!md)
+		if (tun_info->options_len < sizeof(*md))
 			goto tx_err;
+		md = ip_tunnel_info_opts(tun_info);
 
 		tun_id = tunnel_id_to_key32(key->tun_id);
 		if (md->version == 1) {
-- 
2.20.1


From c8f7491b2dfd2981d12509923c19f45935c632c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Nov 2019 10:32:19 -0700
Subject: [PATCH 05/55] inet: stop leaking jiffies on the wire

[ Upstream commit a904a0693c189691eeee64f6c6b188bd7dc244e9 ]

Historically linux tried to stick to RFC 791, 1122, 2003
for IPv4 ID field generation.

RFC 6864 made clear that no matter how hard we try,
we can not ensure unicity of IP ID within maximum
lifetime for all datagrams with a given source
address/destination address/protocol tuple.

Linux uses a per socket inet generator (inet_id), initialized
at connection startup with a XOR of 'jiffies' and other
fields that appear clear on the wire.

Thiemo Nagel pointed that this strategy is a privacy
concern as this provides 16 bits of entropy to fingerprint
devices.

Let's switch to a random starting point, this is just as
good as far as RFC 6864 is concerned and does not leak
anything critical.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Thiemo Nagel <tnagel@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +-
 net/dccp/ipv4.c                         | 2 +-
 net/ipv4/datagram.c                     | 2 +-
 net/ipv4/tcp_ipv4.c                     | 4 ++--
 net/sctp/socket.c                       | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
index 774d991d7cca..aca75237bbcf 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.c
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c
@@ -1297,7 +1297,7 @@ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
 	tp->write_seq = snd_isn;
 	tp->snd_nxt = snd_isn;
 	tp->snd_una = snd_isn;
-	inet_sk(sk)->inet_id = tp->write_seq ^ jiffies;
+	inet_sk(sk)->inet_id = prandom_u32();
 	assign_rxopt(sk, opt);
 
 	if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10))
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6791eb9d4f16..6b8a602849dd 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -117,7 +117,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 						    inet->inet_daddr,
 						    inet->inet_sport,
 						    inet->inet_dport);
-	inet->inet_id = dp->dccps_iss ^ jiffies;
+	inet->inet_id = prandom_u32();
 
 	err = dccp_connect(sk);
 	rt = NULL;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 9a0fe0c2fa02..4a8550c49202 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	reuseport_has_conns(sk, true);
 	sk->sk_state = TCP_ESTABLISHED;
 	sk_set_txhash(sk);
-	inet->inet_id = jiffies;
+	inet->inet_id = prandom_u32();
 
 	sk_dst_set(sk, &rt->dst);
 	err = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d57641cb3477..54320ef35405 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -300,7 +300,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 						 inet->inet_daddr);
 	}
 
-	inet->inet_id = tp->write_seq ^ jiffies;
+	inet->inet_id = prandom_u32();
 
 	if (tcp_fastopen_defer_connect(sk, &err))
 		return err;
@@ -1443,7 +1443,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (inet_opt)
 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
-	newinet->inet_id = newtp->write_seq ^ jiffies;
+	newinet->inet_id = prandom_u32();
 
 	if (!dst) {
 		dst = inet_csk_route_child_sock(sk, newsk, req);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 8fd7b0e6ce9f..b8f9bafb180c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9159,7 +9159,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
 	newinet->inet_dport = htons(asoc->peer.port);
 	newinet->pmtudisc = inet->pmtudisc;
-	newinet->inet_id = asoc->next_tsn ^ jiffies;
+	newinet->inet_id = prandom_u32();
 
 	newinet->uc_ttl = inet->uc_ttl;
 	newinet->mc_loop = 1;
-- 
2.20.1


From 6e6066775d5946898e2208f8cc9d84c751c0573a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 30 Oct 2019 13:00:04 -0700
Subject: [PATCH 06/55] net: annotate accesses to sk->sk_incoming_cpu

[ Upstream commit 7170a977743b72cf3eb46ef6ef89885dc7ad3621 ]

This socket field can be read and written by concurrent cpus.

Use READ_ONCE() and WRITE_ONCE() annotations to document this,
and avoid some compiler 'optimizations'.

KCSAN reported :

BUG: KCSAN: data-race in tcp_v4_rcv / tcp_v4_rcv

write to 0xffff88812220763c of 4 bytes by interrupt on cpu 0:
 sk_incoming_cpu_update include/net/sock.h:953 [inline]
 tcp_v4_rcv+0x1b3c/0x1bb0 net/ipv4/tcp_ipv4.c:1934
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082
 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337
 do_softirq kernel/softirq.c:329 [inline]
 __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189

read to 0xffff88812220763c of 4 bytes by interrupt on cpu 1:
 sk_incoming_cpu_update include/net/sock.h:952 [inline]
 tcp_v4_rcv+0x181a/0x1bb0 net/ipv4/tcp_ipv4.c:1934
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603
 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h          | 4 ++--
 net/core/sock.c             | 4 ++--
 net/ipv4/inet_hashtables.c  | 2 +-
 net/ipv4/udp.c              | 2 +-
 net/ipv6/inet6_hashtables.c | 2 +-
 net/ipv6/udp.c              | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2c53f1a1d905..0a2fd37d2a92 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -949,8 +949,8 @@ static inline void sk_incoming_cpu_update(struct sock *sk)
 {
 	int cpu = raw_smp_processor_id();
 
-	if (unlikely(sk->sk_incoming_cpu != cpu))
-		sk->sk_incoming_cpu = cpu;
+	if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu))
+		WRITE_ONCE(sk->sk_incoming_cpu, cpu);
 }
 
 static inline void sock_rps_record_flow_hash(__u32 hash)
diff --git a/net/core/sock.c b/net/core/sock.c
index 3aa93af51d48..71d047962390 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1125,7 +1125,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		break;
 		}
 	case SO_INCOMING_CPU:
-		sk->sk_incoming_cpu = val;
+		WRITE_ONCE(sk->sk_incoming_cpu, val);
 		break;
 
 	case SO_CNX_ADVICE:
@@ -1474,7 +1474,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_INCOMING_CPU:
-		v.val = sk->sk_incoming_cpu;
+		v.val = READ_ONCE(sk->sk_incoming_cpu);
 		break;
 
 	case SO_MEMINFO:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 97824864e40d..83fb00153018 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 
 		score = sk->sk_family == PF_INET ? 2 : 1;
-		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 			score++;
 	}
 	return score;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5e5d0575a43c..05ecf7f2e647 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		return -1;
 	score += 4;
 
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 		score++;
 	return score;
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index cf60fae9533b..fbe9d4295eac 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -105,7 +105,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 
 		score = 1;
-		if (sk->sk_incoming_cpu == raw_smp_processor_id())
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 			score++;
 	}
 	return score;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0454a8a3b39c..bea3bdad0369 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -135,7 +135,7 @@ static int compute_score(struct sock *sk, struct net *net,
 		return -1;
 	score++;
 
-	if (sk->sk_incoming_cpu == raw_smp_processor_id())
+	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 		score++;
 
 	return score;
-- 
2.20.1


From 70eb025d523ae8d642eaa283a11b946507f99be7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Oct 2019 10:54:44 -0700
Subject: [PATCH 07/55] net: annotate lockless accesses to sk->sk_napi_id

[ Upstream commit ee8d153d46a3b98c064ee15c0c0a3bbf1450e5a1 ]

We already annotated most accesses to sk->sk_napi_id

We missed sk_mark_napi_id() and sk_mark_napi_id_once()
which might be called without socket lock held in UDP stack.

KCSAN reported :
BUG: KCSAN: data-race in udpv6_queue_rcv_one_skb / udpv6_queue_rcv_one_skb

write to 0xffff888121c6d108 of 4 bytes by interrupt on cpu 0:
 sk_mark_napi_id include/net/busy_poll.h:125 [inline]
 __udpv6_queue_rcv_skb net/ipv6/udp.c:571 [inline]
 udpv6_queue_rcv_one_skb+0x70c/0xb40 net/ipv6/udp.c:672
 udpv6_queue_rcv_skb+0xb5/0x400 net/ipv6/udp.c:689
 udp6_unicast_rcv_skb.isra.0+0xd7/0x180 net/ipv6/udp.c:832
 __udp6_lib_rcv+0x69c/0x1770 net/ipv6/udp.c:913
 udpv6_rcv+0x2b/0x40 net/ipv6/udp.c:1015
 ip6_protocol_deliver_rcu+0x22a/0xbe0 net/ipv6/ip6_input.c:409
 ip6_input_finish+0x30/0x50 net/ipv6/ip6_input.c:450
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip6_input+0x177/0x190 net/ipv6/ip6_input.c:459
 dst_input include/net/dst.h:442 [inline]
 ip6_rcv_finish+0x110/0x140 net/ipv6/ip6_input.c:76
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ipv6_rcv+0x1a1/0x1b0 net/ipv6/ip6_input.c:284
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955
 napi_poll net/core/dev.c:6392 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6460

write to 0xffff888121c6d108 of 4 bytes by interrupt on cpu 1:
 sk_mark_napi_id include/net/busy_poll.h:125 [inline]
 __udpv6_queue_rcv_skb net/ipv6/udp.c:571 [inline]
 udpv6_queue_rcv_one_skb+0x70c/0xb40 net/ipv6/udp.c:672
 udpv6_queue_rcv_skb+0xb5/0x400 net/ipv6/udp.c:689
 udp6_unicast_rcv_skb.isra.0+0xd7/0x180 net/ipv6/udp.c:832
 __udp6_lib_rcv+0x69c/0x1770 net/ipv6/udp.c:913
 udpv6_rcv+0x2b/0x40 net/ipv6/udp.c:1015
 ip6_protocol_deliver_rcu+0x22a/0xbe0 net/ipv6/ip6_input.c:409
 ip6_input_finish+0x30/0x50 net/ipv6/ip6_input.c:450
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip6_input+0x177/0x190 net/ipv6/ip6_input.c:459
 dst_input include/net/dst.h:442 [inline]
 ip6_rcv_finish+0x110/0x140 net/ipv6/ip6_input.c:76
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ipv6_rcv+0x1a1/0x1b0 net/ipv6/ip6_input.c:284
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 10890 Comm: syz-executor.0 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: e68b6e50fa35 ("udp: enable busy polling for all sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 127a5c4e3699..86e028388bad 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -122,7 +122,7 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	sk->sk_napi_id = skb->napi_id;
+	WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
 	sk_rx_queue_set(sk, skb);
 }
@@ -132,8 +132,8 @@ static inline void sk_mark_napi_id_once(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	if (!sk->sk_napi_id)
-		sk->sk_napi_id = skb->napi_id;
+	if (!READ_ONCE(sk->sk_napi_id))
+		WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
 #endif
 }
 
-- 
2.20.1


From 5a163734865d61e45158a1cffff3ba1f4a2d3773 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 31 Oct 2019 15:54:05 -0700
Subject: [PATCH 08/55] net: dsa: bcm_sf2: Fix IMP setup for port different
 than 8

[ Upstream commit 5fc0f21246e50afdf318b5a3a941f7f4f57b8947 ]

Since it became possible for the DSA core to use a CPU port different
than 8, our bcm_sf2_imp_setup() function was broken because it assumes
that registers are applicable to port 8. In particular, the port's MAC
is going to stay disabled, so make sure we clear the RX_DIS and TX_DIS
bits if we are not configured for port 8.

Fixes: 9f91484f6fcc ("net: dsa: make "label" property optional for dsa2")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 28c963a21dac..9f05bf714ba2 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -37,22 +37,11 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 	unsigned int i;
 	u32 reg, offset;
 
-	if (priv->type == BCM7445_DEVICE_ID)
-		offset = CORE_STS_OVERRIDE_IMP;
-	else
-		offset = CORE_STS_OVERRIDE_IMP2;
-
 	/* Enable the port memories */
 	reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL);
 	reg &= ~P_TXQ_PSM_VDD(port);
 	core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL);
 
-	/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
-	reg = core_readl(priv, CORE_IMP_CTL);
-	reg |= (RX_BCST_EN | RX_MCST_EN | RX_UCST_EN);
-	reg &= ~(RX_DIS | TX_DIS);
-	core_writel(priv, reg, CORE_IMP_CTL);
-
 	/* Enable forwarding */
 	core_writel(priv, SW_FWDG_EN, CORE_SWMODE);
 
@@ -71,10 +60,27 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 
 	b53_brcm_hdr_setup(ds, port);
 
-	/* Force link status for IMP port */
-	reg = core_readl(priv, offset);
-	reg |= (MII_SW_OR | LINK_STS);
-	core_writel(priv, reg, offset);
+	if (port == 8) {
+		if (priv->type == BCM7445_DEVICE_ID)
+			offset = CORE_STS_OVERRIDE_IMP;
+		else
+			offset = CORE_STS_OVERRIDE_IMP2;
+
+		/* Force link status for IMP port */
+		reg = core_readl(priv, offset);
+		reg |= (MII_SW_OR | LINK_STS);
+		core_writel(priv, reg, offset);
+
+		/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
+		reg = core_readl(priv, CORE_IMP_CTL);
+		reg |= (RX_BCST_EN | RX_MCST_EN | RX_UCST_EN);
+		reg &= ~(RX_DIS | TX_DIS);
+		core_writel(priv, reg, CORE_IMP_CTL);
+	} else {
+		reg = core_readl(priv, CORE_G_PCTL_PORT(port));
+		reg &= ~(RX_DIS | TX_DIS);
+		core_writel(priv, reg, CORE_G_PCTL_PORT(port));
+	}
 }
 
 static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
-- 
2.20.1


From a13c6a2b127bc7d92f49fe443fb7caab5b80c601 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 25 Oct 2019 13:47:24 +1100
Subject: [PATCH 09/55] net: ethernet: ftgmac100: Fix DMA coherency issue with
 SW checksum

[ Upstream commit 88824e3bf29a2fcacfd9ebbfe03063649f0f3254 ]

We are calling the checksum helper after the dma_map_single()
call to map the packet. This is incorrect as the checksumming
code will touch the packet from the CPU. This means the cache
won't be properly flushes (or the bounce buffering will leave
us with the unmodified packet to DMA).

This moves the calculation of the checksum & vlan tags to
before the DMA mapping.

This also has the side effect of fixing another bug: If the
checksum helper fails, we goto "drop" to drop the packet, which
will not unmap the DMA mapping.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Fixes: 05690d633f30 ("ftgmac100: Upgrade to NETIF_F_HW_CSUM")
Reviewed-by: Vijay Khemka <vijaykhemka@fb.com>
Tested-by: Vijay Khemka <vijaykhemka@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/faraday/ftgmac100.c | 25 ++++++++++++------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 030fed65393e..713dc30f9dbb 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -726,6 +726,18 @@ static netdev_tx_t ftgmac100_hard_start_xmit(struct sk_buff *skb,
 	 */
 	nfrags = skb_shinfo(skb)->nr_frags;
 
+	/* Setup HW checksumming */
+	csum_vlan = 0;
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    !ftgmac100_prep_tx_csum(skb, &csum_vlan))
+		goto drop;
+
+	/* Add VLAN tag */
+	if (skb_vlan_tag_present(skb)) {
+		csum_vlan |= FTGMAC100_TXDES1_INS_VLANTAG;
+		csum_vlan |= skb_vlan_tag_get(skb) & 0xffff;
+	}
+
 	/* Get header len */
 	len = skb_headlen(skb);
 
@@ -752,19 +764,6 @@ static netdev_tx_t ftgmac100_hard_start_xmit(struct sk_buff *skb,
 	if (nfrags == 0)
 		f_ctl_stat |= FTGMAC100_TXDES0_LTS;
 	txdes->txdes3 = cpu_to_le32(map);
-
-	/* Setup HW checksumming */
-	csum_vlan = 0;
-	if (skb->ip_summed == CHECKSUM_PARTIAL &&
-	    !ftgmac100_prep_tx_csum(skb, &csum_vlan))
-		goto drop;
-
-	/* Add VLAN tag */
-	if (skb_vlan_tag_present(skb)) {
-		csum_vlan |= FTGMAC100_TXDES1_INS_VLANTAG;
-		csum_vlan |= skb_vlan_tag_get(skb) & 0xffff;
-	}
-
 	txdes->txdes1 = cpu_to_le32(csum_vlan);
 
 	/* Next descriptor */
-- 
2.20.1


From 70ca79c33c7ebe5285b6ba53f70add4f310b8083 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Oct 2019 13:50:27 -0700
Subject: [PATCH 10/55] net: fix sk_page_frag() recursion from memory reclaim

[ Upstream commit 20eb4f29b60286e0d6dc01d9c260b4bd383c58fb ]

sk_page_frag() optimizes skb_frag allocations by using per-task
skb_frag cache when it knows it's the only user.  The condition is
determined by seeing whether the socket allocation mask allows
blocking - if the allocation may block, it obviously owns the task's
context and ergo exclusively owns current->task_frag.

Unfortunately, this misses recursion through memory reclaim path.
Please take a look at the following backtrace.

 [2] RIP: 0010:tcp_sendmsg_locked+0xccf/0xe10
     ...
     tcp_sendmsg+0x27/0x40
     sock_sendmsg+0x30/0x40
     sock_xmit.isra.24+0xa1/0x170 [nbd]
     nbd_send_cmd+0x1d2/0x690 [nbd]
     nbd_queue_rq+0x1b5/0x3b0 [nbd]
     __blk_mq_try_issue_directly+0x108/0x1b0
     blk_mq_request_issue_directly+0xbd/0xe0
     blk_mq_try_issue_list_directly+0x41/0xb0
     blk_mq_sched_insert_requests+0xa2/0xe0
     blk_mq_flush_plug_list+0x205/0x2a0
     blk_flush_plug_list+0xc3/0xf0
 [1] blk_finish_plug+0x21/0x2e
     _xfs_buf_ioapply+0x313/0x460
     __xfs_buf_submit+0x67/0x220
     xfs_buf_read_map+0x113/0x1a0
     xfs_trans_read_buf_map+0xbf/0x330
     xfs_btree_read_buf_block.constprop.42+0x95/0xd0
     xfs_btree_lookup_get_block+0x95/0x170
     xfs_btree_lookup+0xcc/0x470
     xfs_bmap_del_extent_real+0x254/0x9a0
     __xfs_bunmapi+0x45c/0xab0
     xfs_bunmapi+0x15/0x30
     xfs_itruncate_extents_flags+0xca/0x250
     xfs_free_eofblocks+0x181/0x1e0
     xfs_fs_destroy_inode+0xa8/0x1b0
     destroy_inode+0x38/0x70
     dispose_list+0x35/0x50
     prune_icache_sb+0x52/0x70
     super_cache_scan+0x120/0x1a0
     do_shrink_slab+0x120/0x290
     shrink_slab+0x216/0x2b0
     shrink_node+0x1b6/0x4a0
     do_try_to_free_pages+0xc6/0x370
     try_to_free_mem_cgroup_pages+0xe3/0x1e0
     try_charge+0x29e/0x790
     mem_cgroup_charge_skmem+0x6a/0x100
     __sk_mem_raise_allocated+0x18e/0x390
     __sk_mem_schedule+0x2a/0x40
 [0] tcp_sendmsg_locked+0x8eb/0xe10
     tcp_sendmsg+0x27/0x40
     sock_sendmsg+0x30/0x40
     ___sys_sendmsg+0x26d/0x2b0
     __sys_sendmsg+0x57/0xa0
     do_syscall_64+0x42/0x100
     entry_SYSCALL_64_after_hwframe+0x44/0xa9

In [0], tcp_send_msg_locked() was using current->page_frag when it
called sk_wmem_schedule().  It already calculated how many bytes can
be fit into current->page_frag.  Due to memory pressure,
sk_wmem_schedule() called into memory reclaim path which called into
xfs and then IO issue path.  Because the filesystem in question is
backed by nbd, the control goes back into the tcp layer - back into
tcp_sendmsg_locked().

nbd sets sk_allocation to (GFP_NOIO | __GFP_MEMALLOC) which makes
sense - it's in the process of freeing memory and wants to be able to,
e.g., drop clean pages to make forward progress.  However, this
confused sk_page_frag() called from [2].  Because it only tests
whether the allocation allows blocking which it does, it now thinks
current->page_frag can be used again although it already was being
used in [0].

After [2] used current->page_frag, the offset would be increased by
the used amount.  When the control returns to [0],
current->page_frag's offset is increased and the previously calculated
number of bytes now may overrun the end of allocated memory leading to
silent memory corruptions.

Fix it by adding gfpflags_normal_context() which tests sleepable &&
!reclaim and use it to determine whether to use current->task_frag.

v2: Eric didn't like gfp flags being tested twice.  Introduce a new
    helper gfpflags_normal_context() and combine the two tests.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/gfp.h | 23 +++++++++++++++++++++++
 include/net/sock.h  | 11 ++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f33881688f42..ff1c96b8ae92 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 	return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
 }
 
+/**
+ * gfpflags_normal_context - is gfp_flags a normal sleepable context?
+ * @gfp_flags: gfp_flags to test
+ *
+ * Test whether @gfp_flags indicates that the allocation is from the
+ * %current context and allowed to sleep.
+ *
+ * An allocation being allowed to block doesn't mean it owns the %current
+ * context.  When direct reclaim path tries to allocate memory, the
+ * allocation context is nested inside whatever %current was doing at the
+ * time of the original allocation.  The nested allocation may be allowed
+ * to block but modifying anything %current owns can corrupt the outer
+ * context's expectations.
+ *
+ * %true result from this function indicates that the allocation context
+ * can sleep and use anything that's associated with %current.
+ */
+static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
+{
+	return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) ==
+		__GFP_DIRECT_RECLAIM;
+}
+
 #ifdef CONFIG_HIGHMEM
 #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
 #else
diff --git a/include/net/sock.h b/include/net/sock.h
index 0a2fd37d2a92..b03f96370f8e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2233,12 +2233,17 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
  * sk_page_frag - return an appropriate page_frag
  * @sk: socket
  *
- * If socket allocation mode allows current thread to sleep, it means its
- * safe to use the per task page_frag instead of the per socket one.
+ * Use the per task page_frag instead of the per socket one for
+ * optimization when we know that we're in the normal context and owns
+ * everything that's associated with %current.
+ *
+ * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
+ * inside other socket operations and end up recursing into sk_page_frag()
+ * while it's already in use.
  */
 static inline struct page_frag *sk_page_frag(struct sock *sk)
 {
-	if (gfpflags_allow_blocking(sk->sk_allocation))
+	if (gfpflags_normal_context(sk->sk_allocation))
 		return &current->task_frag;
 
 	return &sk->sk_frag;
-- 
2.20.1


From 213c2579a5a7e0d5497349aedbbfd18b1af3387b Mon Sep 17 00:00:00 2001
From: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
Date: Mon, 28 Oct 2019 13:09:46 +0800
Subject: [PATCH 11/55] net: hisilicon: Fix ping latency when deal with high
 throughput

[ Upstream commit e56bd641ca61beb92b135298d5046905f920b734 ]

This is due to error in over budget processing.
When dealing with high throughput, the used buffers
that exceeds the budget is not cleaned up. In addition,
it takes a lot of cycles to clean up the used buffer,
and then the buffer where the valid data is located can take effect.

Signed-off-by: Jiangfeng Xiao <xiaojiangfeng@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hip04_eth.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index c84167447abe..f51bc0255556 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -237,6 +237,7 @@ struct hip04_priv {
 	dma_addr_t rx_phys[RX_DESC_NUM];
 	unsigned int rx_head;
 	unsigned int rx_buf_size;
+	unsigned int rx_cnt_remaining;
 
 	struct device_node *phy_node;
 	struct phy_device *phy;
@@ -575,7 +576,6 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 	struct hip04_priv *priv = container_of(napi, struct hip04_priv, napi);
 	struct net_device *ndev = priv->ndev;
 	struct net_device_stats *stats = &ndev->stats;
-	unsigned int cnt = hip04_recv_cnt(priv);
 	struct rx_desc *desc;
 	struct sk_buff *skb;
 	unsigned char *buf;
@@ -588,8 +588,8 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 
 	/* clean up tx descriptors */
 	tx_remaining = hip04_tx_reclaim(ndev, false);
-
-	while (cnt && !last) {
+	priv->rx_cnt_remaining += hip04_recv_cnt(priv);
+	while (priv->rx_cnt_remaining && !last) {
 		buf = priv->rx_buf[priv->rx_head];
 		skb = build_skb(buf, priv->rx_buf_size);
 		if (unlikely(!skb)) {
@@ -635,11 +635,13 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 		hip04_set_recv_desc(priv, phys);
 
 		priv->rx_head = RX_NEXT(priv->rx_head);
-		if (rx >= budget)
+		if (rx >= budget) {
+			--priv->rx_cnt_remaining;
 			goto done;
+		}
 
-		if (--cnt == 0)
-			cnt = hip04_recv_cnt(priv);
+		if (--priv->rx_cnt_remaining == 0)
+			priv->rx_cnt_remaining += hip04_recv_cnt(priv);
 	}
 
 	if (!(priv->reg_inten & RCV_INT)) {
@@ -724,6 +726,7 @@ static int hip04_mac_open(struct net_device *ndev)
 	int i;
 
 	priv->rx_head = 0;
+	priv->rx_cnt_remaining = 0;
 	priv->tx_head = 0;
 	priv->tx_tail = 0;
 	hip04_reset_ppe(priv);
-- 
2.20.1


From 8f56b9c6922b5755c641652b0af80a763fb189ee Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Sun, 27 Oct 2019 16:39:15 +0200
Subject: [PATCH 12/55] net/mlx4_core: Dynamically set guaranteed amount of
 counters per VF

[ Upstream commit e19868efea0c103f23b4b7e986fd0a703822111f ]

Prior to this patch, the amount of counters guaranteed per VF in the
resource tracker was MLX4_VF_COUNTERS_PER_PORT * MLX4_MAX_PORTS. It was
set regardless if the VF was single or dual port.
This caused several VFs to have no guaranteed counters although the
system could satisfy their request.

The fix is to dynamically guarantee counters, based on each VF
specification.

Fixes: 9de92c60beaa ("net/mlx4_core: Adjust counter grant policy in the resource tracker")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlx4/resource_tracker.c | 42 ++++++++++++-------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 4356f3a58002..1187ef1375e2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -471,12 +471,31 @@ void mlx4_init_quotas(struct mlx4_dev *dev)
 		priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf];
 }
 
-static int get_max_gauranteed_vfs_counter(struct mlx4_dev *dev)
+static int
+mlx4_calc_res_counter_guaranteed(struct mlx4_dev *dev,
+				 struct resource_allocator *res_alloc,
+				 int vf)
 {
-	/* reduce the sink counter */
-	return (dev->caps.max_counters - 1 -
-		(MLX4_PF_COUNTERS_PER_PORT * MLX4_MAX_PORTS))
-		/ MLX4_MAX_PORTS;
+	struct mlx4_active_ports actv_ports;
+	int ports, counters_guaranteed;
+
+	/* For master, only allocate according to the number of phys ports */
+	if (vf == mlx4_master_func_num(dev))
+		return MLX4_PF_COUNTERS_PER_PORT * dev->caps.num_ports;
+
+	/* calculate real number of ports for the VF */
+	actv_ports = mlx4_get_active_ports(dev, vf);
+	ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	counters_guaranteed = ports * MLX4_VF_COUNTERS_PER_PORT;
+
+	/* If we do not have enough counters for this VF, do not
+	 * allocate any for it. '-1' to reduce the sink counter.
+	 */
+	if ((res_alloc->res_reserved + counters_guaranteed) >
+	    (dev->caps.max_counters - 1))
+		return 0;
+
+	return counters_guaranteed;
 }
 
 int mlx4_init_resource_tracker(struct mlx4_dev *dev)
@@ -484,7 +503,6 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int i, j;
 	int t;
-	int max_vfs_guarantee_counter = get_max_gauranteed_vfs_counter(dev);
 
 	priv->mfunc.master.res_tracker.slave_list =
 		kcalloc(dev->num_slaves, sizeof(struct slave_list),
@@ -603,16 +621,8 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 				break;
 			case RES_COUNTER:
 				res_alloc->quota[t] = dev->caps.max_counters;
-				if (t == mlx4_master_func_num(dev))
-					res_alloc->guaranteed[t] =
-						MLX4_PF_COUNTERS_PER_PORT *
-						MLX4_MAX_PORTS;
-				else if (t <= max_vfs_guarantee_counter)
-					res_alloc->guaranteed[t] =
-						MLX4_VF_COUNTERS_PER_PORT *
-						MLX4_MAX_PORTS;
-				else
-					res_alloc->guaranteed[t] = 0;
+				res_alloc->guaranteed[t] =
+					mlx4_calc_res_counter_guaranteed(dev, res_alloc, t);
 				break;
 			default:
 				break;
-- 
2.20.1


From 617a2dade62a6d85a29935da9bd13a7cd24859cc Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Wed, 23 Oct 2019 18:39:04 +0200
Subject: [PATCH 13/55] netns: fix GFP flags in rtnl_net_notifyid()

[ Upstream commit d4e4fdf9e4a27c87edb79b1478955075be141f67 ]

In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.

Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.

In most cases, GFP_KERNEL is fine. The exceptions are:
  * openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
    indirectly call rtnl_net_notifyid() from RCU critical section,

  * rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
    parameter.

Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.

Found by code inspection.

Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h |  2 +-
 net/core/dev.c              |  2 +-
 net/core/net_namespace.c    | 17 +++++++++--------
 net/core/rtnetlink.c        | 14 +++++++-------
 net/openvswitch/datapath.c  | 20 +++++++++++---------
 5 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index ab40d7afdc54..f1f3438ed13a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -333,7 +333,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #define __net_initconst	__initconst
 #endif
 
-int peernet2id_alloc(struct net *net, struct net *peer);
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
 int peernet2id(struct net *net, struct net *peer);
 bool peernet_has_id(struct net *net, struct net *peer);
 struct net *get_net_ns_by_id(struct net *net, int id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ed9df74eb8a..33b278b826b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9411,7 +9411,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 
-	new_nsid = peernet2id_alloc(dev_net(dev), net);
+	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
 	/* If there is an ifindex conflict assign a new one */
 	if (__dev_get_by_index(net, dev->ifindex))
 		new_ifindex = dev_new_index(net);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a0e0d298c991..53ac647b893b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -245,11 +245,11 @@ static int __peernet2id(struct net *net, struct net *peer)
 	return __peernet2id_alloc(net, peer, &no);
 }
 
-static void rtnl_net_notifyid(struct net *net, int cmd, int id);
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp);
 /* This function returns the id of a peer netns. If no id is assigned, one will
  * be allocated and returned.
  */
-int peernet2id_alloc(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
 {
 	bool alloc = false, alive = false;
 	int id;
@@ -268,7 +268,7 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 	id = __peernet2id_alloc(net, peer, &alloc);
 	spin_unlock_bh(&net->nsid_lock);
 	if (alloc && id >= 0)
-		rtnl_net_notifyid(net, RTM_NEWNSID, id);
+		rtnl_net_notifyid(net, RTM_NEWNSID, id, gfp);
 	if (alive)
 		put_net(peer);
 	return id;
@@ -532,7 +532,8 @@ static void unhash_nsid(struct net *net, struct net *last)
 			idr_remove(&tmp->netns_ids, id);
 		spin_unlock_bh(&tmp->nsid_lock);
 		if (id >= 0)
-			rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+			rtnl_net_notifyid(tmp, RTM_DELNSID, id,
+					  GFP_KERNEL);
 		if (tmp == last)
 			break;
 	}
@@ -764,7 +765,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = alloc_netid(net, peer, nsid);
 	spin_unlock_bh(&net->nsid_lock);
 	if (err >= 0) {
-		rtnl_net_notifyid(net, RTM_NEWNSID, err);
+		rtnl_net_notifyid(net, RTM_NEWNSID, err, GFP_KERNEL);
 		err = 0;
 	} else if (err == -ENOSPC && nsid >= 0) {
 		err = -EEXIST;
@@ -1051,7 +1052,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
 	return err < 0 ? err : skb->len;
 }
 
-static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, gfp_t gfp)
 {
 	struct net_fill_args fillargs = {
 		.cmd = cmd,
@@ -1060,7 +1061,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
 	struct sk_buff *msg;
 	int err = -ENOMEM;
 
-	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+	msg = nlmsg_new(rtnl_net_get_size(), gfp);
 	if (!msg)
 		goto out;
 
@@ -1068,7 +1069,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
 	if (err < 0)
 		goto err_out;
 
-	rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+	rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, gfp);
 	return;
 
 err_out:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1ee6460f8275..6b866aff7b21 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1523,7 +1523,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
 
 static int rtnl_fill_link_netnsid(struct sk_buff *skb,
 				  const struct net_device *dev,
-				  struct net *src_net)
+				  struct net *src_net, gfp_t gfp)
 {
 	bool put_iflink = false;
 
@@ -1531,7 +1531,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb,
 		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
 
 		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id_alloc(src_net, link_net);
+			int id = peernet2id_alloc(src_net, link_net, gfp);
 
 			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
 				return -EMSGSIZE;
@@ -1589,7 +1589,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
 			    u32 event, int *new_nsid, int new_ifindex,
-			    int tgt_netnsid)
+			    int tgt_netnsid, gfp_t gfp)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1681,7 +1681,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rtnl_fill_link_netnsid(skb, dev, src_net))
+	if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
 		goto nla_put_failure;
 
 	if (new_nsid &&
@@ -2001,7 +2001,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       nlh->nlmsg_seq, 0, flags,
 					       ext_filter_mask, 0, NULL, 0,
-					       netnsid);
+					       netnsid, GFP_KERNEL);
 
 			if (err < 0) {
 				if (likely(skb->len))
@@ -3359,7 +3359,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
-			       0, NULL, 0, netnsid);
+			       0, NULL, 0, netnsid, GFP_KERNEL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -3471,7 +3471,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 
 	err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
 			       type, 0, 0, change, 0, 0, event,
-			       new_nsid, new_ifindex, -1);
+			       new_nsid, new_ifindex, -1, flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f1e7041a5a60..43aeca12208c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1850,7 +1850,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
 /* Called with ovs_mutex or RCU read lock. */
 static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 				   struct net *net, u32 portid, u32 seq,
-				   u32 flags, u8 cmd)
+				   u32 flags, u8 cmd, gfp_t gfp)
 {
 	struct ovs_header *ovs_header;
 	struct ovs_vport_stats vport_stats;
@@ -1871,7 +1871,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 		goto nla_put_failure;
 
 	if (!net_eq(net, dev_net(vport->dev))) {
-		int id = peernet2id_alloc(net, dev_net(vport->dev));
+		int id = peernet2id_alloc(net, dev_net(vport->dev), gfp);
 
 		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
 			goto nla_put_failure;
@@ -1912,11 +1912,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
 	struct sk_buff *skb;
 	int retval;
 
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd);
+	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
+					 GFP_KERNEL);
 	BUG_ON(retval < 0);
 
 	return skb;
@@ -2058,7 +2059,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_NEW);
+				      OVS_VPORT_CMD_NEW, GFP_KERNEL);
 
 	new_headroom = netdev_get_fwd_headroom(vport->dev);
 
@@ -2119,7 +2120,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_SET);
+				      OVS_VPORT_CMD_SET, GFP_KERNEL);
 	BUG_ON(err < 0);
 
 	ovs_unlock();
@@ -2159,7 +2160,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_DEL);
+				      OVS_VPORT_CMD_DEL, GFP_KERNEL);
 	BUG_ON(err < 0);
 
 	/* the vport deletion may trigger dp headroom update */
@@ -2206,7 +2207,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		goto exit_unlock_free;
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
-				      OVS_VPORT_CMD_GET);
+				      OVS_VPORT_CMD_GET, GFP_ATOMIC);
 	BUG_ON(err < 0);
 	rcu_read_unlock();
 
@@ -2242,7 +2243,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
 						    NLM_F_MULTI,
-						    OVS_VPORT_CMD_GET) < 0)
+						    OVS_VPORT_CMD_GET,
+						    GFP_ATOMIC) < 0)
 				goto out;
 
 			j++;
-- 
2.20.1


From a82d9811743f62a189efbfef2c5558d761069c71 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 29 Oct 2019 13:59:32 +0200
Subject: [PATCH 14/55] net: rtnetlink: fix a typo fbd -> fdb

[ Upstream commit 8b73018fe44521c1cf59d7bac53624c87d3f10e2 ]

A simple typo fix in the nl error message (fbd -> fdb).

CC: David Ahern <dsahern@gmail.com>
Fixes: 8c6e137fbc7f ("rtnetlink: Update rtnl_fdb_dump for strict data checking")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6b866aff7b21..868a768f7300 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3916,7 +3916,7 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
 	ndm = nlmsg_data(nlh);
 	if (ndm->ndm_pad1  || ndm->ndm_pad2  || ndm->ndm_state ||
 	    ndm->ndm_flags || ndm->ndm_type) {
-		NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request");
+		NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
 		return -EINVAL;
 	}
 
-- 
2.20.1


From cf6939873f65208f8f064b889d5f6290a61f7052 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <dwagner@suse.de>
Date: Fri, 25 Oct 2019 10:04:13 +0200
Subject: [PATCH 15/55] net: usb: lan78xx: Disable interrupts before calling
 generic_handle_irq()

[ Upstream commit 0a29ac5bd3a988dc151c8d26910dec2557421f64 ]

lan78xx_status() will run with interrupts enabled due to the change in
ed194d136769 ("usb: core: remove local_irq_save() around ->complete()
handler"). generic_handle_irq() expects to be run with IRQs disabled.

[    4.886203] 000: irq 79 handler irq_default_primary_handler+0x0/0x8 enabled interrupts
[    4.886243] 000: WARNING: CPU: 0 PID: 0 at kernel/irq/handle.c:152 __handle_irq_event_percpu+0x154/0x168
[    4.896294] 000: Modules linked in:
[    4.896301] 000: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.6 #39
[    4.896310] 000: Hardware name: Raspberry Pi 3 Model B+ (DT)
[    4.896315] 000: pstate: 60000005 (nZCv daif -PAN -UAO)
[    4.896321] 000: pc : __handle_irq_event_percpu+0x154/0x168
[    4.896331] 000: lr : __handle_irq_event_percpu+0x154/0x168
[    4.896339] 000: sp : ffff000010003cc0
[    4.896346] 000: x29: ffff000010003cc0 x28: 0000000000000060
[    4.896355] 000: x27: ffff000011021980 x26: ffff00001189c72b
[    4.896364] 000: x25: ffff000011702bc0 x24: ffff800036d6e400
[    4.896373] 000: x23: 000000000000004f x22: ffff000010003d64
[    4.896381] 000: x21: 0000000000000000 x20: 0000000000000002
[    4.896390] 000: x19: ffff8000371c8480 x18: 0000000000000060
[    4.896398] 000: x17: 0000000000000000 x16: 00000000000000eb
[    4.896406] 000: x15: ffff000011712d18 x14: 7265746e69206465
[    4.896414] 000: x13: ffff000010003ba0 x12: ffff000011712df0
[    4.896422] 000: x11: 0000000000000001 x10: ffff000011712e08
[    4.896430] 000: x9 : 0000000000000001 x8 : 000000000003c920
[    4.896437] 000: x7 : ffff0000118cc410 x6 : ffff0000118c7f00
[    4.896445] 000: x5 : 000000000003c920 x4 : 0000000000004510
[    4.896453] 000: x3 : ffff000011712dc8 x2 : 0000000000000000
[    4.896461] 000: x1 : 73a3f67df94c1500 x0 : 0000000000000000
[    4.896466] 000: Call trace:
[    4.896471] 000:  __handle_irq_event_percpu+0x154/0x168
[    4.896481] 000:  handle_irq_event_percpu+0x50/0xb0
[    4.896489] 000:  handle_irq_event+0x40/0x98
[    4.896497] 000:  handle_simple_irq+0xa4/0xf0
[    4.896505] 000:  generic_handle_irq+0x24/0x38
[    4.896513] 000:  intr_complete+0xb0/0xe0
[    4.896525] 000:  __usb_hcd_giveback_urb+0x58/0xd8
[    4.896533] 000:  usb_giveback_urb_bh+0xd0/0x170
[    4.896539] 000:  tasklet_action_common.isra.0+0x9c/0x128
[    4.896549] 000:  tasklet_hi_action+0x24/0x30
[    4.896556] 000:  __do_softirq+0x120/0x23c
[    4.896564] 000:  irq_exit+0xb8/0xd8
[    4.896571] 000:  __handle_domain_irq+0x64/0xb8
[    4.896579] 000:  bcm2836_arm_irqchip_handle_irq+0x60/0xc0
[    4.896586] 000:  el1_irq+0xb8/0x140
[    4.896592] 000:  arch_cpu_idle+0x10/0x18
[    4.896601] 000:  do_idle+0x200/0x280
[    4.896608] 000:  cpu_startup_entry+0x20/0x28
[    4.896615] 000:  rest_init+0xb4/0xc0
[    4.896623] 000:  arch_call_rest_init+0xc/0x14
[    4.896632] 000:  start_kernel+0x454/0x480

Fixes: ed194d136769 ("usb: core: remove local_irq_save() around ->complete() handler")
Cc: Woojung Huh <woojung.huh@microchip.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Stefan Wahren <wahrenst@gmx.net>
Cc: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
Tested-by: Stefan Wahren <wahrenst@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index f033fee225a1..dd9b3f4c4561 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1265,8 +1265,11 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb)
 		netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata);
 		lan78xx_defer_kevent(dev, EVENT_LINK_RESET);
 
-		if (dev->domain_data.phyirq > 0)
+		if (dev->domain_data.phyirq > 0) {
+			local_irq_disable();
 			generic_handle_irq(dev->domain_data.phyirq);
+			local_irq_enable();
+		}
 	} else
 		netdev_warn(dev->net,
 			    "unexpected interrupt: 0x%08x\n", intdata);
-- 
2.20.1


From 55e0629879abf41062d0f55be6fca7f8e639ff3d Mon Sep 17 00:00:00 2001
From: zhanglin <zhang.lin16@zte.com.cn>
Date: Sat, 26 Oct 2019 15:54:16 +0800
Subject: [PATCH 16/55] net: Zeroing the structure ethtool_wolinfo in
 ethtool_get_wol()

[ Upstream commit 5ff223e86f5addbfae26419cbb5d61d98f6fbf7d ]

memset() the structure ethtool_wolinfo that has padded bytes
but the padded bytes have not been zeroed out.

Signed-off-by: zhanglin <zhang.lin16@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6288e69e94fc..563a48c3df36 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1395,11 +1395,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
 
 static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
 {
-	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+	struct ethtool_wolinfo wol;
 
 	if (!dev->ethtool_ops->get_wol)
 		return -EOPNOTSUPP;
 
+	memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+	wol.cmd = ETHTOOL_GWOL;
 	dev->ethtool_ops->get_wol(dev, &wol);
 
 	if (copy_to_user(useraddr, &wol, sizeof(wol)))
-- 
2.20.1


From 1f98d8c1984377f5a668e1cc50721e91ce9c6333 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 31 Oct 2019 16:24:36 -0700
Subject: [PATCH 17/55] selftests: net: reuseport_dualstack: fix uninitalized
 parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit d64479a3e3f9924074ca7b50bd72fa5211dca9c1 ]

This test reports EINVAL for getsockopt(SOL_SOCKET, SO_DOMAIN)
occasionally due to the uninitialized length parameter.
Initialize it to fix this, and also use int for "test_family" to comply
with the API standard.

Fixes: d6a61f80b871 ("soreuseport: test mixed v4/v6 sockets")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Wei Wang <weiwan@google.com>
Cc: Craig Gallek <cgallek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/reuseport_dualstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/reuseport_dualstack.c b/tools/testing/selftests/net/reuseport_dualstack.c
index fe3230c55986..fb7a59ed759e 100644
--- a/tools/testing/selftests/net/reuseport_dualstack.c
+++ b/tools/testing/selftests/net/reuseport_dualstack.c
@@ -129,7 +129,7 @@ static void test(int *rcv_fds, int count, int proto)
 {
 	struct epoll_event ev;
 	int epfd, i, test_fd;
-	uint16_t test_family;
+	int test_family;
 	socklen_t len;
 
 	epfd = epoll_create(1);
@@ -146,6 +146,7 @@ static void test(int *rcv_fds, int count, int proto)
 	send_from_v4(proto);
 
 	test_fd = receive_once(epfd, proto);
+	len = sizeof(test_family);
 	if (getsockopt(test_fd, SOL_SOCKET, SO_DOMAIN, &test_family, &len))
 		error(1, errno, "failed to read socket domain");
 	if (test_family != AF_INET)
-- 
2.20.1


From 618f88ba85db8518c34837ff79ab24619fdfc8ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 24 Oct 2019 11:43:31 -0700
Subject: [PATCH 18/55] udp: fix data-race in udp_set_dev_scratch()

[ Upstream commit a793183caa9afae907a0d7ddd2ffd57329369bf5 ]

KCSAN reported a data-race in udp_set_dev_scratch() [1]

The issue here is that we must not write over skb fields
if skb is shared. A similar issue has been fixed in commit
89c22d8c3b27 ("net: Fix skb csum races when peeking")

While we are at it, use a helper only dealing with
udp_skb_scratch(skb)->csum_unnecessary, as this allows
udp_set_dev_scratch() to be called once and thus inlined.

[1]
BUG: KCSAN: data-race in udp_set_dev_scratch / udpv6_recvmsg

write to 0xffff888120278317 of 1 bytes by task 10411 on cpu 1:
 udp_set_dev_scratch+0xea/0x200 net/ipv4/udp.c:1308
 __first_packet_length+0x147/0x420 net/ipv4/udp.c:1556
 first_packet_length+0x68/0x2a0 net/ipv4/udp.c:1579
 udp_poll+0xea/0x110 net/ipv4/udp.c:2720
 sock_poll+0xed/0x250 net/socket.c:1256
 vfs_poll include/linux/poll.h:90 [inline]
 do_select+0x7d0/0x1020 fs/select.c:534
 core_sys_select+0x381/0x550 fs/select.c:677
 do_pselect.constprop.0+0x11d/0x160 fs/select.c:759
 __do_sys_pselect6 fs/select.c:784 [inline]
 __se_sys_pselect6 fs/select.c:769 [inline]
 __x64_sys_pselect6+0x12e/0x170 fs/select.c:769
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

read to 0xffff888120278317 of 1 bytes by task 10413 on cpu 0:
 udp_skb_csum_unnecessary include/net/udp.h:358 [inline]
 udpv6_recvmsg+0x43e/0xe90 net/ipv6/udp.c:310
 inet6_recvmsg+0xbb/0x240 net/ipv6/af_inet6.c:592
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 10413 Comm: syz-executor.0 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 05ecf7f2e647..ce0728c4b4aa 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1316,6 +1316,20 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
 		scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
 }
 
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+	/* We come here after udp_lib_checksum_complete() returned 0.
+	 * This means that __skb_checksum_complete() might have
+	 * set skb->csum_valid to 1.
+	 * On 64bit platforms, we can set csum_unnecessary
+	 * to true, but only if the skb is not shared.
+	 */
+#if BITS_PER_LONG == 64
+	if (!skb_shared(skb))
+		udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
 static int udp_skb_truesize(struct sk_buff *skb)
 {
 	return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
@@ -1550,10 +1564,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
 			*total += skb->truesize;
 			kfree_skb(skb);
 		} else {
-			/* the csum related bits could be changed, refresh
-			 * the scratch area
-			 */
-			udp_set_dev_scratch(skb);
+			udp_skb_csum_unnecessary_set(skb);
 			break;
 		}
 	}
-- 
2.20.1


From 28be9795dd7e875ca4bb9f2fb4596a93573e3e93 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 29 Oct 2019 01:24:32 +0800
Subject: [PATCH 19/55] vxlan: check tun_info options_len properly

[ Upstream commit eadf52cf1852196a1363044dcda22fa5d7f296f7 ]

This patch is to improve the tun_info options_len by dropping
the skb when TUNNEL_VXLAN_OPT is set but options_len is less
than vxlan_metadata. This can void a potential out-of-bounds
access on ip_tun_info.

Fixes: ee122c79d422 ("vxlan: Flow based tunneling")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3d9bcc957f7d..e07872869266 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2487,9 +2487,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		vni = tunnel_id_to_key32(info->key.tun_id);
 		ifindex = 0;
 		dst_cache = &info->dst_cache;
-		if (info->options_len &&
-		    info->key.tun_flags & TUNNEL_VXLAN_OPT)
+		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+			if (info->options_len < sizeof(*md))
+				goto drop;
 			md = ip_tunnel_info_opts(info);
+		}
 		ttl = info->key.ttl;
 		tos = info->key.tos;
 		label = info->key.label;
-- 
2.20.1


From 69c8c21236403ae2e11ce84f3089fad97b22f39f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:48 -0700
Subject: [PATCH 20/55] net: add skb_queue_empty_lockless()

[ Upstream commit d7d16a89350ab263484c0aa2b523dd3a234e4a80 ]

Some paths call skb_queue_empty() without holding
the queue lock. We must use a barrier in order
to not let the compiler do strange things, and avoid
KCSAN splats.

Adding a barrier in skb_queue_empty() might be overkill,
I prefer adding a new helper to clearly identify
points where the callers might be lockless. This might
help us finding real bugs.

The corresponding WRITE_ONCE() should add zero cost
for current compilers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9b18d33681c2..39a7cecaa880 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1500,6 +1500,19 @@ static inline int skb_queue_empty(const struct sk_buff_head *list)
 	return list->next == (const struct sk_buff *) list;
 }
 
+/**
+ *	skb_queue_empty_lockless - check if a queue is empty
+ *	@list: queue head
+ *
+ *	Returns true if the queue is empty, false otherwise.
+ *	This variant can be used in lockless contexts.
+ */
+static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
+{
+	return READ_ONCE(list->next) == (const struct sk_buff *) list;
+}
+
+
 /**
  *	skb_queue_is_last - check if skb is the last entry in the queue
  *	@list: queue head
@@ -1853,9 +1866,11 @@ static inline void __skb_insert(struct sk_buff *newsk,
 				struct sk_buff *prev, struct sk_buff *next,
 				struct sk_buff_head *list)
 {
-	newsk->next = next;
-	newsk->prev = prev;
-	next->prev  = prev->next = newsk;
+	/* see skb_queue_empty_lockless() for the opposite READ_ONCE() */
+	WRITE_ONCE(newsk->next, next);
+	WRITE_ONCE(newsk->prev, prev);
+	WRITE_ONCE(next->prev, newsk);
+	WRITE_ONCE(prev->next, newsk);
 	list->qlen++;
 }
 
@@ -1866,11 +1881,11 @@ static inline void __skb_queue_splice(const struct sk_buff_head *list,
 	struct sk_buff *first = list->next;
 	struct sk_buff *last = list->prev;
 
-	first->prev = prev;
-	prev->next = first;
+	WRITE_ONCE(first->prev, prev);
+	WRITE_ONCE(prev->next, first);
 
-	last->next = next;
-	next->prev = last;
+	WRITE_ONCE(last->next, next);
+	WRITE_ONCE(next->prev, last);
 }
 
 /**
@@ -2011,8 +2026,8 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 	next	   = skb->next;
 	prev	   = skb->prev;
 	skb->next  = skb->prev = NULL;
-	next->prev = prev;
-	prev->next = next;
+	WRITE_ONCE(next->prev, prev);
+	WRITE_ONCE(prev->next, next);
 }
 
 /**
-- 
2.20.1


From 1721e65cf56608630f6beef7255162c6af4b054e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:49 -0700
Subject: [PATCH 21/55] udp: use skb_queue_empty_lockless()

[ Upstream commit 137a0dbe3426fd7bcfe3f8117b36a87b3590e4eb ]

syzbot reported a data-race [1].

We should use skb_queue_empty_lockless() to document that we are
not ensuring a mutual exclusion and silence KCSAN.

[1]
BUG: KCSAN: data-race in __skb_recv_udp / __udp_enqueue_schedule_skb

write to 0xffff888122474b50 of 8 bytes by interrupt on cpu 0:
 __skb_insert include/linux/skbuff.h:1852 [inline]
 __skb_queue_before include/linux/skbuff.h:1958 [inline]
 __skb_queue_tail include/linux/skbuff.h:1991 [inline]
 __udp_enqueue_schedule_skb+0x2c1/0x410 net/ipv4/udp.c:1470
 __udp_queue_rcv_skb net/ipv4/udp.c:1940 [inline]
 udp_queue_rcv_one_skb+0x7bd/0xc70 net/ipv4/udp.c:2057
 udp_queue_rcv_skb+0xb5/0x400 net/ipv4/udp.c:2074
 udp_unicast_rcv_skb.isra.0+0x7e/0x1c0 net/ipv4/udp.c:2233
 __udp4_lib_rcv+0xa44/0x17c0 net/ipv4/udp.c:2300
 udp_rcv+0x2b/0x40 net/ipv4/udp.c:2470
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

read to 0xffff888122474b50 of 8 bytes by task 8921 on cpu 1:
 skb_queue_empty include/linux/skbuff.h:1494 [inline]
 __skb_recv_udp+0x18d/0x500 net/ipv4/udp.c:1653
 udp_recvmsg+0xe1/0xb10 net/ipv4/udp.c:1712
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 8921 Comm: syz-executor.4 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ce0728c4b4aa..97310f7a25a8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1588,7 +1588,7 @@ static int first_packet_length(struct sock *sk)
 
 	spin_lock_bh(&rcvq->lock);
 	skb = __first_packet_length(sk, rcvq, &total);
-	if (!skb && !skb_queue_empty(sk_queue)) {
+	if (!skb && !skb_queue_empty_lockless(sk_queue)) {
 		spin_lock(&sk_queue->lock);
 		skb_queue_splice_tail_init(sk_queue, rcvq);
 		spin_unlock(&sk_queue->lock);
@@ -1661,7 +1661,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 				return skb;
 			}
 
-			if (skb_queue_empty(sk_queue)) {
+			if (skb_queue_empty_lockless(sk_queue)) {
 				spin_unlock_bh(&queue->lock);
 				goto busy_check;
 			}
@@ -1687,7 +1687,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 				break;
 
 			sk_busy_loop(sk, flags & MSG_DONTWAIT);
-		} while (!skb_queue_empty(sk_queue));
+		} while (!skb_queue_empty_lockless(sk_queue));
 
 		/* sk_queue is empty, reader_queue may contain peeked packets */
 	} while (timeo &&
-- 
2.20.1


From b2dfe7322d698b982a77edc9e89b2a96f4c5ce85 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:50 -0700
Subject: [PATCH 22/55] net: use skb_queue_empty_lockless() in poll() handlers

[ Upstream commit 3ef7cf57c72f32f61e97f8fa401bc39ea1f1a5d4 ]

Many poll() handlers are lockless. Using skb_queue_empty_lockless()
instead of skb_queue_empty() is more appropriate.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/capi/capi.c     | 2 +-
 net/atm/common.c             | 2 +-
 net/bluetooth/af_bluetooth.c | 4 ++--
 net/caif/caif_socket.c       | 2 +-
 net/core/datagram.c          | 4 ++--
 net/decnet/af_decnet.c       | 2 +-
 net/ipv4/tcp.c               | 2 +-
 net/ipv4/udp.c               | 2 +-
 net/nfc/llcp_sock.c          | 4 ++--
 net/phonet/socket.c          | 4 ++--
 net/sctp/socket.c            | 4 ++--
 net/tipc/socket.c            | 4 ++--
 net/unix/af_unix.c           | 6 +++---
 net/vmw_vsock/af_vsock.c     | 2 +-
 14 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index c92b405b7646..ba8619524231 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -744,7 +744,7 @@ capi_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &(cdev->recvwait), wait);
 	mask = EPOLLOUT | EPOLLWRNORM;
-	if (!skb_queue_empty(&cdev->recvqueue))
+	if (!skb_queue_empty_lockless(&cdev->recvqueue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 	return mask;
 }
diff --git a/net/atm/common.c b/net/atm/common.c
index b7528e77997c..0ce530af534d 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -668,7 +668,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* writable? */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 94ddf19998c7..5f508c50649d 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_state == BT_LISTEN)
 		return bt_accept_poll(sk);
 
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= EPOLLHUP;
 
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == BT_CLOSED)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 13ea920600ae..ef14da50a981 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file,
 		mask |= EPOLLRDHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 		(sk->sk_shutdown & RCV_SHUTDOWN))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a162ef5e02..7360714c9e69 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -767,7 +767,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 	mask = 0;
 
 	/* exceptional events? */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -777,7 +777,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 0ea75286abf4..3349ea81f901 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1205,7 +1205,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wai
 	struct dn_scp *scp = DN_SK(sk);
 	__poll_t mask = datagram_poll(file, sock, wait);
 
-	if (!skb_queue_empty(&scp->other_receive_queue))
+	if (!skb_queue_empty_lockless(&scp->other_receive_queue))
 		mask |= EPOLLRDBAND;
 
 	return mask;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 61082065b26a..0ba28b2ea84a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -584,7 +584,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR;
 
 	return mask;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 97310f7a25a8..5487b43b8a56 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2723,7 +2723,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	__poll_t mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
 
-	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ccdd790e163a..28604414dec1 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -554,11 +554,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
 	if (sk->sk_state == LLCP_LISTEN)
 		return llcp_accept_poll(sk);
 
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == LLCP_CLOSED)
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 96ea9f254ae9..76d499f6af9a 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -338,9 +338,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
 
 	if (sk->sk_state == TCP_CLOSE)
 		return EPOLLERR;
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	if (!skb_queue_empty(&pn->ctrlreq_queue))
+	if (!skb_queue_empty_lockless(&pn->ctrlreq_queue))
 		mask |= EPOLLPRI;
 	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
 		return EPOLLHUP;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b8f9bafb180c..1ff3579c6549 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8329,7 +8329,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	mask = 0;
 
 	/* Is there any exceptional events?  */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
@@ -8338,7 +8338,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		mask |= EPOLLHUP;
 
 	/* Is it readable?  Reconsider this code with TCP-style support.  */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* The association is either gone or not ready.  */
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 83ae41d7e554..90ecca988d12 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -740,7 +740,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 		/* fall through */
 	case TIPC_LISTEN:
 	case TIPC_CONNECTING:
-		if (!skb_queue_empty(&sk->sk_receive_queue))
+		if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 			revents |= EPOLLIN | EPOLLRDNORM;
 		break;
 	case TIPC_OPEN:
@@ -748,7 +748,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 			revents |= EPOLLOUT;
 		if (!tipc_sk_type_connectionless(sk))
 			break;
-		if (skb_queue_empty(&sk->sk_receive_queue))
+		if (skb_queue_empty_lockless(&sk->sk_receive_queue))
 			break;
 		revents |= EPOLLIN | EPOLLRDNORM;
 		break;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 67e87db5877f..0d8da809bea2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2599,7 +2599,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
@@ -2628,7 +2628,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	mask = 0;
 
 	/* exceptional events? */
-	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
 		mask |= EPOLLERR |
 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
@@ -2638,7 +2638,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 		mask |= EPOLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue))
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 2ab43b2bba31..582a3e4dfce2 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -870,7 +870,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 		 * the queue and write as long as the socket isn't shutdown for
 		 * sending.
 		 */
-		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
 			mask |= EPOLLIN | EPOLLRDNORM;
 		}
-- 
2.20.1


From 2c83dcb80d3a6e1405c7b3781772d85572b5f8f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:51 -0700
Subject: [PATCH 23/55] net: use skb_queue_empty_lockless() in busy poll
 contexts

[ Upstream commit 3f926af3f4d688e2e11e7f8ed04e277a14d4d4a4 ]

Busy polling usually runs without locks.
Let's use skb_queue_empty_lockless() instead of skb_queue_empty()

Also uses READ_ONCE() in __skb_try_recv_datagram() to address
a similar potential problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 2 +-
 net/core/datagram.c                     | 2 +-
 net/core/sock.c                         | 2 +-
 net/ipv4/tcp.c                          | 2 +-
 net/sctp/socket.c                       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 551bca6fef24..f382c2b23d75 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1701,7 +1701,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return peekmsg(sk, msg, len, nonblock, flags);
 
 	if (sk_can_busy_loop(sk) &&
-	    skb_queue_empty(&sk->sk_receive_queue) &&
+	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
 	    sk->sk_state == TCP_ESTABLISHED)
 		sk_busy_loop(sk, nonblock);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 7360714c9e69..b824b8117137 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -278,7 +278,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 			break;
 
 		sk_busy_loop(sk, flags & MSG_DONTWAIT);
-	} while (sk->sk_receive_queue.prev != *last);
+	} while (READ_ONCE(sk->sk_receive_queue.prev) != *last);
 
 	error = -EAGAIN;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 71d047962390..b4247635c4a2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3593,7 +3593,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
 {
 	struct sock *sk = p;
 
-	return !skb_queue_empty(&sk->sk_receive_queue) ||
+	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
 	       sk_busy_loop_timeout(sk, start_time);
 }
 EXPORT_SYMBOL(sk_busy_loop_end);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0ba28b2ea84a..cf79ab96c2df 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1961,7 +1961,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
 
-	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
 	    (sk->sk_state == TCP_ESTABLISHED))
 		sk_busy_loop(sk, nonblock);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1ff3579c6549..b81d7673634c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8724,7 +8724,7 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		if (sk_can_busy_loop(sk)) {
 			sk_busy_loop(sk, noblock);
 
-			if (!skb_queue_empty(&sk->sk_receive_queue))
+			if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
 				continue;
 		}
 
-- 
2.20.1


From 40059aca248e8fd893c11cb85fdcd344743de0f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 23 Oct 2019 22:44:52 -0700
Subject: [PATCH 24/55] net: add READ_ONCE() annotation in
 __skb_wait_for_more_packets()

[ Upstream commit 7c422d0ce97552dde4a97e6290de70ec6efb0fc6 ]

__skb_wait_for_more_packets() can be called while other cpus
can feed packets to the socket receive queue.

KCSAN reported :

BUG: KCSAN: data-race in __skb_wait_for_more_packets / __udp_enqueue_schedule_skb

write to 0xffff888102e40b58 of 8 bytes by interrupt on cpu 0:
 __skb_insert include/linux/skbuff.h:1852 [inline]
 __skb_queue_before include/linux/skbuff.h:1958 [inline]
 __skb_queue_tail include/linux/skbuff.h:1991 [inline]
 __udp_enqueue_schedule_skb+0x2d7/0x410 net/ipv4/udp.c:1470
 __udp_queue_rcv_skb net/ipv4/udp.c:1940 [inline]
 udp_queue_rcv_one_skb+0x7bd/0xc70 net/ipv4/udp.c:2057
 udp_queue_rcv_skb+0xb5/0x400 net/ipv4/udp.c:2074
 udp_unicast_rcv_skb.isra.0+0x7e/0x1c0 net/ipv4/udp.c:2233
 __udp4_lib_rcv+0xa44/0x17c0 net/ipv4/udp.c:2300
 udp_rcv+0x2b/0x40 net/ipv4/udp.c:2470
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5010
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5124
 process_backlog+0x1d3/0x420 net/core/dev.c:5955

read to 0xffff888102e40b58 of 8 bytes by task 13035 on cpu 1:
 __skb_wait_for_more_packets+0xfa/0x320 net/core/datagram.c:100
 __skb_recv_udp+0x374/0x500 net/ipv4/udp.c:1683
 udp_recvmsg+0xe1/0xb10 net/ipv4/udp.c:1712
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec+0x5c/0x70 net/socket.c:871
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 13035 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index b824b8117137..5dc112ec7286 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -97,7 +97,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 	if (error)
 		goto out_err;
 
-	if (sk->sk_receive_queue.prev != skb)
+	if (READ_ONCE(sk->sk_receive_queue.prev) != skb)
 		goto out;
 
 	/* Socket shut down? */
-- 
2.20.1


From 90f14562df3ca9353668b8bbb0a8217030eed320 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sat, 26 Oct 2019 11:53:39 +0200
Subject: [PATCH 25/55] ipv4: fix route update on metric change.

[ Upstream commit 0b834ba00ab5337e938c727e216e1f5249794717 ]

Since commit af4d768ad28c ("net/ipv4: Add support for specifying metric
of connected routes"), when updating an IP address with a different metric,
the associated connected route is updated, too.

Still, the mentioned commit doesn't handle properly some corner cases:

$ ip addr add dev eth0 192.168.1.0/24
$ ip addr add dev eth0 192.168.2.1/32 peer 192.168.2.2
$ ip addr add dev eth0 192.168.3.1/24
$ ip addr change dev eth0 192.168.1.0/24 metric 10
$ ip addr change dev eth0 192.168.2.1/32 peer 192.168.2.2 metric 10
$ ip addr change dev eth0 192.168.3.1/24 metric 10
$ ip -4 route
192.168.1.0/24 dev eth0 proto kernel scope link src 192.168.1.0
192.168.2.2 dev eth0 proto kernel scope link src 192.168.2.1
192.168.3.0/24 dev eth0 proto kernel scope link src 192.168.2.1 metric 10

Only the last route is correctly updated.

The problem is the current test in fib_modify_prefix_metric():

	if (!(dev->flags & IFF_UP) ||
	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
	    ipv4_is_zeronet(prefix) ||
	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)

Which should be the logical 'not' of the pre-existing test in
fib_add_ifaddr():

	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
	    (prefix != addr || ifa->ifa_prefixlen < 32))

To properly negate the original expression, we need to change the last
logical 'or' to a logical 'and'.

Fixes: af4d768ad28c ("net/ipv4: Add support for specifying metric of connected routes")
Reported-and-suggested-by: Beniamino Galvani <bgalvani@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e8bc939b56dd..fb4162943fae 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1147,7 +1147,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
 	if (!(dev->flags & IFF_UP) ||
 	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
 	    ipv4_is_zeronet(prefix) ||
-	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+	    (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
 		return;
 
 	/* add the new */
-- 
2.20.1


From c14e508f3d0d88cf7dff886b121954ec5eb4cc8c Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sat, 26 Oct 2019 11:53:40 +0200
Subject: [PATCH 26/55] selftests: fib_tests: add more tests for metric update

[ Upstream commit 37de3b354150450ba12275397155e68113e99901 ]

This patch adds two more tests to ipv4_addr_metric_test() to
explicitly cover the scenarios fixed by the previous patch.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index c4ba0ff4a53f..76c1897e6352 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -1438,6 +1438,27 @@ ipv4_addr_metric_test()
 	fi
 	log_test $rc 0 "Prefix route with metric on link up"
 
+	# explicitly check for metric changes on edge scenarios
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.0/24 metric 259"
+	run_cmd "$IP addr change dev dummy2 172.16.104.0/24 metric 260"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.0 metric 260"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of .0/24 address"
+
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260"
+	run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 261"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address with peer route"
+
 	$IP li del dummy1
 	$IP li del dummy2
 	cleanup
-- 
2.20.1


From 0d49faeba1b3237bc4022bce1ec9f4e38ef9640d Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 23 Oct 2019 15:44:05 +0200
Subject: [PATCH 27/55] net/smc: fix closing of fallback SMC sockets

[ Upstream commit f536dffc0b79738c3104af999318279dccbaa261 ]

For SMC sockets forced to fallback to TCP, the file is propagated
from the outer SMC to the internal TCP socket. When closing the SMC
socket, the internal TCP socket file pointer must be restored to the
original NULL value, otherwise memory leaks may show up (found with
CONFIG_DEBUG_KMEMLEAK).

The internal TCP socket is released in smc_clcsock_release(), which
calls __sock_release() function in net/socket.c. This calls the
needed iput(SOCK_INODE(sock)) only, if the file pointer has been reset
to the original NULL-value.

Fixes: 07603b230895 ("net/smc: propagate file from SMC to TCP socket")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b932583e407..d9566e84f2f9 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -123,6 +123,12 @@ struct proto smc_proto6 = {
 };
 EXPORT_SYMBOL_GPL(smc_proto6);
 
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+	smc->clcsock->file->private_data = smc->sk.sk_socket;
+	smc->clcsock->file = NULL;
+}
+
 static int __smc_release(struct smc_sock *smc)
 {
 	struct sock *sk = &smc->sk;
@@ -141,6 +147,7 @@ static int __smc_release(struct smc_sock *smc)
 		}
 		sk->sk_state = SMC_CLOSED;
 		sk->sk_state_change(sk);
+		smc_restore_fallback_changes(smc);
 	}
 
 	sk->sk_prot->unhash(sk);
-- 
2.20.1


From 8b71e8e93d234499e5bae716ecc3a279e0c65f1a Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 23 Oct 2019 15:44:06 +0200
Subject: [PATCH 28/55] net/smc: keep vlan_id for SMC-R in smc_listen_work()

[ Upstream commit ca5f8d2dd5229ccacdd5cfde1ce4d32b0810e454 ]

Creating of an SMC-R connection with vlan-id fails, because
smc_listen_work() determines the vlan_id of the connection,
saves it in struct smc_init_info ini, but clears the ini area
again if SMC-D is not applicable.
This patch just resets the ISM device before investigating
SMC-R availability.

Fixes: bc36d2fc93eb ("net/smc: consolidate function parameters")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d9566e84f2f9..cea3c36ea0da 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1298,8 +1298,8 @@ static void smc_listen_work(struct work_struct *work)
 	/* check if RDMA is available */
 	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
 		/* prepare RDMA check */
-		memset(&ini, 0, sizeof(ini));
 		ini.is_smcd = false;
+		ini.ism_dev = NULL;
 		ini.ib_lcl = &pclc->lcl;
 		rc = smc_find_rdma_device(new_smc, &ini);
 		if (rc) {
-- 
2.20.1


From a9e45ef7f458b160c7223dc5ddb53b7101b35826 Mon Sep 17 00:00:00 2001
From: Takeshi Misawa <jeliantsurux@gmail.com>
Date: Sat, 19 Oct 2019 15:34:43 +0900
Subject: [PATCH 29/55] keys: Fix memory leak in copy_net_ns

[ Upstream commit 82ecff655e7968151b0047f1b5de03b249e5c1c4 ]

If copy_net_ns() failed after net_alloc(), net->key_domain is leaked.
Fix this, by freeing key_domain in error path.

syzbot report:
BUG: memory leak
unreferenced object 0xffff8881175007e0 (size 32):
  comm "syz-executor902", pid 7069, jiffies 4294944350 (age 28.400s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000a83ed741>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline]
    [<00000000a83ed741>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<00000000a83ed741>] slab_alloc mm/slab.c:3326 [inline]
    [<00000000a83ed741>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553
    [<0000000059fc92b9>] kmalloc include/linux/slab.h:547 [inline]
    [<0000000059fc92b9>] kzalloc include/linux/slab.h:742 [inline]
    [<0000000059fc92b9>] net_alloc net/core/net_namespace.c:398 [inline]
    [<0000000059fc92b9>] copy_net_ns+0xb2/0x220 net/core/net_namespace.c:445
    [<00000000a9d74bbc>] create_new_namespaces+0x141/0x2a0 kernel/nsproxy.c:103
    [<000000008047d645>] unshare_nsproxy_namespaces+0x7f/0x100 kernel/nsproxy.c:202
    [<000000005993ea6e>] ksys_unshare+0x236/0x490 kernel/fork.c:2674
    [<0000000019417e75>] __do_sys_unshare kernel/fork.c:2742 [inline]
    [<0000000019417e75>] __se_sys_unshare kernel/fork.c:2740 [inline]
    [<0000000019417e75>] __x64_sys_unshare+0x16/0x20 kernel/fork.c:2740
    [<00000000f4c5f2c8>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:296
    [<0000000038550184>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

syzbot also reported other leak in copy_net_ns -> setup_net.
This problem is already fixed by cf47a0b882a4e5f6b34c7949d7b293e9287f1972.

Fixes: 9b242610514f ("keys: Network namespace domain tag")
Reported-and-tested-by: syzbot+3b3296d032353c33184b@syzkaller.appspotmail.com
Signed-off-by: Takeshi Misawa <jeliantsurux@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 53ac647b893b..87c32ab63304 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -478,6 +478,7 @@ struct net *copy_net_ns(unsigned long flags,
 
 	if (rv < 0) {
 put_userns:
+		key_remove_domain(net->key_domain);
 		put_user_ns(user_ns);
 		net_drop_ns(net);
 dec_ucounts:
-- 
2.20.1


From 21e9992e60d82963f878d10a1aa22428eac15c7b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 31 Oct 2019 15:42:26 -0700
Subject: [PATCH 30/55] net: phylink: Fix phylink_dbg() macro

[ Upstream commit 9d68db5092c5fac99fccfdeab3f04df0b27d1762 ]

The phylink_dbg() macro does not follow dynamic debug or defined(DEBUG)
and as a result, it spams the kernel log since a PR_DEBUG level is
currently used. Fix it to be defined appropriately whether
CONFIG_DYNAMIC_DEBUG or defined(DEBUG) are set.

Fixes: 17091180b152 ("net: phylink: Add phylink_{printk, err, warn, info, dbg} macros")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index a5a57ca94c1a..26a13fd3c463 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -87,8 +87,24 @@ struct phylink {
 	phylink_printk(KERN_WARNING, pl, fmt, ##__VA_ARGS__)
 #define phylink_info(pl, fmt, ...) \
 	phylink_printk(KERN_INFO, pl, fmt, ##__VA_ARGS__)
+#if defined(CONFIG_DYNAMIC_DEBUG)
 #define phylink_dbg(pl, fmt, ...) \
+do {									\
+	if ((pl)->config->type == PHYLINK_NETDEV)			\
+		netdev_dbg((pl)->netdev, fmt, ##__VA_ARGS__);		\
+	else if ((pl)->config->type == PHYLINK_DEV)			\
+		dev_dbg((pl)->dev, fmt, ##__VA_ARGS__);			\
+} while (0)
+#elif defined(DEBUG)
+#define phylink_dbg(pl, fmt, ...)					\
 	phylink_printk(KERN_DEBUG, pl, fmt, ##__VA_ARGS__)
+#else
+#define phylink_dbg(pl, fmt, ...)					\
+({									\
+	if (0)								\
+		phylink_printk(KERN_DEBUG, pl, fmt, ##__VA_ARGS__);	\
+})
+#endif
 
 /**
  * phylink_set_port_modes() - set the port type modes in the ethtool mask
-- 
2.20.1


From 2a7b6f8a1a75d98f009ede75f42194ca3acf0426 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 31 Oct 2019 12:13:46 +0000
Subject: [PATCH 31/55] rxrpc: Fix handling of last subpacket of jumbo packet

[ Upstream commit f9c32435ab7221d1d6cb35738fa85a2da012b23e ]

When rxrpc_recvmsg_data() sets the return value to 1 because it's drained
all the data for the last packet, it checks the last-packet flag on the
whole packet - but this is wrong, since the last-packet flag is only set on
the final subpacket of the last jumbo packet.  This means that a call that
receives its last packet in a jumbo packet won't complete properly.

Fix this by having rxrpc_locate_data() determine the last-packet state of
the subpacket it's looking at and passing that back to the caller rather
than having the caller look in the packet header.  The caller then needs to
cache this in the rxrpc_call struct as rxrpc_locate_data() isn't then
called again for this packet.

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Fixes: e2de6c404898 ("rxrpc: Use info in skbuff instead of reparsing a jumbo packet")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/ar-internal.h |  1 +
 net/rxrpc/recvmsg.c     | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 8051dfdcf26d..b23a13c69019 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -596,6 +596,7 @@ struct rxrpc_call {
 	int			debug_id;	/* debug ID for printks */
 	unsigned short		rx_pkt_offset;	/* Current recvmsg packet offset */
 	unsigned short		rx_pkt_len;	/* Current recvmsg packet len */
+	bool			rx_pkt_last;	/* Current recvmsg packet is last */
 
 	/* Rx/Tx circular buffer, depending on phase.
 	 *
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 3b0becb12041..08d4b4b9283a 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -267,11 +267,13 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
  */
 static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
 			     u8 *_annotation,
-			     unsigned int *_offset, unsigned int *_len)
+			     unsigned int *_offset, unsigned int *_len,
+			     bool *_last)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	unsigned int offset = sizeof(struct rxrpc_wire_header);
 	unsigned int len;
+	bool last = false;
 	int ret;
 	u8 annotation = *_annotation;
 	u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET;
@@ -281,6 +283,8 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
 	len = skb->len - offset;
 	if (subpacket < sp->nr_subpackets - 1)
 		len = RXRPC_JUMBO_DATALEN;
+	else if (sp->rx_flags & RXRPC_SKB_INCL_LAST)
+		last = true;
 
 	if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
 		ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
@@ -291,6 +295,7 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
 
 	*_offset = offset;
 	*_len = len;
+	*_last = last;
 	call->conn->security->locate_data(call, skb, _offset, _len);
 	return 0;
 }
@@ -309,7 +314,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 	rxrpc_serial_t serial;
 	rxrpc_seq_t hard_ack, top, seq;
 	size_t remain;
-	bool last;
+	bool rx_pkt_last;
 	unsigned int rx_pkt_offset, rx_pkt_len;
 	int ix, copy, ret = -EAGAIN, ret2;
 
@@ -319,6 +324,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 
 	rx_pkt_offset = call->rx_pkt_offset;
 	rx_pkt_len = call->rx_pkt_len;
+	rx_pkt_last = call->rx_pkt_last;
 
 	if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
 		seq = call->rx_hard_ack;
@@ -329,6 +335,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 	/* Barriers against rxrpc_input_data(). */
 	hard_ack = call->rx_hard_ack;
 	seq = hard_ack + 1;
+
 	while (top = smp_load_acquire(&call->rx_top),
 	       before_eq(seq, top)
 	       ) {
@@ -356,7 +363,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 		if (rx_pkt_offset == 0) {
 			ret2 = rxrpc_locate_data(call, skb,
 						 &call->rxtx_annotations[ix],
-						 &rx_pkt_offset, &rx_pkt_len);
+						 &rx_pkt_offset, &rx_pkt_len,
+						 &rx_pkt_last);
 			trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
 					    rx_pkt_offset, rx_pkt_len, ret2);
 			if (ret2 < 0) {
@@ -396,13 +404,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 		}
 
 		/* The whole packet has been transferred. */
-		last = sp->hdr.flags & RXRPC_LAST_PACKET;
 		if (!(flags & MSG_PEEK))
 			rxrpc_rotate_rx_window(call);
 		rx_pkt_offset = 0;
 		rx_pkt_len = 0;
 
-		if (last) {
+		if (rx_pkt_last) {
 			ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
 			ret = 1;
 			goto out;
@@ -415,6 +422,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
 	if (!(flags & MSG_PEEK)) {
 		call->rx_pkt_offset = rx_pkt_offset;
 		call->rx_pkt_len = rx_pkt_len;
+		call->rx_pkt_last = rx_pkt_last;
 	}
 done:
 	trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
-- 
2.20.1


From 5959f3d579fc1a1467987bac0e5a1a8e8322ef3e Mon Sep 17 00:00:00 2001
From: Dmytro Linkin <dmitrolin@mellanox.com>
Date: Wed, 4 Sep 2019 12:32:49 +0000
Subject: [PATCH 32/55] net/mlx5e: Determine source port properly for vlan push
 action

[ Upstream commit d5dbcc4e87bc8444bd2f1ca4b8f787e1e5677ec2 ]

Termination tables are used for vlan push actions on uplink ports.
To support RoCE dual port the source port value was placed in a register.
Fix the code to use an API method returning the source port according to
the FW capabilities.

Fixes: 10caabdaad5a ("net/mlx5e: Use termination table for VLAN push actions")
Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../mlx5/core/eswitch_offloads_termtbl.c      | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
index 1d55a324a17e..7879e1746297 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c
@@ -177,22 +177,32 @@ mlx5_eswitch_termtbl_actions_move(struct mlx5_flow_act *src,
 	memset(&src->vlan[1], 0, sizeof(src->vlan[1]));
 }
 
+static bool mlx5_eswitch_offload_is_uplink_port(const struct mlx5_eswitch *esw,
+						const struct mlx5_flow_spec *spec)
+{
+	u32 port_mask, port_value;
+
+	if (MLX5_CAP_ESW_FLOWTABLE(esw->dev, flow_source))
+		return spec->flow_context.flow_source == MLX5_VPORT_UPLINK;
+
+	port_mask = MLX5_GET(fte_match_param, spec->match_criteria,
+			     misc_parameters.source_port);
+	port_value = MLX5_GET(fte_match_param, spec->match_value,
+			      misc_parameters.source_port);
+	return (port_mask & port_value & 0xffff) == MLX5_VPORT_UPLINK;
+}
+
 bool
 mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw,
 			      struct mlx5_flow_act *flow_act,
 			      struct mlx5_flow_spec *spec)
 {
-	u32 port_mask = MLX5_GET(fte_match_param, spec->match_criteria,
-				 misc_parameters.source_port);
-	u32 port_value = MLX5_GET(fte_match_param, spec->match_value,
-				  misc_parameters.source_port);
-
 	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, termination_table))
 		return false;
 
 	/* push vlan on RX */
 	return (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) &&
-		((port_mask & port_value) == MLX5_VPORT_UPLINK);
+		mlx5_eswitch_offload_is_uplink_port(esw, spec);
 }
 
 struct mlx5_flow_handle *
-- 
2.20.1


From d716e696f814089323e0116bc3c4158ac619e85c Mon Sep 17 00:00:00 2001
From: Dmytro Linkin <dmitrolin@mellanox.com>
Date: Thu, 29 Aug 2019 15:24:27 +0000
Subject: [PATCH 33/55] net/mlx5e: Remove incorrect match criteria assignment
 line

[ Upstream commit 752d3dc06d6936d5a357a18b6b51d91c7e134e88 ]

Driver have function, which enable match criteria for misc parameters
in dependence of eswitch capabilities.

Fixes: 4f5d1beadc10 ("Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux")
Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 0323fd078271..35945cdd0a61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -285,7 +285,6 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 
 	mlx5_eswitch_set_rule_source_port(esw, spec, attr);
 
-	spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
 	if (attr->outer_match_level != MLX5_MATCH_NONE)
 		spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
 
-- 
2.20.1


From ce7793943ceaefacd81341a1002816f0b33b9fb4 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Wed, 23 Oct 2019 12:57:54 +0300
Subject: [PATCH 34/55] net/mlx5e: Initialize on stack link modes bitmap

[ Upstream commit 926b37f76fb0a22fe93c8873c819fd167180e85c ]

Initialize link modes bitmap on stack before using it, otherwise the
outcome of ethtool set link ksettings might have unexpected values.

Fixes: 4b95840a6ced ("net/mlx5e: Fix matching of speed to PRM link modes")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 20e628c907e5..a9bb8e2b34a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1021,7 +1021,7 @@ static bool ext_link_mode_requested(const unsigned long *adver)
 {
 #define MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT ETHTOOL_LINK_MODE_50000baseKR_Full_BIT
 	int size = __ETHTOOL_LINK_MODE_MASK_NBITS - MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT;
-	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = {0,};
 
 	bitmap_set(modes, MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT, size);
 	return bitmap_intersects(modes, adver, __ETHTOOL_LINK_MODE_MASK_NBITS);
-- 
2.20.1


From 0d4fca4bafe40ea687bf4470ad8bc88a664d1406 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Wed, 11 Sep 2019 14:44:50 +0300
Subject: [PATCH 35/55] net/mlx5: Fix flow counter list auto bits struct

[ Upstream commit 6dfef396ea13873ae9066ee2e0ad6ee364031fe2 ]

The union should contain the extended dest and counter list.
Remove the resevered 0x40 bits which is redundant.
This change doesn't break any functionally.
Everything works today because the code in fs_cmd.c is using
the correct structs if extended dest or the basic dest.

Fixes: 1b115498598f ("net/mlx5: Introduce extended destination fields")
Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b8b570c30b5e..e4b323e4db8f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1437,9 +1437,8 @@ struct mlx5_ifc_extended_dest_format_bits {
 };
 
 union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits {
-	struct mlx5_ifc_dest_format_struct_bits dest_format_struct;
+	struct mlx5_ifc_extended_dest_format_bits extended_dest_format;
 	struct mlx5_ifc_flow_counter_list_bits flow_counter_list;
-	u8         reserved_at_0[0x40];
 };
 
 struct mlx5_ifc_fte_match_param_bits {
-- 
2.20.1


From 5e401f71af440cd65f6bbbed5a09211d571e269d Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Tue, 29 Oct 2019 12:41:26 +0100
Subject: [PATCH 36/55] net/smc: fix refcounting for non-blocking connect()

[ Upstream commit 301428ea3708188dc4a243e6e6b46c03b46a0fbc ]

If a nonblocking socket is immediately closed after connect(),
the connect worker may not have started. This results in a refcount
problem, since sock_hold() is called from the connect worker.
This patch moves the sock_hold in front of the connect worker
scheduling.

Reported-by: syzbot+4c063e6dea39e4b79f29@syzkaller.appspotmail.com
Fixes: 50717a37db03 ("net/smc: nonblocking connect rework")
Reviewed-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index cea3c36ea0da..47946f489fd4 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -707,8 +707,6 @@ static int __smc_connect(struct smc_sock *smc)
 	int smc_type;
 	int rc = 0;
 
-	sock_hold(&smc->sk); /* sock put in passive closing */
-
 	if (smc->use_fallback)
 		return smc_connect_fallback(smc, smc->fallback_rsn);
 
@@ -853,6 +851,8 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 	if (rc && rc != -EINPROGRESS)
 		goto out;
+
+	sock_hold(&smc->sk); /* sock put in passive closing */
 	if (flags & O_NONBLOCK) {
 		if (schedule_work(&smc->connect_work))
 			smc->connect_nonblock = 1;
-- 
2.20.1


From 5e015c8c965f06224577f143ecb0884d12d3307c Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 19 Sep 2019 15:58:14 -0500
Subject: [PATCH 37/55] net/mlx5: Fix rtable reference leak

[ Upstream commit 2347cee83b2bd868bde2d283db0fac89f22be4e0 ]

If the rt entry gateway family is not AF_INET for multipath device,
rtable reference is leaked.
Hence, fix it by releasing the reference.

Fixes: 5fb091e8130b ("net/mlx5e: Use hint to resolve route when in HW multipath mode")
Fixes: e32ee6c78efa ("net/mlx5e: Support tunnel encap over tagged Ethernet")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
index a6a52806be45..310f65ef5446 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
@@ -90,15 +90,19 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
 	if (ret)
 		return ret;
 
-	if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET)
+	if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) {
+		ip_rt_put(rt);
 		return -ENETUNREACH;
+	}
 #else
 	return -EOPNOTSUPP;
 #endif
 
 	ret = get_route_and_out_devs(priv, rt->dst.dev, route_dev, out_dev);
-	if (ret < 0)
+	if (ret < 0) {
+		ip_rt_put(rt);
 		return ret;
+	}
 
 	if (!(*out_ttl))
 		*out_ttl = ip4_dst_hoplimit(&rt->dst);
@@ -142,8 +146,10 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
 		*out_ttl = ip6_dst_hoplimit(dst);
 
 	ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev);
-	if (ret < 0)
+	if (ret < 0) {
+		dst_release(dst);
 		return ret;
+	}
 #else
 	return -EOPNOTSUPP;
 #endif
-- 
2.20.1


From 50d2b933112d0852d7859fc03bcd5d7589ef56cd Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 30 Oct 2019 11:04:22 +0200
Subject: [PATCH 38/55] mlxsw: core: Unpublish devlink parameters during reload

[ Upstream commit b7265a0df82c1716bf788096217083ed65a8bb14 ]

The devlink parameter "acl_region_rehash_interval" is a runtime
parameter whose value is stored in a dynamically allocated memory. While
reloading the driver, this memory is freed and then allocated again. A
use-after-free might happen if during this time frame someone tries to
retrieve its value.

Since commit 070c63f20f6c ("net: devlink: allow to change namespaces
during reload") the use-after-free can be reliably triggered when
reloading the driver into a namespace, as after freeing the memory (via
reload_down() callback) all the parameters are notified.

Fix this by unpublishing and then re-publishing the parameters during
reload.

Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param")
Fixes: 7c62cfb8c574 ("devlink: publish params only after driver init is done")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 17ceac7505e5..b94cdbd7bb18 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1128,7 +1128,7 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_thermal_init;
 
-	if (mlxsw_driver->params_register && !reload)
+	if (mlxsw_driver->params_register)
 		devlink_params_publish(devlink);
 
 	return 0;
@@ -1201,7 +1201,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
 			return;
 	}
 
-	if (mlxsw_core->driver->params_unregister && !reload)
+	if (mlxsw_core->driver->params_unregister)
 		devlink_params_unpublish(devlink);
 	mlxsw_thermal_fini(mlxsw_core->thermal);
 	mlxsw_hwmon_fini(mlxsw_core->hwmon);
-- 
2.20.1


From 29b0fed2da074ff626346292eb5dc34dee8f4cc8 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 1 Nov 2019 00:10:21 +0100
Subject: [PATCH 39/55] r8169: fix wrong PHY ID issue with RTL8168dp

[ Upstream commit 62bdc8fd1c21d4263ebd18bec57f82532d09249f ]

As reported in [0] at least one RTL8168dp version has problems
establishing a link. This chip version has an integrated RTL8211b PHY,
however the chip seems to report a wrong PHY ID, resulting in a wrong
PHY driver (for Generic Realtek PHY) being loaded.
Work around this issue by adding a hook to r8168dp_2_mdio_read()
for returning the correct PHY ID.

[0] https://bbs.archlinux.org/viewtopic.php?id=246508

Fixes: 242cd9b5866a ("r8169: use phy_resume/phy_suspend")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index bae0074ab9aa..00c86c7dd42d 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -976,6 +976,10 @@ static int r8168dp_2_mdio_read(struct rtl8169_private *tp, int reg)
 {
 	int value;
 
+	/* Work around issue with chip reporting wrong PHY ID */
+	if (reg == MII_PHYSID2)
+		return 0xc912;
+
 	r8168dp_2_mdio_start(tp);
 
 	value = r8169_mdio_read(tp, reg);
-- 
2.20.1


From 0b361cf04266d729d8496d770b8d79fa022eabfb Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Wed, 2 Oct 2019 16:53:21 +0300
Subject: [PATCH 40/55] net/mlx5e: Fix ethtool self test: link speed

[ Upstream commit 534e7366f41b0c689b01af4375aefcd1462adedf ]

Ethtool self test contains a test for link speed. This test reads the
PTYS register and determines whether the current speed is valid or not.
Change current implementation to use the function mlx5e_port_linkspeed()
that does the same check and fails when speed is invalid. This code
redundancy lead to a bug when mlx5e_port_linkspeed() was updated with
expended speeds and the self test was not.

Fixes: 2c81bfd5ae56 ("net/mlx5e: Move port speed code from en_ethtool.c to en/port.c")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_selftest.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 840ec945ccba..bbff8d8ded76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -35,6 +35,7 @@
 #include <linux/udp.h>
 #include <net/udp.h>
 #include "en.h"
+#include "en/port.h"
 
 enum {
 	MLX5E_ST_LINK_STATE,
@@ -80,22 +81,12 @@ static int mlx5e_test_link_state(struct mlx5e_priv *priv)
 
 static int mlx5e_test_link_speed(struct mlx5e_priv *priv)
 {
-	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
-	u32 eth_proto_oper;
-	int i;
+	u32 speed;
 
 	if (!netif_carrier_ok(priv->netdev))
 		return 1;
 
-	if (mlx5_query_port_ptys(priv->mdev, out, sizeof(out), MLX5_PTYS_EN, 1))
-		return 1;
-
-	eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
-	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; i++) {
-		if (eth_proto_oper & MLX5E_PROT_MASK(i))
-			return 0;
-	}
-	return 1;
+	return mlx5e_port_linkspeed(priv->mdev, &speed);
 }
 
 struct mlx5ehdr {
-- 
2.20.1


From c10b1df56f659cea091da620c1a84a2f2fc0c872 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Mon, 16 Sep 2019 14:54:20 +0300
Subject: [PATCH 41/55] net/mlx5e: Fix handling of compressed CQEs in case of
 low NAPI budget

[ Upstream commit 9df86bdb6746d7fcfc2fda715f7a7c3d0ddb2654 ]

When CQE compression is enabled, compressed CQEs use the following
structure: a title is followed by one or many blocks, each containing 8
mini CQEs (except the last, which may contain fewer mini CQEs).

Due to NAPI budget restriction, a complete structure is not always
parsed in one NAPI run, and some blocks with mini CQEs may be deferred
to the next NAPI poll call - we have the mlx5e_decompress_cqes_cont call
in the beginning of mlx5e_poll_rx_cq. However, if the budget is
extremely low, some blocks may be left even after that, but the code
that follows the mlx5e_decompress_cqes_cont call doesn't check it and
assumes that a new CQE begins, which may not be the case. In such cases,
random memory corruptions occur.

An extremely low NAPI budget of 8 is used when busy_poll or busy_read is
active.

This commit adds a check to make sure that the previous compressed CQE
has been completely parsed after mlx5e_decompress_cqes_cont, otherwise
it prevents a new CQE from being fetched in the middle of a compressed
CQE.

This commit fixes random crashes in __build_skb, __page_pool_put_page
and other not-related-directly places, that used to happen when both CQE
compression and busy_poll/busy_read were enabled.

Fixes: 7219ab34f184 ("net/mlx5e: CQE compression")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ac6e586d403d..fb139f8b9acf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1367,8 +1367,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return 0;
 
-	if (rq->cqd.left)
+	if (rq->cqd.left) {
 		work_done += mlx5e_decompress_cqes_cont(rq, cqwq, 0, budget);
+		if (rq->cqd.left || work_done >= budget)
+			goto out;
+	}
 
 	cqe = mlx5_cqwq_get_cqe(cqwq);
 	if (!cqe) {
-- 
2.20.1


From f34965e2d568cde84a2fe2d80b5ebb29ca32d6e9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 19 Oct 2019 09:26:37 -0700
Subject: [PATCH 42/55] ipv4: fix IPSKB_FRAG_PMTU handling with fragmentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit e7a409c3f46cb0dbc7bfd4f6f9421d53e92614a5 ]

This patch removes the iph field from the state structure, which is not
properly initialized. Instead, add a new field to make the "do we want
to set DF" be the state bit and move the code to set the DF flag from
ip_frag_next().

Joint work with Pablo and Linus.

Fixes: 19c3401a917b ("net: ipv4: place control buffer handling away from fragmentation iterators")
Reported-by: Patrick Schönthaler <patrick@notvads.ovh>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h                           |  4 ++--
 net/bridge/netfilter/nf_conntrack_bridge.c |  2 +-
 net/ipv4/ip_output.c                       | 11 ++++++-----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 29d89de39822..e6609ab69161 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -184,7 +184,7 @@ static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
 }
 
 struct ip_frag_state {
-	struct iphdr	*iph;
+	bool		DF;
 	unsigned int	hlen;
 	unsigned int	ll_rs;
 	unsigned int	mtu;
@@ -195,7 +195,7 @@ struct ip_frag_state {
 };
 
 void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs,
-		  unsigned int mtu, struct ip_frag_state *state);
+		  unsigned int mtu, bool DF, struct ip_frag_state *state);
 struct sk_buff *ip_frag_next(struct sk_buff *skb,
 			     struct ip_frag_state *state);
 
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 4f5444d2a526..b3866536a0da 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -94,7 +94,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 	 * This may also be a clone skbuff, we could preserve the geometry for
 	 * the copies but probably not worth the effort.
 	 */
-	ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+	ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state);
 
 	while (state.left > 0) {
 		struct sk_buff *skb2;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index da521790cd63..be64c4a210fa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -645,11 +645,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
 EXPORT_SYMBOL(ip_fraglist_prepare);
 
 void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
-		  unsigned int ll_rs, unsigned int mtu,
+		  unsigned int ll_rs, unsigned int mtu, bool DF,
 		  struct ip_frag_state *state)
 {
 	struct iphdr *iph = ip_hdr(skb);
 
+	state->DF = DF;
 	state->hlen = hlen;
 	state->ll_rs = ll_rs;
 	state->mtu = mtu;
@@ -668,9 +669,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
 	/* Copy the flags to each fragment. */
 	IPCB(to)->flags = IPCB(from)->flags;
 
-	if (IPCB(from)->flags & IPSKB_FRAG_PMTU)
-		state->iph->frag_off |= htons(IP_DF);
-
 	/* ANK: dirty, but effective trick. Upgrade options only if
 	 * the segment to be fragmented was THE FIRST (otherwise,
 	 * options are already fixed) and make it ONCE
@@ -738,6 +736,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
 	 */
 	iph = ip_hdr(skb2);
 	iph->frag_off = htons((state->offset >> 3));
+	if (state->DF)
+		iph->frag_off |= htons(IP_DF);
 
 	/*
 	 *	Added AC : If we are fragmenting a fragment that's not the
@@ -881,7 +881,8 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	 *	Fragment the datagram.
 	 */
 
-	ip_frag_init(skb, hlen, ll_rs, mtu, &state);
+	ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+		     &state);
 
 	/*
 	 *	Keep copying data until we run out.
-- 
2.20.1


From 5f18f8f20eb769fbe6578856eb7b054981453d7d Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:29 -0700
Subject: [PATCH 43/55] net: bcmgenet: don't set phydev->link from MAC

[ Upstream commit 7de48402faa32298c3551ea32c76ccb4f9d3025d ]

When commit 28b2e0d2cd13 ("net: phy: remove parameter new_link from
phy_mac_interrupt()") removed the new_link parameter it set the
phydev->link state from the MAC before invoking phy_mac_interrupt().

However, once commit 88d6272acaaa ("net: phy: avoid unneeded MDIO
reads in genphy_read_status") was added this initialization prevents
the proper determination of the connection parameters by the function
genphy_read_status().

This commit removes that initialization to restore the proper
functionality.

Fixes: 88d6272acaaa ("net: phy: avoid unneeded MDIO reads in genphy_read_status")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index b22196880d6d..f7359d2271df 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2617,10 +2617,8 @@ static void bcmgenet_irq_task(struct work_struct *work)
 	spin_unlock_irq(&priv->lock);
 
 	/* Link UP/DOWN event */
-	if (status & UMAC_IRQ_LINK_EVENT) {
-		priv->dev->phydev->link = !!(status & UMAC_IRQ_LINK_UP);
+	if (status & UMAC_IRQ_LINK_EVENT)
 		phy_mac_interrupt(priv->dev->phydev);
-	}
 }
 
 /* bcmgenet_isr1: handle Rx and Tx priority queues */
-- 
2.20.1


From 4f856960165370a7f69b885cfdae169a15a5042e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sat, 5 Oct 2019 15:05:18 -0700
Subject: [PATCH 44/55] net: dsa: b53: Do not clear existing mirrored port mask

[ Upstream commit c763ac436b668d7417f0979430ec0312ede4093d ]

Clearing the existing bitmask of mirrored ports essentially prevents us
from capturing more than one port at any given time. This is clearly
wrong, do not clear the bitmask prior to setting up the new port.

Reported-by: Hubert Feurstein <h.feurstein@gmail.com>
Fixes: ed3af5fd08eb ("net: dsa: b53: Add support for port mirroring")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 907af62846ba..0721c22e2bc8 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1808,7 +1808,6 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
 		loc = B53_EG_MIR_CTL;
 
 	b53_read16(dev, B53_MGMT_PAGE, loc, &reg);
-	reg &= ~MIRROR_MASK;
 	reg |= BIT(port);
 	b53_write16(dev, B53_MGMT_PAGE, loc, reg);
 
-- 
2.20.1


From 3abecc13b1d69b54291e9ce0f5f46457e662f853 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Fri, 18 Oct 2019 17:02:46 -0400
Subject: [PATCH 45/55] net: dsa: fix switch tree list

[ Upstream commit 50c7d2ba9de20f60a2d527ad6928209ef67e4cdd ]

If there are multiple switch trees on the device, only the last one
will be listed, because the arguments of list_add_tail are swapped.

Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation")
Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 96f787cf9b6e..130f1a343abb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -46,7 +46,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
 	dst->index = index;
 
 	INIT_LIST_HEAD(&dst->list);
-	list_add_tail(&dsa_tree_list, &dst->list);
+	list_add_tail(&dst->list, &dsa_tree_list);
 
 	kref_init(&dst->refcount);
 
-- 
2.20.1


From c1859a8574f3cbf37ed546f64648dd306a22af2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Oct 2019 18:00:56 -0700
Subject: [PATCH 46/55] net: ensure correct skb->tstamp in various fragmenters

[ Upstream commit 9669fffc1415bb0c30e5d2ec98a8e1c3a418cb9c ]

Thomas found that some forwarded packets would be stuck
in FQ packet scheduler because their skb->tstamp contained
timestamps far in the future.

We thought we addressed this point in commit 8203e2d844d3
("net: clear skb->tstamp in forwarding paths") but there
is still an issue when/if a packet needs to be fragmented.

In order to meet EDT requirements, we have to make sure all
fragments get the original skb->tstamp.

Note that this original skb->tstamp should be zero in
forwarding path, but might have a non zero value in
output path if user decided so.

Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Thomas Bartschies <Thomas.Bartschies@cvk.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/nf_conntrack_bridge.c | 3 +++
 net/ipv4/ip_output.c                       | 3 +++
 net/ipv6/ip6_output.c                      | 3 +++
 net/ipv6/netfilter.c                       | 3 +++
 4 files changed, 12 insertions(+)

diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index b3866536a0da..a48cb1baeac6 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -34,6 +34,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 {
 	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
 	unsigned int hlen, ll_rs, mtu;
+	ktime_t tstamp = skb->tstamp;
 	struct ip_frag_state state;
 	struct iphdr *iph;
 	int err;
@@ -81,6 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 			if (iter.frag)
 				ip_fraglist_prepare(skb, &iter);
 
+			skb->tstamp = tstamp;
 			err = output(net, sk, data, skb);
 			if (err || !iter.frag)
 				break;
@@ -105,6 +107,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 			goto blackhole;
 		}
 
+		skb2->tstamp = tstamp;
 		err = output(net, sk, data, skb2);
 		if (err)
 			goto blackhole;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index be64c4a210fa..e780ceab16e1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -771,6 +771,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	struct rtable *rt = skb_rtable(skb);
 	unsigned int mtu, hlen, ll_rs;
 	struct ip_fraglist_iter iter;
+	ktime_t tstamp = skb->tstamp;
 	struct ip_frag_state state;
 	int err = 0;
 
@@ -846,6 +847,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 				ip_fraglist_prepare(skb, &iter);
 			}
 
+			skb->tstamp = tstamp;
 			err = output(net, sk, skb);
 
 			if (!err)
@@ -901,6 +903,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		/*
 		 *	Put this fragment into the sending queue.
 		 */
+		skb2->tstamp = tstamp;
 		err = output(net, sk, skb2);
 		if (err)
 			goto fail;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8e49fd62eea9..e71568f730f9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -768,6 +768,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 				inet6_sk(skb->sk) : NULL;
 	struct ip6_frag_state state;
 	unsigned int mtu, hlen, nexthdr_offset;
+	ktime_t tstamp = skb->tstamp;
 	int hroom, err = 0;
 	__be32 frag_id;
 	u8 *prevhdr, nexthdr = 0;
@@ -855,6 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			if (iter.frag)
 				ip6_fraglist_prepare(skb, &iter);
 
+			skb->tstamp = tstamp;
 			err = output(net, sk, skb);
 			if (!err)
 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -913,6 +915,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		/*
 		 *	Put this fragment into the sending queue.
 		 */
+		frag->tstamp = tstamp;
 		err = output(net, sk, frag);
 		if (err)
 			goto fail;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 61819ed858b1..7e75d01464fb 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -119,6 +119,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 				  struct sk_buff *))
 {
 	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	ktime_t tstamp = skb->tstamp;
 	struct ip6_frag_state state;
 	u8 *prevhdr, nexthdr = 0;
 	unsigned int mtu, hlen;
@@ -183,6 +184,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			if (iter.frag)
 				ip6_fraglist_prepare(skb, &iter);
 
+			skb->tstamp = tstamp;
 			err = output(net, sk, data, skb);
 			if (err || !iter.frag)
 				break;
@@ -215,6 +217,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 			goto blackhole;
 		}
 
+		skb2->tstamp = tstamp;
 		err = output(net, sk, data, skb2);
 		if (err)
 			goto blackhole;
-- 
2.20.1


From 0e46b14cae9201e6d8161c9a5b2d55347a574937 Mon Sep 17 00:00:00 2001
From: Yonglong Liu <liuyonglong@huawei.com>
Date: Fri, 18 Oct 2019 11:42:59 +0800
Subject: [PATCH 47/55] net: hns3: fix mis-counting IRQ vector numbers issue

[ Upstream commit 580a05f9d4ada3bfb689140d0efec1efdb8a48da ]

Currently, the num_msi_left means the vector numbers of NIC,
but if the PF supported RoCE, it contains the vector numbers
of NIC and RoCE(Not expected).

This may cause interrupts lost in some case, because of the
NIC module used the vector resources which belongs to RoCE.

This patch adds a new variable num_nic_msi to store the vector
numbers of NIC, and adjust the default TQP numbers and rss_size
according to the value of num_nic_msi.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yonglong Liu <liuyonglong@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  2 ++
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 21 +++++++++++++-
 .../hisilicon/hns3/hns3pf/hclge_main.h        |  1 +
 .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 11 ++++++--
 .../hisilicon/hns3/hns3vf/hclgevf_main.c      | 28 +++++++++++++++++--
 .../hisilicon/hns3/hns3vf/hclgevf_main.h      |  1 +
 6 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 48c7b70fc2c4..58a7d62b38de 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -32,6 +32,8 @@
 
 #define HNAE3_MOD_VERSION "1.0"
 
+#define HNAE3_MIN_VECTOR_NUM	2 /* first one for misc, another for IO */
+
 /* Device IDs */
 #define HNAE3_DEV_ID_GE				0xA220
 #define HNAE3_DEV_ID_25GE			0xA221
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3fde5471e1c0..65b53ec1d9ca 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -800,6 +800,9 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
 		hnae3_get_field(__le16_to_cpu(req->pf_intr_vector_number),
 				HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S);
 
+		/* nic's msix numbers is always equals to the roce's. */
+		hdev->num_nic_msi = hdev->num_roce_msi;
+
 		/* PF should have NIC vectors and Roce vectors,
 		 * NIC vectors are queued before Roce vectors.
 		 */
@@ -809,6 +812,15 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
 		hdev->num_msi =
 		hnae3_get_field(__le16_to_cpu(req->pf_intr_vector_number),
 				HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S);
+
+		hdev->num_nic_msi = hdev->num_msi;
+	}
+
+	if (hdev->num_nic_msi < HNAE3_MIN_VECTOR_NUM) {
+		dev_err(&hdev->pdev->dev,
+			"Just %u msi resources, not enough for pf(min:2).\n",
+			hdev->num_nic_msi);
+		return -EINVAL;
 	}
 
 	return 0;
@@ -1394,6 +1406,10 @@ static int  hclge_assign_tqp(struct hclge_vport *vport, u16 num_tqps)
 	kinfo->rss_size = min_t(u16, hdev->rss_size_max,
 				vport->alloc_tqps / hdev->tm_info.num_tc);
 
+	/* ensure one to one mapping between irq and queue at default */
+	kinfo->rss_size = min_t(u16, kinfo->rss_size,
+				(hdev->num_nic_msi - 1) / hdev->tm_info.num_tc);
+
 	return 0;
 }
 
@@ -2172,7 +2188,8 @@ static int hclge_init_msi(struct hclge_dev *hdev)
 	int vectors;
 	int i;
 
-	vectors = pci_alloc_irq_vectors(pdev, 1, hdev->num_msi,
+	vectors = pci_alloc_irq_vectors(pdev, HNAE3_MIN_VECTOR_NUM,
+					hdev->num_msi,
 					PCI_IRQ_MSI | PCI_IRQ_MSIX);
 	if (vectors < 0) {
 		dev_err(&pdev->dev,
@@ -2187,6 +2204,7 @@ static int hclge_init_msi(struct hclge_dev *hdev)
 
 	hdev->num_msi = vectors;
 	hdev->num_msi_left = vectors;
+
 	hdev->base_msi_vector = pdev->irq;
 	hdev->roce_base_vector = hdev->base_msi_vector +
 				hdev->roce_base_msix_offset;
@@ -3644,6 +3662,7 @@ static int hclge_get_vector(struct hnae3_handle *handle, u16 vector_num,
 	int alloc = 0;
 	int i, j;
 
+	vector_num = min_t(u16, hdev->num_nic_msi - 1, vector_num);
 	vector_num = min(hdev->num_msi_left, vector_num);
 
 	for (j = 0; j < vector_num; j++) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 6a12285f4c76..6dc66d3f8408 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -795,6 +795,7 @@ struct hclge_dev {
 	u32 base_msi_vector;
 	u16 *vector_status;
 	int *vector_irq;
+	u16 num_nic_msi;	/* Num of nic vectors for this PF */
 	u16 num_roce_msi;	/* Num of roce vectors for this PF */
 	int roce_base_vector;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 3f41fa2bc414..856337705949 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -540,9 +540,16 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport)
 		kinfo->rss_size = kinfo->req_rss_size;
 	} else if (kinfo->rss_size > max_rss_size ||
 		   (!kinfo->req_rss_size && kinfo->rss_size < max_rss_size)) {
+		/* if user not set rss, the rss_size should compare with the
+		 * valid msi numbers to ensure one to one map between tqp and
+		 * irq as default.
+		 */
+		if (!kinfo->req_rss_size)
+			max_rss_size = min_t(u16, max_rss_size,
+					     (hdev->num_nic_msi - 1) /
+					     kinfo->num_tc);
+
 		/* Set to the maximum specification value (max_rss_size). */
-		dev_info(&hdev->pdev->dev, "rss changes from %d to %d\n",
-			 kinfo->rss_size, max_rss_size);
 		kinfo->rss_size = max_rss_size;
 	}
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index a13a0e101c3b..b094d4e9ba2d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -411,6 +411,13 @@ static int hclgevf_knic_setup(struct hclgevf_dev *hdev)
 		kinfo->tqp[i] = &hdev->htqp[i].q;
 	}
 
+	/* after init the max rss_size and tqps, adjust the default tqp numbers
+	 * and rss size with the actual vector numbers
+	 */
+	kinfo->num_tqps = min_t(u16, hdev->num_nic_msix - 1, kinfo->num_tqps);
+	kinfo->rss_size = min_t(u16, kinfo->num_tqps / kinfo->num_tc,
+				kinfo->rss_size);
+
 	return 0;
 }
 
@@ -502,6 +509,7 @@ static int hclgevf_get_vector(struct hnae3_handle *handle, u16 vector_num,
 	int alloc = 0;
 	int i, j;
 
+	vector_num = min_t(u16, hdev->num_nic_msix - 1, vector_num);
 	vector_num = min(hdev->num_msi_left, vector_num);
 
 	for (j = 0; j < vector_num; j++) {
@@ -2208,13 +2216,14 @@ static int hclgevf_init_msi(struct hclgevf_dev *hdev)
 	int vectors;
 	int i;
 
-	if (hnae3_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B))
+	if (hnae3_dev_roce_supported(hdev))
 		vectors = pci_alloc_irq_vectors(pdev,
 						hdev->roce_base_msix_offset + 1,
 						hdev->num_msi,
 						PCI_IRQ_MSIX);
 	else
-		vectors = pci_alloc_irq_vectors(pdev, 1, hdev->num_msi,
+		vectors = pci_alloc_irq_vectors(pdev, HNAE3_MIN_VECTOR_NUM,
+						hdev->num_msi,
 						PCI_IRQ_MSI | PCI_IRQ_MSIX);
 
 	if (vectors < 0) {
@@ -2230,6 +2239,7 @@ static int hclgevf_init_msi(struct hclgevf_dev *hdev)
 
 	hdev->num_msi = vectors;
 	hdev->num_msi_left = vectors;
+
 	hdev->base_msi_vector = pdev->irq;
 	hdev->roce_base_vector = pdev->irq + hdev->roce_base_msix_offset;
 
@@ -2495,7 +2505,7 @@ static int hclgevf_query_vf_resource(struct hclgevf_dev *hdev)
 
 	req = (struct hclgevf_query_res_cmd *)desc.data;
 
-	if (hnae3_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B)) {
+	if (hnae3_dev_roce_supported(hdev)) {
 		hdev->roce_base_msix_offset =
 		hnae3_get_field(__le16_to_cpu(req->msixcap_localid_ba_rocee),
 				HCLGEVF_MSIX_OFT_ROCEE_M,
@@ -2504,6 +2514,9 @@ static int hclgevf_query_vf_resource(struct hclgevf_dev *hdev)
 		hnae3_get_field(__le16_to_cpu(req->vf_intr_vector_number),
 				HCLGEVF_VEC_NUM_M, HCLGEVF_VEC_NUM_S);
 
+		/* nic's msix numbers is always equals to the roce's. */
+		hdev->num_nic_msix = hdev->num_roce_msix;
+
 		/* VF should have NIC vectors and Roce vectors, NIC vectors
 		 * are queued before Roce vectors. The offset is fixed to 64.
 		 */
@@ -2513,6 +2526,15 @@ static int hclgevf_query_vf_resource(struct hclgevf_dev *hdev)
 		hdev->num_msi =
 		hnae3_get_field(__le16_to_cpu(req->vf_intr_vector_number),
 				HCLGEVF_VEC_NUM_M, HCLGEVF_VEC_NUM_S);
+
+		hdev->num_nic_msix = hdev->num_msi;
+	}
+
+	if (hdev->num_nic_msix < HNAE3_MIN_VECTOR_NUM) {
+		dev_err(&hdev->pdev->dev,
+			"Just %u msi resources, not enough for vf(min:2).\n",
+			hdev->num_nic_msix);
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
index 5a9e30998a8f..3c90cff0e43a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
@@ -265,6 +265,7 @@ struct hclgevf_dev {
 	u16 num_msi;
 	u16 num_msi_left;
 	u16 num_msi_used;
+	u16 num_nic_msix;	/* Num of nic vectors for this VF */
 	u16 num_roce_msix;	/* Num of roce vectors for this VF */
 	u16 roce_base_msix_offset;
 	int roce_base_vector;
-- 
2.20.1


From f9a650acf2220a82748ad2efb24f53251770651e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 18 Oct 2019 09:16:57 -0700
Subject: [PATCH 48/55] net: netem: fix error path for corrupted GSO frames

[ Upstream commit a7fa12d15855904aff1716e1fc723c03ba38c5cc ]

To corrupt a GSO frame we first perform segmentation.  We then
proceed using the first segment instead of the full GSO skb and
requeue the rest of the segments as separate packets.

If there are any issues with processing the first segment we
still want to process the rest, therefore we jump to the
finish_segs label.

Commit 177b8007463c ("net: netem: fix backlog accounting for
corrupted GSO frames") started using the pointer to the first
segment in the "rest of segments processing", but as mentioned
above the first segment may had already been freed at this point.

Backlog corrections for parent qdiscs have to be adjusted.

Fixes: 177b8007463c ("net: netem: fix backlog accounting for corrupted GSO frames")
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reported-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index f5cb35e550f8..dad123054ce1 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
 		    skb_checksum_help(skb)) {
 			qdisc_drop(skb, sch, to_free);
+			skb = NULL;
 			goto finish_segs;
 		}
 
@@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 finish_segs:
 	if (segs) {
 		unsigned int len, last_len;
-		int nb = 0;
+		int nb;
 
-		len = skb->len;
+		len = skb ? skb->len : 0;
+		nb = skb ? 1 : 0;
 
 		while (segs) {
 			skb2 = segs->next;
@@ -612,7 +614,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			}
 			segs = skb2;
 		}
-		qdisc_tree_reduce_backlog(sch, -nb, prev_len - len);
+		/* Parent qdiscs accounted for 1 skb of size @prev_len */
+		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
 	}
 	return NET_XMIT_SUCCESS;
 }
-- 
2.20.1


From 351951d03fa8fe7a4743b3478acb1b15f37b386c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Oct 2019 15:20:05 -0700
Subject: [PATCH 49/55] net: reorder 'struct net' fields to avoid false sharing

[ Upstream commit 2a06b8982f8f2f40d03a3daf634676386bd84dbc ]

Intel test robot reported a ~7% regression on TCP_CRR tests
that they bisected to the cited commit.

Indeed, every time a new TCP socket is created or deleted,
the atomic counter net->count is touched (via get_net(net)
and put_net(net) calls)

So cpus might have to reload a contended cache line in
net_hash_mix(net) calls.

We need to reorder 'struct net' fields to move @hash_mix
in a read mostly cache line.

We move in the first cache line fields that can be
dirtied often.

We probably will have to address in a followup patch
the __randomize_layout that was added in linux-4.13,
since this might break our placement choices.

Fixes: 355b98553789 ("netns: provide pure entropy for net_hash_mix()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f1f3438ed13a..8f8b37198f9b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -52,6 +52,9 @@ struct bpf_prog;
 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 
 struct net {
+	/* First cache line can be often dirtied.
+	 * Do not place here read-mostly fields.
+	 */
 	refcount_t		passive;	/* To decide when the network
 						 * namespace should be freed.
 						 */
@@ -60,7 +63,13 @@ struct net {
 						 */
 	spinlock_t		rules_mod_lock;
 
-	u32			hash_mix;
+	unsigned int		dev_unreg_count;
+
+	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
+	int			ifindex;
+
+	spinlock_t		nsid_lock;
+	atomic_t		fnhe_genid;
 
 	struct list_head	list;		/* list of network namespaces */
 	struct list_head	exit_list;	/* To linked to call pernet exit
@@ -76,11 +85,11 @@ struct net {
 #endif
 	struct user_namespace   *user_ns;	/* Owning user namespace */
 	struct ucounts		*ucounts;
-	spinlock_t		nsid_lock;
 	struct idr		netns_ids;
 
 	struct ns_common	ns;
 
+	struct list_head 	dev_base_head;
 	struct proc_dir_entry 	*proc_net;
 	struct proc_dir_entry 	*proc_net_stat;
 
@@ -93,12 +102,14 @@ struct net {
 
 	struct uevent_sock	*uevent_sock;		/* uevent socket */
 
-	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
-	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
-	int			ifindex;
-	unsigned int		dev_unreg_count;
+	/* Note that @hash_mix can be read millions times per second,
+	 * it is critical that it is on a read_mostly cache line.
+	 */
+	u32			hash_mix;
+
+	struct net_device       *loopback_dev;          /* The loopback */
 
 	/* core fib_rules */
 	struct list_head	rules_ops;
@@ -106,7 +117,6 @@ struct net {
 	struct list_head	fib_notifier_ops;  /* Populated by
 						    * register_pernet_subsys()
 						    */
-	struct net_device       *loopback_dev;          /* The loopback */
 	struct netns_core	core;
 	struct netns_mib	mib;
 	struct netns_packet	packet;
@@ -171,7 +181,6 @@ struct net {
 	struct netns_xdp	xdp;
 #endif
 	struct sock		*diag_nlsk;
-	atomic_t		fnhe_genid;
 } __randomize_layout;
 
 #include <linux/seq_file_net.h>
-- 
2.20.1


From 512b6f14c2519a8116b0af6f850c34c738a086cf Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 17 Oct 2019 21:29:26 +0200
Subject: [PATCH 50/55] net: usb: lan78xx: Connect PHY before registering MAC

[ Upstream commit 38b4fe320119859c11b1dc06f6b4987a16344fa1 ]

As soon as the netdev is registers, the kernel can start using the
interface. If the driver connects the MAC to the PHY after the netdev
is registered, there is a race condition where the interface can be
opened without having the PHY connected.

Change the order to close this race condition.

Fixes: 92571a1aae40 ("lan78xx: Connect phy early")
Reported-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index dd9b3f4c4561..7dd6289b1ffc 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3792,10 +3792,14 @@ static int lan78xx_probe(struct usb_interface *intf,
 	/* driver requires remote-wakeup capability during autosuspend. */
 	intf->needs_remote_wakeup = 1;
 
+	ret = lan78xx_phy_init(dev);
+	if (ret < 0)
+		goto out4;
+
 	ret = register_netdev(netdev);
 	if (ret != 0) {
 		netif_err(dev, probe, netdev, "couldn't register the device\n");
-		goto out4;
+		goto out5;
 	}
 
 	usb_set_intfdata(intf, dev);
@@ -3808,14 +3812,10 @@ static int lan78xx_probe(struct usb_interface *intf,
 	pm_runtime_set_autosuspend_delay(&udev->dev,
 					 DEFAULT_AUTOSUSPEND_DELAY);
 
-	ret = lan78xx_phy_init(dev);
-	if (ret < 0)
-		goto out5;
-
 	return 0;
 
 out5:
-	unregister_netdev(netdev);
+	phy_disconnect(netdev->phydev);
 out4:
 	usb_free_urb(dev->urb_intr);
 out3:
-- 
2.20.1


From 89b0aac29bf3e661660a78f9ee3f6e506f976b00 Mon Sep 17 00:00:00 2001
From: Kazutoshi Noguchi <noguchi.kazutosi@gmail.com>
Date: Mon, 21 Oct 2019 00:03:07 +0900
Subject: [PATCH 51/55] r8152: add device id for Lenovo ThinkPad USB-C Dock Gen
 2

[ Upstream commit b3060531979422d5bb18d80226f978910284dc70 ]

This device is sold as 'ThinkPad USB-C Dock Gen 2 (40AS)'.
Chipset is RTL8153 and works with r8152.
Without this, the generic cdc_ether grabs the device, and the device jam
connected networks up when the machine suspends.

Signed-off-by: Kazutoshi Noguchi <noguchi.kazutosi@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ether.c | 7 +++++++
 drivers/net/usb/r8152.c     | 1 +
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 32f53de5b1fe..fe630438f67b 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -787,6 +787,13 @@ static const struct usb_device_id	products[] = {
 	.driver_info = 0,
 },
 
+/* ThinkPad USB-C Dock Gen 2 (based on Realtek RTL8153) */
+{
+	USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0xa387, USB_CLASS_COMM,
+			USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+	.driver_info = 0,
+},
+
 /* NVIDIA Tegra USB 3.0 Ethernet Adapters (based on Realtek RTL8153) */
 {
 	USB_DEVICE_AND_INTERFACE_INFO(NVIDIA_VENDOR_ID, 0x09ff, USB_CLASS_COMM,
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 9eedc0714422..7661d7475c2a 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -5402,6 +5402,7 @@ static const struct usb_device_id rtl8152_table[] = {
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x7205)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x720c)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x7214)},
+	{REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0xa387)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_LINKSYS, 0x0041)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_NVIDIA,  0x09ff)},
 	{REALTEK_USB_DEVICE(VENDOR_ID_TPLINK,  0x0601)},
-- 
2.20.1


From 14b9209bdea619e3d3ced5dc8d7db4aa55c57214 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 18 Oct 2019 09:16:58 -0700
Subject: [PATCH 52/55] net: netem: correct the parent's backlog when corrupted
 packet was dropped

[ Upstream commit e0ad032e144731a5928f2d75e91c2064ba1a764c ]

If packet corruption failed we jump to finish_segs and return
NET_XMIT_SUCCESS. Seeing success will make the parent qdisc
increment its backlog, that's incorrect - we need to return
NET_XMIT_DROP.

Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index dad123054ce1..e6e6e5ae6d2b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -616,6 +616,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 		/* Parent qdiscs accounted for 1 skb of size @prev_len */
 		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+	} else if (!skb) {
+		return NET_XMIT_DROP;
 	}
 	return NET_XMIT_SUCCESS;
 }
-- 
2.20.1


From 35a1afcfbbbec490821bd424da831840bfdc5cdc Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:30 -0700
Subject: [PATCH 53/55] net: phy: bcm7xxx: define soft_reset for 40nm EPHY

[ Upstream commit fe586b823372a9f43f90e2c6aa0573992ce7ccb7 ]

The internal 40nm EPHYs use a "Workaround for putting the PHY in
IDDQ mode." These PHYs require a soft reset to restore functionality
after they are powered back up.

This commit defines the soft_reset function to use genphy_soft_reset
during phy_init_hw to accommodate this.

Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm7xxx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index 8fc33867e524..af8eabe7a6d4 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -572,6 +572,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev)
 	.name           = _name,					\
 	/* PHY_BASIC_FEATURES */					\
 	.flags          = PHY_IS_INTERNAL,				\
+	.soft_reset	= genphy_soft_reset,				\
 	.config_init    = bcm7xxx_config_init,				\
 	.suspend        = bcm7xxx_suspend,				\
 	.resume         = bcm7xxx_config_init,				\
-- 
2.20.1


From 9fefa0e18095f5085e1ff2a836397ab86011c0fd Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:31 -0700
Subject: [PATCH 54/55] net: bcmgenet: soft reset 40nm EPHYs before MAC init

[ Upstream commit 1f515486275a08a17a2c806b844cca18f7de5b34 ]

It turns out that the "Workaround for putting the PHY in IDDQ mode"
used by the internal EPHYs on 40nm Set-Top Box chips when powering
down puts the interface to the GENET MAC in a state that can cause
subsequent MAC resets to be incomplete.

Rather than restore the forced soft reset when powering up internal
PHYs, this commit moves the invocation of phy_init_hw earlier in
the MAC initialization sequence to just before the MAC reset in the
open and resume functions. This allows the interface to be stable
and allows the MAC resets to be successful.

The bcmgenet_mii_probe() function is split in two to accommodate
this. The new function bcmgenet_mii_connect() handles the first
half of the functionality before the MAC initialization, and the
bcmgenet_mii_config() function is extended to provide the remaining
PHY configuration following the MAC initialization.

Fixes: 484bfa1507bf ("Revert "net: bcmgenet: Software reset EPHY after power on"")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/broadcom/genet/bcmgenet.c    |  28 +++--
 .../net/ethernet/broadcom/genet/bcmgenet.h    |   2 +-
 drivers/net/ethernet/broadcom/genet/bcmmii.c  | 112 ++++++++----------
 3 files changed, 69 insertions(+), 73 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index f7359d2271df..6f9ae436a6f4 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2877,6 +2877,12 @@ static int bcmgenet_open(struct net_device *dev)
 	if (priv->internal_phy)
 		bcmgenet_power_up(priv, GENET_POWER_PASSIVE);
 
+	ret = bcmgenet_mii_connect(dev);
+	if (ret) {
+		netdev_err(dev, "failed to connect to PHY\n");
+		goto err_clk_disable;
+	}
+
 	/* take MAC out of reset */
 	bcmgenet_umac_reset(priv);
 
@@ -2886,6 +2892,12 @@ static int bcmgenet_open(struct net_device *dev)
 	reg = bcmgenet_umac_readl(priv, UMAC_CMD);
 	priv->crc_fwd_en = !!(reg & CMD_CRC_FWD);
 
+	ret = bcmgenet_mii_config(dev, true);
+	if (ret) {
+		netdev_err(dev, "unsupported PHY\n");
+		goto err_disconnect_phy;
+	}
+
 	bcmgenet_set_hw_addr(priv, dev->dev_addr);
 
 	if (priv->internal_phy) {
@@ -2901,7 +2913,7 @@ static int bcmgenet_open(struct net_device *dev)
 	ret = bcmgenet_init_dma(priv);
 	if (ret) {
 		netdev_err(dev, "failed to initialize DMA\n");
-		goto err_clk_disable;
+		goto err_disconnect_phy;
 	}
 
 	/* Always enable ring 16 - descriptor ring */
@@ -2924,25 +2936,19 @@ static int bcmgenet_open(struct net_device *dev)
 		goto err_irq0;
 	}
 
-	ret = bcmgenet_mii_probe(dev);
-	if (ret) {
-		netdev_err(dev, "failed to connect to PHY\n");
-		goto err_irq1;
-	}
-
 	bcmgenet_netif_start(dev);
 
 	netif_tx_start_all_queues(dev);
 
 	return 0;
 
-err_irq1:
-	free_irq(priv->irq1, priv);
 err_irq0:
 	free_irq(priv->irq0, priv);
 err_fini_dma:
 	bcmgenet_dma_teardown(priv);
 	bcmgenet_fini_dma(priv);
+err_disconnect_phy:
+	phy_disconnect(dev->phydev);
 err_clk_disable:
 	if (priv->internal_phy)
 		bcmgenet_power_down(priv, GENET_POWER_PASSIVE);
@@ -3625,6 +3631,8 @@ static int bcmgenet_resume(struct device *d)
 	if (priv->internal_phy)
 		bcmgenet_power_up(priv, GENET_POWER_PASSIVE);
 
+	phy_init_hw(dev->phydev);
+
 	bcmgenet_umac_reset(priv);
 
 	init_umac(priv);
@@ -3633,8 +3641,6 @@ static int bcmgenet_resume(struct device *d)
 	if (priv->wolopts)
 		clk_disable_unprepare(priv->clk_wol);
 
-	phy_init_hw(dev->phydev);
-
 	/* Speed settings must be restored */
 	bcmgenet_mii_config(priv->dev, false);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index dbc69d8fa05f..7fbf573d8d52 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -720,8 +720,8 @@ GENET_IO_MACRO(rbuf, GENET_RBUF_OFF);
 
 /* MDIO routines */
 int bcmgenet_mii_init(struct net_device *dev);
+int bcmgenet_mii_connect(struct net_device *dev);
 int bcmgenet_mii_config(struct net_device *dev, bool init);
-int bcmgenet_mii_probe(struct net_device *dev);
 void bcmgenet_mii_exit(struct net_device *dev);
 void bcmgenet_phy_power_set(struct net_device *dev, bool enable);
 void bcmgenet_mii_setup(struct net_device *dev);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index e7c291bf4ed1..17bb8d60a157 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -173,6 +173,46 @@ static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv)
 					  bcmgenet_fixed_phy_link_update);
 }
 
+int bcmgenet_mii_connect(struct net_device *dev)
+{
+	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct device_node *dn = priv->pdev->dev.of_node;
+	struct phy_device *phydev;
+	u32 phy_flags = 0;
+	int ret;
+
+	/* Communicate the integrated PHY revision */
+	if (priv->internal_phy)
+		phy_flags = priv->gphy_rev;
+
+	/* Initialize link state variables that bcmgenet_mii_setup() uses */
+	priv->old_link = -1;
+	priv->old_speed = -1;
+	priv->old_duplex = -1;
+	priv->old_pause = -1;
+
+	if (dn) {
+		phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
+					phy_flags, priv->phy_interface);
+		if (!phydev) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
+	} else {
+		phydev = dev->phydev;
+		phydev->dev_flags = phy_flags;
+
+		ret = phy_connect_direct(dev, phydev, bcmgenet_mii_setup,
+					 priv->phy_interface);
+		if (ret) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
 int bcmgenet_mii_config(struct net_device *dev, bool init)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
@@ -266,71 +306,21 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
 		bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL);
 	}
 
-	if (init)
-		dev_info(kdev, "configuring instance for %s\n", phy_name);
-
-	return 0;
-}
+	if (init) {
+		linkmode_copy(phydev->advertising, phydev->supported);
 
-int bcmgenet_mii_probe(struct net_device *dev)
-{
-	struct bcmgenet_priv *priv = netdev_priv(dev);
-	struct device_node *dn = priv->pdev->dev.of_node;
-	struct phy_device *phydev;
-	u32 phy_flags = 0;
-	int ret;
-
-	/* Communicate the integrated PHY revision */
-	if (priv->internal_phy)
-		phy_flags = priv->gphy_rev;
-
-	/* Initialize link state variables that bcmgenet_mii_setup() uses */
-	priv->old_link = -1;
-	priv->old_speed = -1;
-	priv->old_duplex = -1;
-	priv->old_pause = -1;
-
-	if (dn) {
-		phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
-					phy_flags, priv->phy_interface);
-		if (!phydev) {
-			pr_err("could not attach to PHY\n");
-			return -ENODEV;
-		}
-	} else {
-		phydev = dev->phydev;
-		phydev->dev_flags = phy_flags;
-
-		ret = phy_connect_direct(dev, phydev, bcmgenet_mii_setup,
-					 priv->phy_interface);
-		if (ret) {
-			pr_err("could not attach to PHY\n");
-			return -ENODEV;
-		}
-	}
+		/* The internal PHY has its link interrupts routed to the
+		 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
+		 * that prevents the signaling of link UP interrupts when
+		 * the link operates at 10Mbps, so fallback to polling for
+		 * those versions of GENET.
+		 */
+		if (priv->internal_phy && !GENET_IS_V5(priv))
+			phydev->irq = PHY_IGNORE_INTERRUPT;
 
-	/* Configure port multiplexer based on what the probed PHY device since
-	 * reading the 'max-speed' property determines the maximum supported
-	 * PHY speed which is needed for bcmgenet_mii_config() to configure
-	 * things appropriately.
-	 */
-	ret = bcmgenet_mii_config(dev, true);
-	if (ret) {
-		phy_disconnect(dev->phydev);
-		return ret;
+		dev_info(kdev, "configuring instance for %s\n", phy_name);
 	}
 
-	linkmode_copy(phydev->advertising, phydev->supported);
-
-	/* The internal PHY has its link interrupts routed to the
-	 * Ethernet MAC ISRs. On GENETv5 there is a hardware issue
-	 * that prevents the signaling of link UP interrupts when
-	 * the link operates at 10Mbps, so fallback to polling for
-	 * those versions of GENET.
-	 */
-	if (priv->internal_phy && !GENET_IS_V5(priv))
-		dev->phydev->irq = PHY_IGNORE_INTERRUPT;
-
 	return 0;
 }
 
-- 
2.20.1


From c6a607400e5dd5e9c3106996d148755a867ccb45 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Wed, 16 Oct 2019 16:06:32 -0700
Subject: [PATCH 55/55] net: bcmgenet: reset 40nm EPHY on energy detect

[ Upstream commit 25382b991d252aed961cd434176240f9de6bb15f ]

The EPHY integrated into the 40nm Set-Top Box devices can falsely
detect energy when connected to a disabled peer interface. When the
peer interface is enabled the EPHY will detect and report the link
as active, but on occasion may get into a state where it is not
able to exchange data with the connected GENET MAC. This issue has
not been observed when the link parameters are auto-negotiated;
however, it has been observed with a manually configured link.

It has been empirically determined that issuing a soft reset to the
EPHY when energy is detected prevents it from getting into this bad
state.

Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 6f9ae436a6f4..7abf5cf29a84 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2018,6 +2018,8 @@ static void bcmgenet_link_intr_enable(struct bcmgenet_priv *priv)
 	 */
 	if (priv->internal_phy) {
 		int0_enable |= UMAC_IRQ_LINK_EVENT;
+		if (GENET_IS_V1(priv) || GENET_IS_V2(priv) || GENET_IS_V3(priv))
+			int0_enable |= UMAC_IRQ_PHY_DET_R;
 	} else if (priv->ext_phy) {
 		int0_enable |= UMAC_IRQ_LINK_EVENT;
 	} else if (priv->phy_interface == PHY_INTERFACE_MODE_MOCA) {
@@ -2616,9 +2618,14 @@ static void bcmgenet_irq_task(struct work_struct *work)
 	priv->irq0_stat = 0;
 	spin_unlock_irq(&priv->lock);
 
+	if (status & UMAC_IRQ_PHY_DET_R &&
+	    priv->dev->phydev->autoneg != AUTONEG_ENABLE)
+		phy_init_hw(priv->dev->phydev);
+
 	/* Link UP/DOWN event */
 	if (status & UMAC_IRQ_LINK_EVENT)
 		phy_mac_interrupt(priv->dev->phydev);
+
 }
 
 /* bcmgenet_isr1: handle Rx and Tx priority queues */
@@ -2713,7 +2720,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 	}
 
 	/* all other interested interrupts handled in bottom half */
-	status &= UMAC_IRQ_LINK_EVENT;
+	status &= (UMAC_IRQ_LINK_EVENT | UMAC_IRQ_PHY_DET_R);
 	if (status) {
 		/* Save irq status for bottom-half processing. */
 		spin_lock_irqsave(&priv->lock, flags);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2019-11-05  1:51 [GIT] Networking David Miller
@ 2019-11-06 15:43 ` Greg KH
  0 siblings, 0 replies; 10+ messages in thread
From: Greg KH @ 2019-11-06 15:43 UTC (permalink / raw)
  To: David Miller; +Cc: stable

On Mon, Nov 04, 2019 at 05:51:59PM -0800, David Miller wrote:
> 
> Please queue up the following networking bug fixes for v4.19 and
> v5.3 -stable, respectively.

All now queued up, thanks!

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2019-05-31 22:18 ` Greg KH
@ 2019-05-31 23:49   ` David Miller
  0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2019-05-31 23:49 UTC (permalink / raw)
  To: greg; +Cc: stable

From: Greg KH <greg@kroah.com>
Date: Fri, 31 May 2019 15:18:10 -0700

> On Fri, May 31, 2019 at 02:16:26PM -0700, David Miller wrote:
>> 
>> Please queue up the following networking bug fixes for v5.0 and v5.1
>> -stable, respectively.
> 
> Now queued up, thanks!
> 
> Note, 5.0 will be end-of-life after this next release, so no need to do
> any patches for that kernel anymore.

Thank you Greg.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2019-05-31 21:16 David Miller
@ 2019-05-31 22:18 ` Greg KH
  2019-05-31 23:49   ` David Miller
  0 siblings, 1 reply; 10+ messages in thread
From: Greg KH @ 2019-05-31 22:18 UTC (permalink / raw)
  To: David Miller; +Cc: stable

On Fri, May 31, 2019 at 02:16:26PM -0700, David Miller wrote:
> 
> Please queue up the following networking bug fixes for v5.0 and v5.1
> -stable, respectively.

Now queued up, thanks!

Note, 5.0 will be end-of-life after this next release, so no need to do
any patches for that kernel anymore.

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [GIT] Networking
@ 2019-05-31 21:16 David Miller
  2019-05-31 22:18 ` Greg KH
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2019-05-31 21:16 UTC (permalink / raw)
  To: stable

[-- Attachment #1: Type: Text/Plain, Size: 105 bytes --]


Please queue up the following networking bug fixes for v5.0 and v5.1
-stable, respectively.

Thank you.

[-- Attachment #2: net_50.mbox --]
[-- Type: Application/Octet-Stream, Size: 80931 bytes --]

From 181b720d9f8471698a67425dd9916eb8950a91ac Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 24 May 2019 09:49:28 -0400
Subject: [PATCH 01/33] bonding/802.3ad: fix slave link initialization
 transition states

[ Upstream commit 334031219a84b9994594015aab85ed7754c80176 ]

Once in a while, with just the right timing, 802.3ad slaves will fail to
properly initialize, winding up in a weird state, with a partner system
mac address of 00:00:00:00:00:00. This started happening after a fix to
properly track link_failure_count tracking, where an 802.3ad slave that
reported itself as link up in the miimon code, but wasn't able to get a
valid speed/duplex, started getting set to BOND_LINK_FAIL instead of
BOND_LINK_DOWN. That was the proper thing to do for the general "my link
went down" case, but has created a link initialization race that can put
the interface in this odd state.

The simple fix is to instead set the slave link to BOND_LINK_DOWN again,
if the link has never been up (last_link_up == 0), so the link state
doesn't bounce from BOND_LINK_DOWN to BOND_LINK_FAIL -- it hasn't failed
in this case, it simply hasn't been up yet, and this prevents the
unnecessary state change from DOWN to FAIL and getting stuck in an init
failure w/o a partner mac.

Fixes: ea53abfab960 ("bonding/802.3ad: fix link_failure_count tracking")
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
CC: netdev@vger.kernel.org
Tested-by: Heesoon Kim <Heesoon.Kim@stratus.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f89fc6ea6078..4eeece3576e1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3123,13 +3123,18 @@ static int bond_slave_netdev_event(unsigned long event,
 	case NETDEV_CHANGE:
 		/* For 802.3ad mode only:
 		 * Getting invalid Speed/Duplex values here will put slave
-		 * in weird state. So mark it as link-fail for the time
-		 * being and let link-monitoring (miimon) set it right when
-		 * correct speeds/duplex are available.
+		 * in weird state. Mark it as link-fail if the link was
+		 * previously up or link-down if it hasn't yet come up, and
+		 * let link-monitoring (miimon) set it right when correct
+		 * speeds/duplex are available.
 		 */
 		if (bond_update_speed_duplex(slave) &&
-		    BOND_MODE(bond) == BOND_MODE_8023AD)
-			slave->link = BOND_LINK_FAIL;
+		    BOND_MODE(bond) == BOND_MODE_8023AD) {
+			if (slave->last_link_up)
+				slave->link = BOND_LINK_FAIL;
+			else
+				slave->link = BOND_LINK_DOWN;
+		}
 
 		if (BOND_MODE(bond) == BOND_MODE_8023AD)
 			bond_3ad_adapter_speed_duplex_changed(slave);
-- 
2.20.1


From f0668474b0983f1f0b449a44e6a6ed99a670cfc5 Mon Sep 17 00:00:00 2001
From: Raju Rangoju <rajur@chelsio.com>
Date: Thu, 23 May 2019 20:41:44 +0530
Subject: [PATCH 02/33] cxgb4: offload VLAN flows regardless of VLAN ethtype

[ Upstream commit b5730061d1056abf317caea823b94d6e12b5b4f6 ]

VLAN flows never get offloaded unless ivlan_vld is set in filter spec.
It's not compulsory for vlan_ethtype to be set.

So, always enable ivlan_vld bit for offloading VLAN flows regardless of
vlan_ethtype is set or not.

Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan)
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index c116f96956fe..f2aba5b160c2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -228,6 +228,9 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 		fs->val.ivlan = vlan_tci;
 		fs->mask.ivlan = vlan_tci_mask;
 
+		fs->val.ivlan_vld = 1;
+		fs->mask.ivlan_vld = 1;
+
 		/* Chelsio adapters use ivlan_vld bit to match vlan packets
 		 * as 802.1Q. Also, when vlan tag is present in packets,
 		 * ethtype match is used then to match on ethtype of inner
@@ -238,8 +241,6 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 		 * ethtype value with ethtype of inner header.
 		 */
 		if (fs->val.ethtype == ETH_P_8021Q) {
-			fs->val.ivlan_vld = 1;
-			fs->mask.ivlan_vld = 1;
 			fs->val.ethtype = 0;
 			fs->mask.ethtype = 0;
 		}
-- 
2.20.1


From 22f789d2bf174df619d686dca9eeebaf974552ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Mar 2019 12:40:33 -0700
Subject: [PATCH 03/33] inet: switch IP ID generator to siphash

[ Upstream commit df453700e8d81b1bdafdf684365ee2b9431fb702 ]

According to Amit Klein and Benny Pinkas, IP ID generation is too weak
and might be used by attackers.

Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
having 64bit key and Jenkins hash is risky.

It is time to switch to siphash and its 128bit keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/siphash.h  |  5 +++++
 include/net/netns/ipv4.h |  2 ++
 net/ipv4/route.c         | 12 +++++++-----
 net/ipv6/output_core.c   | 30 ++++++++++++++++--------------
 4 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index fa7a6b9cedbf..bf21591a9e5e 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -21,6 +21,11 @@ typedef struct {
 	u64 key[2];
 } siphash_key_t;
 
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+	return !(key->key[0] | key->key[1]);
+}
+
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
 #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 104a6669e344..7698460a3dd1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -9,6 +9,7 @@
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
 #include <linux/rcupdate.h>
+#include <linux/siphash.h>
 
 struct tcpm_hash_bucket;
 struct ctl_table_header;
@@ -217,5 +218,6 @@ struct netns_ipv4 {
 	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
 
 	atomic_t	rt_genid;
+	siphash_key_t	ip_id_key;
 };
 #endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3c89ca325947..b66f78fad98c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -500,15 +500,17 @@ EXPORT_SYMBOL(ip_idents_reserve);
 
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
-	static u32 ip_idents_hashrnd __read_mostly;
 	u32 hash, id;
 
-	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
 
-	hash = jhash_3words((__force u32)iph->daddr,
+	hash = siphash_3u32((__force u32)iph->daddr,
 			    (__force u32)iph->saddr,
-			    iph->protocol ^ net_hash_mix(net),
-			    ip_idents_hashrnd);
+			    iph->protocol,
+			    &net->ipv4.ip_id_key);
 	id = ip_idents_reserve(hash, segs);
 	iph->id = htons(id);
 }
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 4fe7c90962dd..868ae23dbae1 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -10,15 +10,25 @@
 #include <net/secure_seq.h>
 #include <linux/netfilter.h>
 
-static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
+static u32 __ipv6_select_ident(struct net *net,
 			       const struct in6_addr *dst,
 			       const struct in6_addr *src)
 {
+	const struct {
+		struct in6_addr dst;
+		struct in6_addr src;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.dst = *dst,
+		.src = *src,
+	};
 	u32 hash, id;
 
-	hash = __ipv6_addr_jhash(dst, hashrnd);
-	hash = __ipv6_addr_jhash(src, hash);
-	hash ^= net_hash_mix(net);
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
+
+	hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key);
 
 	/* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
 	 * set the hight order instead thus minimizing possible future
@@ -41,7 +51,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
  */
 __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 {
-	static u32 ip6_proxy_idents_hashrnd __read_mostly;
 	struct in6_addr buf[2];
 	struct in6_addr *addrs;
 	u32 id;
@@ -53,11 +62,7 @@ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 	if (!addrs)
 		return 0;
 
-	net_get_random_once(&ip6_proxy_idents_hashrnd,
-			    sizeof(ip6_proxy_idents_hashrnd));
-
-	id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
-				 &addrs[1], &addrs[0]);
+	id = __ipv6_select_ident(net, &addrs[1], &addrs[0]);
 	return htonl(id);
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
@@ -66,12 +71,9 @@ __be32 ipv6_select_ident(struct net *net,
 			 const struct in6_addr *daddr,
 			 const struct in6_addr *saddr)
 {
-	static u32 ip6_idents_hashrnd __read_mostly;
 	u32 id;
 
-	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
-
-	id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
+	id = __ipv6_select_ident(net, daddr, saddr);
 	return htonl(id);
 }
 EXPORT_SYMBOL(ipv6_select_ident);
-- 
2.20.1


From b3c8c4fb06aa04b7f9d0af06a87db506b02ae85f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 16:51:22 -0700
Subject: [PATCH 04/33] ipv4/igmp: fix another memory leak in
 igmpv3_del_delrec()

[ Upstream commit 3580d04aa674383c42de7b635d28e52a1e5bc72c ]

syzbot reported memory leaks [1] that I have back tracked to
a missing cleanup from igmpv3_del_delrec() when
(im->sfmode != MCAST_INCLUDE)

Add ip_sf_list_clear_all() and kfree_pmc() helpers to explicitely
handle the cleanups before freeing.

[1]

BUG: memory leak
unreferenced object 0xffff888123e32b00 (size 64):
  comm "softirq", pid 0, jiffies 4294942968 (age 8.010s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 00 00 01 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<000000006105011b>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<000000006105011b>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<000000006105011b>] slab_alloc mm/slab.c:3326 [inline]
    [<000000006105011b>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553
    [<000000004bba8073>] kmalloc include/linux/slab.h:547 [inline]
    [<000000004bba8073>] kzalloc include/linux/slab.h:742 [inline]
    [<000000004bba8073>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline]
    [<000000004bba8073>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085
    [<00000000a46a65a0>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475
    [<000000005956ca89>] do_ip_setsockopt.isra.0+0x1795/0x1930 net/ipv4/ip_sockglue.c:957
    [<00000000848e2d2f>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246
    [<00000000b9db185c>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616
    [<000000003028e438>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130
    [<0000000015b65589>] __sys_setsockopt+0x98/0x120 net/socket.c:2078
    [<00000000ac198ef0>] __do_sys_setsockopt net/socket.c:2089 [inline]
    [<00000000ac198ef0>] __se_sys_setsockopt net/socket.c:2086 [inline]
    [<00000000ac198ef0>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086
    [<000000000a770437>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<00000000d3adb93b>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 9c8bb163ae78 ("igmp, mld: Fix memory leak in igmpv3/mld_del_delrec()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hangbin Liu <liuhangbin@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 765b2b32c4a4..9822366d2b41 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -632,6 +632,24 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
 	}
 }
 
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+	struct ip_sf_list *next;
+
+	while (psf) {
+		next = psf->sf_next;
+		kfree(psf);
+		psf = next;
+	}
+}
+
+static void kfree_pmc(struct ip_mc_list *pmc)
+{
+	ip_sf_list_clear_all(pmc->sources);
+	ip_sf_list_clear_all(pmc->tomb);
+	kfree(pmc);
+}
+
 static void igmpv3_send_cr(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
@@ -668,7 +686,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
 			else
 				in_dev->mc_tomb = pmc_next;
 			in_dev_put(pmc->interface);
-			kfree(pmc);
+			kfree_pmc(pmc);
 		} else
 			pmc_prev = pmc;
 	}
@@ -1213,14 +1231,18 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 		im->interface = pmc->interface;
 		if (im->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
+			pmc->tomb = NULL;
+
 			im->sources = pmc->sources;
+			pmc->sources = NULL;
+
 			for (psf = im->sources; psf; psf = psf->sf_next)
 				psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		} else {
 			im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		}
 		in_dev_put(pmc->interface);
-		kfree(pmc);
+		kfree_pmc(pmc);
 	}
 	spin_unlock_bh(&im->lock);
 }
@@ -1241,21 +1263,18 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
 		nextpmc = pmc->next;
 		ip_mc_clear_src(pmc);
 		in_dev_put(pmc->interface);
-		kfree(pmc);
+		kfree_pmc(pmc);
 	}
 	/* clear dead sources, too */
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, pmc) {
-		struct ip_sf_list *psf, *psf_next;
+		struct ip_sf_list *psf;
 
 		spin_lock_bh(&pmc->lock);
 		psf = pmc->tomb;
 		pmc->tomb = NULL;
 		spin_unlock_bh(&pmc->lock);
-		for (; psf; psf = psf_next) {
-			psf_next = psf->sf_next;
-			kfree(psf);
-		}
+		ip_sf_list_clear_all(psf);
 	}
 	rcu_read_unlock();
 }
@@ -2133,7 +2152,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 
 static void ip_mc_clear_src(struct ip_mc_list *pmc)
 {
-	struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
+	struct ip_sf_list *tomb, *sources;
 
 	spin_lock_bh(&pmc->lock);
 	tomb = pmc->tomb;
@@ -2145,14 +2164,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
 	pmc->sfcount[MCAST_EXCLUDE] = 1;
 	spin_unlock_bh(&pmc->lock);
 
-	for (psf = tomb; psf; psf = nextpsf) {
-		nextpsf = psf->sf_next;
-		kfree(psf);
-	}
-	for (psf = sources; psf; psf = nextpsf) {
-		nextpsf = psf->sf_next;
-		kfree(psf);
-	}
+	ip_sf_list_clear_all(tomb);
+	ip_sf_list_clear_all(sources);
 }
 
 /* Join a multicast group
-- 
2.20.1


From 94a90cfb817a79a197680d5d8dd703ae734cf14b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 18:35:16 -0700
Subject: [PATCH 05/33] ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST

[ Upstream commit 903869bd10e6719b9df6718e785be7ec725df59f ]

ip_sf_list_clear_all() needs to be defined even if !CONFIG_IP_MULTICAST

Fixes: 3580d04aa674 ("ipv4/igmp: fix another memory leak in igmpv3_del_delrec()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9822366d2b41..1e79e1bca13c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -187,6 +187,17 @@ static void ip_ma_put(struct ip_mc_list *im)
 	     pmc != NULL;					\
 	     pmc = rtnl_dereference(pmc->next_rcu))
 
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+	struct ip_sf_list *next;
+
+	while (psf) {
+		next = psf->sf_next;
+		kfree(psf);
+		psf = next;
+	}
+}
+
 #ifdef CONFIG_IP_MULTICAST
 
 /*
@@ -632,17 +643,6 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
 	}
 }
 
-static void ip_sf_list_clear_all(struct ip_sf_list *psf)
-{
-	struct ip_sf_list *next;
-
-	while (psf) {
-		next = psf->sf_next;
-		kfree(psf);
-		psf = next;
-	}
-}
-
 static void kfree_pmc(struct ip_mc_list *pmc)
 {
 	ip_sf_list_clear_all(pmc->sources);
-- 
2.20.1


From 0f3e2f8367a05631ef0bf54b6e8dd9979a164ef2 Mon Sep 17 00:00:00 2001
From: Mike Manning <mmanning@vyatta.att-mail.com>
Date: Mon, 20 May 2019 19:57:17 +0100
Subject: [PATCH 06/33] ipv6: Consider sk_bound_dev_if when binding a raw
 socket to an address

[ Upstream commit 72f7cfab6f93a8ea825fab8ccfb016d064269f7f ]

IPv6 does not consider if the socket is bound to a device when binding
to an address. The result is that a socket can be bound to eth0 and
then bound to the address of eth1. If the device is a VRF, the result
is that a socket can only be bound to an address in the default VRF.

Resolve by considering the device if sk_bound_dev_if is set.

Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5a426226c762..5cb14eabfc65 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -287,7 +287,9 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 			/* Binding to link-local address requires an interface */
 			if (!sk->sk_bound_dev_if)
 				goto out_unlock;
+		}
 
+		if (sk->sk_bound_dev_if) {
 			err = -ENODEV;
 			dev = dev_get_by_index_rcu(sock_net(sk),
 						   sk->sk_bound_dev_if);
-- 
2.20.1


From eebffccb2f48832dc1bf70d72f74c2abcf8f18ca Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 15:12:18 -0700
Subject: [PATCH 07/33] ipv6: Fix redirect with VRF

[ Upstream commit 31680ac265802397937d75461a2809a067b9fb93 ]

IPv6 redirect is broken for VRF. __ip6_route_redirect walks the FIB
entries looking for an exact match on ifindex. With VRF the flowi6_oif
is updated by l3mdev_update_flow to the l3mdev index and the
FLOWI_FLAG_SKIP_NH_OIF set in the flags to tell the lookup to skip the
device match. For redirects the device match is requires so use that
flag to know when the oif needs to be reset to the skb device index.

Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b471afce1330..457a27016e74 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2448,6 +2448,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	struct fib6_info *rt;
 	struct fib6_node *fn;
 
+	/* l3mdev_update_flow overrides oif if the device is enslaved; in
+	 * this case we must match on the real ingress device, so reset it
+	 */
+	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+		fl6->flowi6_oif = skb->dev->ifindex;
+
 	/* Get the "current" route for this destination and
 	 * check if the redirect has come from appropriate router.
 	 *
-- 
2.20.1


From e11731d0b09166badbf42aa7269440e487b76520 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 May 2019 17:35:52 -0700
Subject: [PATCH 08/33] llc: fix skb leak in llc_build_and_send_ui_pkt()

[ Upstream commit 8fb44d60d4142cd2a440620cd291d346e23c131e ]

If llc_mac_hdr_init() returns an error, we must drop the skb
since no llc_build_and_send_ui_pkt() caller will take care of this.

BUG: memory leak
unreferenced object 0xffff8881202b6800 (size 2048):
  comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.590s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    1a 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
  backtrace:
    [<00000000e25b5abe>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<00000000e25b5abe>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<00000000e25b5abe>] slab_alloc mm/slab.c:3326 [inline]
    [<00000000e25b5abe>] __do_kmalloc mm/slab.c:3658 [inline]
    [<00000000e25b5abe>] __kmalloc+0x161/0x2c0 mm/slab.c:3669
    [<00000000a1ae188a>] kmalloc include/linux/slab.h:552 [inline]
    [<00000000a1ae188a>] sk_prot_alloc+0xd6/0x170 net/core/sock.c:1608
    [<00000000ded25bbe>] sk_alloc+0x35/0x2f0 net/core/sock.c:1662
    [<000000002ecae075>] llc_sk_alloc+0x35/0x170 net/llc/llc_conn.c:950
    [<00000000551f7c47>] llc_ui_create+0x7b/0x140 net/llc/af_llc.c:173
    [<0000000029027f0e>] __sock_create+0x164/0x250 net/socket.c:1430
    [<000000008bdec225>] sock_create net/socket.c:1481 [inline]
    [<000000008bdec225>] __sys_socket+0x69/0x110 net/socket.c:1523
    [<00000000b6439228>] __do_sys_socket net/socket.c:1532 [inline]
    [<00000000b6439228>] __se_sys_socket net/socket.c:1530 [inline]
    [<00000000b6439228>] __x64_sys_socket+0x1e/0x30 net/socket.c:1530
    [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

BUG: memory leak
unreferenced object 0xffff88811d750d00 (size 224):
  comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.600s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 f0 0c 24 81 88 ff ff 00 68 2b 20 81 88 ff ff  ...$.....h+ ....
  backtrace:
    [<0000000053026172>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<0000000053026172>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<0000000053026172>] slab_alloc_node mm/slab.c:3269 [inline]
    [<0000000053026172>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579
    [<00000000fa8f3c30>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198
    [<00000000d96fdafb>] alloc_skb include/linux/skbuff.h:1058 [inline]
    [<00000000d96fdafb>] alloc_skb_with_frags+0x5f/0x250 net/core/skbuff.c:5327
    [<000000000a34a2e7>] sock_alloc_send_pskb+0x269/0x2a0 net/core/sock.c:2225
    [<00000000ee39999b>] sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2242
    [<00000000e034d810>] llc_ui_sendmsg+0x10a/0x540 net/llc/af_llc.c:933
    [<00000000c0bc8445>] sock_sendmsg_nosec net/socket.c:652 [inline]
    [<00000000c0bc8445>] sock_sendmsg+0x54/0x70 net/socket.c:671
    [<000000003b687167>] __sys_sendto+0x148/0x1f0 net/socket.c:1964
    [<00000000922d78d9>] __do_sys_sendto net/socket.c:1976 [inline]
    [<00000000922d78d9>] __se_sys_sendto net/socket.c:1972 [inline]
    [<00000000922d78d9>] __x64_sys_sendto+0x2a/0x30 net/socket.c:1972
    [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/llc_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
index 94425e421213..9e4b6bcf6920 100644
--- a/net/llc/llc_output.c
+++ b/net/llc/llc_output.c
@@ -72,6 +72,8 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
 	rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
 	if (likely(!rc))
 		rc = dev_queue_xmit(skb);
+	else
+		kfree_skb(skb);
 	return rc;
 }
 
-- 
2.20.1


From 54c3ef00f4fd7a0a11f12ee474b83733953767a2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 29 May 2019 10:59:44 +0300
Subject: [PATCH 09/33] mlxsw: spectrum_acl: Avoid warning after identical
 rules insertion

[ Upstream commit ef74422020aa8c224b00a927e3e47faac4d8fae3 ]

When identical rules are inserted, the latter one goes to C-TCAM. For
that, a second eRP with the same mask is created. These 2 eRPs by the
nature cannot be merged and also one cannot be parent of another.
Teach mlxsw_sp_acl_erp_delta_fill() about this possibility and handle it
gracefully.

Reported-by: Alex Kushnarov <alexanderk@mellanox.com>
Fixes: c22291f7cf45 ("mlxsw: spectrum: acl: Implement delta for ERP")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c    | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
index 2941967e1cc5..2e5ebcd01b4b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
@@ -1169,13 +1169,12 @@ mlxsw_sp_acl_erp_delta_fill(const struct mlxsw_sp_acl_erp_key *parent_key,
 			return -EINVAL;
 	}
 	if (si == -1) {
-		/* The masks are the same, this cannot happen.
-		 * That means the caller is broken.
+		/* The masks are the same, this can happen in case eRPs with
+		 * the same mask were created in both A-TCAM and C-TCAM.
+		 * The only possible condition under which this can happen
+		 * is identical rule insertion. Delta is not possible here.
 		 */
-		WARN_ON(1);
-		*delta_start = 0;
-		*delta_mask = 0;
-		return 0;
+		return -EINVAL;
 	}
 	pmask = (unsigned char) parent_key->mask[__MASK_IDX(si)];
 	mask = (unsigned char) key->mask[__MASK_IDX(si)];
-- 
2.20.1


From e51c02ac318957b4dd2bbd178097236a6676792a Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Date: Wed, 29 May 2019 07:02:11 +0000
Subject: [PATCH 10/33] net: dsa: mv88e6xxx: fix handling of upper half of
 STATS_TYPE_PORT

[ Upstream commit 84b3fd1fc9592d431e23b077e692fa4e3fd0f086 ]

Currently, the upper half of a 4-byte STATS_TYPE_PORT statistic ends
up in bits 47:32 of the return value, instead of bits 31:16 as they
should.

Fixes: 6e46e2d821bb ("net: dsa: mv88e6xxx: Fix u64 statistics")
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6cba05a80892..5a81ce42b808 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -892,7 +892,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
 			err = mv88e6xxx_port_read(chip, port, s->reg + 1, &reg);
 			if (err)
 				return U64_MAX;
-			high = reg;
+			low |= ((u32)reg) << 16;
 		}
 		break;
 	case STATS_TYPE_BANK1:
-- 
2.20.1


From 02cf540a082bdd03b1c080b2df2d6a6ad1cddc1a Mon Sep 17 00:00:00 2001
From: Andy Duan <fugang.duan@nxp.com>
Date: Thu, 23 May 2019 01:55:28 +0000
Subject: [PATCH 11/33] net: fec: fix the clk mismatch in failed_reset path

[ Upstream commit ce8d24f9a5965a58c588f9342689702a1024433c ]

Fix the clk mismatch in the error path "failed_reset" because
below error path will disable clk_ahb and clk_ipg directly, it
should use pm_runtime_put_noidle() instead of pm_runtime_put()
to avoid to call runtime resume callback.

Reported-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Tested-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index a96ad20ee484..878ccce1dfcd 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,7 +3556,7 @@ fec_probe(struct platform_device *pdev)
 	if (fep->reg_phy)
 		regulator_disable(fep->reg_phy);
 failed_reset:
-	pm_runtime_put(&pdev->dev);
+	pm_runtime_put_noidle(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 failed_regulator:
 	clk_disable_unprepare(fep->clk_ahb);
-- 
2.20.1


From 49c896093219b9d5fccad9946618100da86cb53b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 29 May 2019 15:36:10 -0700
Subject: [PATCH 12/33] net-gro: fix use-after-free read in napi_gro_frags()

[ Upstream commit a4270d6795b0580287453ea55974d948393e66ef ]

If a network driver provides to napi_gro_frags() an
skb with a page fragment of exactly 14 bytes, the call
to gro_pull_from_frag0() will 'consume' the fragment
by calling skb_frag_unref(skb, 0), and the page might
be freed and reused.

Reading eth->h_proto at the end of napi_frags_skb() might
read mangled data, or crash under specific debugging features.

BUG: KASAN: use-after-free in napi_frags_skb net/core/dev.c:5833 [inline]
BUG: KASAN: use-after-free in napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841
Read of size 2 at addr ffff88809366840c by task syz-executor599/8957

CPU: 1 PID: 8957 Comm: syz-executor599 Not tainted 5.2.0-rc1+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load_n_noabort+0xf/0x20 mm/kasan/generic_report.c:142
 napi_frags_skb net/core/dev.c:5833 [inline]
 napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841
 tun_get_user+0x2f3c/0x3ff0 drivers/net/tun.c:1991
 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2037
 call_write_iter include/linux/fs.h:1872 [inline]
 do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693
 do_iter_write fs/read_write.c:970 [inline]
 do_iter_write+0x184/0x610 fs/read_write.c:951
 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1015
 do_writev+0x15b/0x330 fs/read_write.c:1058

Fixes: a50e233c50db ("net-gro: restore frag0 optimization")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index c8e672ac32cb..a8d017035ae9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5804,7 +5804,6 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 	skb_reset_mac_header(skb);
 	skb_gro_reset_offset(skb);
 
-	eth = skb_gro_header_fast(skb, 0);
 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 		eth = skb_gro_header_slow(skb, hlen, 0);
 		if (unlikely(!eth)) {
@@ -5814,6 +5813,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 			return NULL;
 		}
 	} else {
+		eth = (const struct ethhdr *)skb->data;
 		gro_pull_from_frag0(skb, hlen);
 		NAPI_GRO_CB(skb)->frag0 += hlen;
 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
-- 
2.20.1


From 9365db5aef61784eca1b754d9908bca159e8049e Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Mon, 27 May 2019 11:04:17 +0000
Subject: [PATCH 13/33] net: mvneta: Fix err code path of probe

[ Upstream commit d484e06e25ebb937d841dac02ac1fe76ec7d4ddd ]

Fix below issues in err code path of probe:
1. we don't need to unregister_netdev() because the netdev isn't
registered.
2. when register_netdev() fails, we also need to destroy bm pool for
HWBM case.

Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management")
Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 8433fb9c3eee..ea0236a2e18b 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4619,7 +4619,7 @@ static int mvneta_probe(struct platform_device *pdev)
 	err = register_netdev(dev);
 	if (err < 0) {
 		dev_err(&pdev->dev, "failed to register\n");
-		goto err_free_stats;
+		goto err_netdev;
 	}
 
 	netdev_info(dev, "Using %s mac address %pM\n", mac_from,
@@ -4630,14 +4630,12 @@ static int mvneta_probe(struct platform_device *pdev)
 	return 0;
 
 err_netdev:
-	unregister_netdev(dev);
 	if (pp->bm_priv) {
 		mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_long, 1 << pp->id);
 		mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_short,
 				       1 << pp->id);
 		mvneta_bm_put(pp->bm_priv);
 	}
-err_free_stats:
 	free_percpu(pp->stats);
 err_free_ports:
 	free_percpu(pp->ports);
-- 
2.20.1


From 6174e5e7eee71b7c3017ceb2317b3e2cd4fd3d1e Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Wed, 29 May 2019 15:59:48 +0200
Subject: [PATCH 14/33] net: mvpp2: fix bad MVPP2_TXQ_SCHED_TOKEN_CNTR_REG
 queue value

[ Upstream commit 21808437214637952b61beaba6034d97880fbeb3 ]

MVPP2_TXQ_SCHED_TOKEN_CNTR_REG() expects the logical queue id but
the current code is passing the global tx queue offset, so it ends
up writing to unknown registers (between 0x8280 and 0x82fc, which
seemed to be unused by the hardware). This fixes the issue by using
the logical queue id instead.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 70031e2b2294..f063ba69eb17 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -1412,7 +1412,7 @@ static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
 /* Set defaults to the MVPP2 port */
 static void mvpp2_defaults_set(struct mvpp2_port *port)
 {
-	int tx_port_num, val, queue, ptxq, lrxq;
+	int tx_port_num, val, queue, lrxq;
 
 	if (port->priv->hw_version == MVPP21) {
 		/* Update TX FIFO MIN Threshold */
@@ -1433,11 +1433,9 @@ static void mvpp2_defaults_set(struct mvpp2_port *port)
 	mvpp2_write(port->priv, MVPP2_TXP_SCHED_FIXED_PRIO_REG, 0);
 
 	/* Close bandwidth for all queues */
-	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) {
-		ptxq = mvpp2_txq_phys(port->id, queue);
+	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++)
 		mvpp2_write(port->priv,
-			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(ptxq), 0);
-	}
+			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(queue), 0);
 
 	/* Set refill period to 1 usec, refill tokens
 	 * and bucket size to maximum
@@ -2293,7 +2291,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
 	txq->descs_dma         = 0;
 
 	/* Set minimum bandwidth for disabled TXQs */
-	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0);
+	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->log_id), 0);
 
 	/* Set Tx descriptors queue starting address and size */
 	thread = mvpp2_cpu_to_thread(port->priv, get_cpu());
-- 
2.20.1


From 74518887348b399cbd01525bec3ac91e711bd94a Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:34:42 +0100
Subject: [PATCH 15/33] net: phy: marvell10g: report if the PHY fails to boot
 firmware

[ Upstream commit 3d3ced2ec5d71b99d72ae6910fbdf890bc2eccf0 ]

Some boards do not have the PHY firmware programmed in the 3310's flash,
which leads to the PHY not working as expected.  Warn the user when the
PHY fails to boot the firmware and refuse to initialise.

Fixes: 20b2af32ff3f ("net: phy: add Marvell Alaska X 88X3310 10Gigabit PHY support")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 6bac602094bd..8438f2f40d3d 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -29,6 +29,9 @@
 #define MDIO_AN_10GBT_CTRL_ADV_NBT_MASK	0x01e0
 
 enum {
+	MV_PMA_BOOT		= 0xc050,
+	MV_PMA_BOOT_FATAL	= BIT(0),
+
 	MV_PCS_BASE_T		= 0x0000,
 	MV_PCS_BASE_R		= 0x1000,
 	MV_PCS_1000BASEX	= 0x2000,
@@ -228,6 +231,16 @@ static int mv3310_probe(struct phy_device *phydev)
 	    (phydev->c45_ids.devices_in_package & mmd_mask) != mmd_mask)
 		return -ENODEV;
 
+	ret = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MV_PMA_BOOT);
+	if (ret < 0)
+		return ret;
+
+	if (ret & MV_PMA_BOOT_FATAL) {
+		dev_warn(&phydev->mdio.dev,
+			 "PHY failed to boot firmware, status=%04x\n", ret);
+		return -ENODEV;
+	}
+
 	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
-- 
2.20.1


From e8610583542287ebe56b0fe26d89a0aeb66c429d Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Thu, 23 May 2019 09:32:31 +0300
Subject: [PATCH 16/33] net: sched: don't use tc_action->order during action
 dump

[ Upstream commit 4097e9d250fb17958c1d9b94538386edd3f20144 ]

Function tcf_action_dump() relies on tc_action->order field when starting
nested nla to send action data to userspace. This approach breaks in
several cases:

- When multiple filters point to same shared action, tc_action->order field
  is overwritten each time it is attached to filter. This causes filter
  dump to output action with incorrect attribute for all filters that have
  the action in different position (different order) from the last set
  tc_action->order value.

- When action data is displayed using tc action API (RTM_GETACTION), action
  order is overwritten by tca_action_gd() according to its position in
  resulting array of nl attributes, which will break filter dump for all
  filters attached to that shared action that expect it to have different
  order value.

Don't rely on tc_action->order when dumping actions. Set nla according to
action position in resulting array of actions instead.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d4b8355737d8..9d4ed81a33b9 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -766,7 +766,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
 
 	for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
 		a = actions[i];
-		nest = nla_nest_start(skb, a->order);
+		nest = nla_nest_start(skb, i + 1);
 		if (nest == NULL)
 			goto nla_put_failure;
 		err = tcf_action_dump_1(skb, a, bind, ref);
@@ -1283,7 +1283,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 			ret = PTR_ERR(act);
 			goto err;
 		}
-		act->order = i;
 		attr_size += tcf_action_fill_size(act);
 		actions[i - 1] = act;
 	}
-- 
2.20.1


From 7eeb66e5418ed1943c7851400aa2e34a156eef75 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Wed, 22 May 2019 10:05:09 +0000
Subject: [PATCH 17/33] net: stmmac: fix reset gpio free missing

[ Upstream commit 49ce881c0d4c4a7a35358d9dccd5f26d0e56fc61 ]

Commit 984203ceff27 ("net: stmmac: mdio: remove reset gpio free")
removed the reset gpio free, when the driver is unbinded or rmmod,
we miss the gpio free.

This patch uses managed API to request the reset gpio, so that the
gpio could be freed properly.

Fixes: 984203ceff27 ("net: stmmac: mdio: remove reset gpio free")
Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index bdd351597b55..093a223fe408 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -267,7 +267,8 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 			of_property_read_u32_array(np,
 				"snps,reset-delays-us", data->delays, 3);
 
-			if (gpio_request(data->reset_gpio, "mdio-reset"))
+			if (devm_gpio_request(priv->device, data->reset_gpio,
+					      "mdio-reset"))
 				return 0;
 		}
 
-- 
2.20.1


From d5bd0eb1f212c51ac2c1fcb638fc44c43cc2ec6e Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 29 May 2019 07:44:01 +0200
Subject: [PATCH 18/33] r8169: fix MAC address being lost in PCI D3

[ Upstream commit 59715171fbd0172a579576f46821031800a63bc5 ]

(At least) RTL8168e forgets its MAC address in PCI D3. To fix this set
the MAC address when resuming. For resuming from runtime-suspend we
had this in place already, for resuming from S3/S5 it was missing.

The commit referenced as being fixed isn't wrong, it's just the first
one where the patch applies cleanly.

Fixes: 0f07bd850d36 ("r8169: use dev_get_drvdata where possible")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reported-by: Albert Astals Cid <aacid@kde.org>
Tested-by: Albert Astals Cid <aacid@kde.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 365cddbfc684..cb65f6a48eba 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6814,6 +6814,8 @@ static int rtl8169_resume(struct device *device)
 	struct net_device *dev = dev_get_drvdata(device);
 	struct rtl8169_private *tp = netdev_priv(dev);
 
+	rtl_rar_set(tp, dev->dev_addr);
+
 	clk_prepare_enable(tp->clk);
 
 	if (netif_running(dev))
@@ -6847,6 +6849,7 @@ static int rtl8169_runtime_resume(struct device *device)
 {
 	struct net_device *dev = dev_get_drvdata(device);
 	struct rtl8169_private *tp = netdev_priv(dev);
+
 	rtl_rar_set(tp, dev->dev_addr);
 
 	if (!tp->TxDescArray)
-- 
2.20.1


From 3f8b1345a2e7dc8e4cb1d6bc83ab80e5e7951bd7 Mon Sep 17 00:00:00 2001
From: Kloetzke Jan <Jan.Kloetzke@preh.de>
Date: Tue, 21 May 2019 13:18:40 +0000
Subject: [PATCH 19/33] usbnet: fix kernel crash after disconnect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit ad70411a978d1e6e97b1e341a7bde9a79af0c93d ]

When disconnecting cdc_ncm the kernel sporadically crashes shortly
after the disconnect:

  [   57.868812] Unable to handle kernel NULL pointer dereference at virtual address 00000000
  ...
  [   58.006653] PC is at 0x0
  [   58.009202] LR is at call_timer_fn+0xec/0x1b4
  [   58.013567] pc : [<0000000000000000>] lr : [<ffffff80080f5130>] pstate: 00000145
  [   58.020976] sp : ffffff8008003da0
  [   58.024295] x29: ffffff8008003da0 x28: 0000000000000001
  [   58.029618] x27: 000000000000000a x26: 0000000000000100
  [   58.034941] x25: 0000000000000000 x24: ffffff8008003e68
  [   58.040263] x23: 0000000000000000 x22: 0000000000000000
  [   58.045587] x21: 0000000000000000 x20: ffffffc68fac1808
  [   58.050910] x19: 0000000000000100 x18: 0000000000000000
  [   58.056232] x17: 0000007f885aff8c x16: 0000007f883a9f10
  [   58.061556] x15: 0000000000000001 x14: 000000000000006e
  [   58.066878] x13: 0000000000000000 x12: 00000000000000ba
  [   58.072201] x11: ffffffc69ff1db30 x10: 0000000000000020
  [   58.077524] x9 : 8000100008001000 x8 : 0000000000000001
  [   58.082847] x7 : 0000000000000800 x6 : ffffff8008003e70
  [   58.088169] x5 : ffffffc69ff17a28 x4 : 00000000ffff138b
  [   58.093492] x3 : 0000000000000000 x2 : 0000000000000000
  [   58.098814] x1 : 0000000000000000 x0 : 0000000000000000
  ...
  [   58.205800] [<          (null)>]           (null)
  [   58.210521] [<ffffff80080f5298>] expire_timers+0xa0/0x14c
  [   58.215937] [<ffffff80080f542c>] run_timer_softirq+0xe8/0x128
  [   58.221702] [<ffffff8008081120>] __do_softirq+0x298/0x348
  [   58.227118] [<ffffff80080a6304>] irq_exit+0x74/0xbc
  [   58.232009] [<ffffff80080e17dc>] __handle_domain_irq+0x78/0xac
  [   58.237857] [<ffffff8008080cf4>] gic_handle_irq+0x80/0xac
  ...

The crash happens roughly 125..130ms after the disconnect. This
correlates with the 'delay' timer that is started on certain USB tx/rx
errors in the URB completion handler.

The problem is a race of usbnet_stop() with usbnet_start_xmit(). In
usbnet_stop() we call usbnet_terminate_urbs() to cancel all URBs in
flight. This only makes sense if no new URBs are submitted
concurrently, though. But the usbnet_start_xmit() can run at the same
time on another CPU which almost unconditionally submits an URB. The
error callback of the new URB will then schedule the timer after it was
already stopped.

The fix adds a check if the tx queue is stopped after the tx list lock
has been taken. This should reliably prevent the submission of new URBs
while usbnet_terminate_urbs() does its job. The same thing is done on
the rx side even though it might be safe due to other flags that are
checked there.

Signed-off-by: Jan Klötzke <Jan.Kloetzke@preh.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 504282af27e5..921cc0571bd0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -506,6 +506,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 
 	if (netif_running (dev->net) &&
 	    netif_device_present (dev->net) &&
+	    test_bit(EVENT_DEV_OPEN, &dev->flags) &&
 	    !test_bit (EVENT_RX_HALT, &dev->flags) &&
 	    !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) {
 		switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) {
@@ -1431,6 +1432,11 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		spin_unlock_irqrestore(&dev->txq.lock, flags);
 		goto drop;
 	}
+	if (netif_queue_stopped(net)) {
+		usb_autopm_put_interface_async(dev->intf);
+		spin_unlock_irqrestore(&dev->txq.lock, flags);
+		goto drop;
+	}
 
 #ifdef CONFIG_PM
 	/* if this triggers the device is still a sleep */
-- 
2.20.1


From deef0846f089d059bbbb0e324fb349ca80c22d3b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 10 May 2019 10:26:23 -0500
Subject: [PATCH 20/33] net/mlx5: Avoid double free in fs init error unwinding
 path

[ Upstream commit 9414277a5df3669c67e818708c0f881597e0118e ]

In below code flow, for ingress acl table root ns memory leads
to double free.

mlx5_init_fs
  init_ingress_acls_root_ns()
    init_ingress_acl_root_ns
       kfree(steering->esw_ingress_root_ns);
       /* steering->esw_ingress_root_ns is not marked NULL */
  mlx5_cleanup_fs
    cleanup_ingress_acls_root_ns
       steering->esw_ingress_root_ns non NULL check passes.
       kfree(steering->esw_ingress_root_ns);
       /* double free */

Similar issue exist for other tables.

Hence zero out the pointers to not process the table again.

Fixes: 9b93ab981e3bf ("net/mlx5: Separate ingress/egress namespaces for each vport")
Fixes: 40c3eebb49e51 ("net/mlx5: Add support in RDMA RX steering")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index abbdd4906984..a49fc4fab3fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2390,6 +2390,7 @@ static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 
 	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
 }
 
 static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
@@ -2404,6 +2405,7 @@ static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 
 	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
 }
 
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
@@ -2572,6 +2574,7 @@ static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
 	for (i--; i >= 0; i--)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
 	return err;
 }
 
@@ -2599,6 +2602,7 @@ static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 	for (i--; i >= 0; i--)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
 	return err;
 }
 
-- 
2.20.1


From 59e3e2ad147017fb535228a330bcd15d91eac8db Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 20 May 2019 15:45:36 +1200
Subject: [PATCH 21/33] tipc: Avoid copying bytes beyond the supplied data

TLV_SET is called with a data pointer and a len parameter that tells us
how many bytes are pointed to by data. When invoking memcpy() we need
to careful to only copy len bytes.

Previously we would copy TLV_LENGTH(len) bytes which would copy an extra
4 bytes past the end of the data pointer which newer GCC versions
complain about.

 In file included from test.c:17:
 In function 'TLV_SET',
     inlined from 'test' at test.c:186:5:
 /usr/include/linux/tipc_config.h:317:3:
 warning: 'memcpy' forming offset [33, 36] is out of the bounds [0, 32]
 of object 'bearer_name' with type 'char[32]' [-Warray-bounds]
     memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 test.c: In function 'test':
 test.c::161:10: note:
 'bearer_name' declared here
     char bearer_name[TIPC_MAX_BEARER_NAME];
          ^~~~~~~~~~~

We still want to ensure any padding bytes at the end are initialised, do
this with a explicit memset() rather than copy bytes past the end of
data. Apply the same logic to TCM_SET.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_config.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 4b2c93b1934c..4955e1a9f1bc 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -307,8 +307,10 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
 	tlv_ptr = (struct tlv_desc *)tlv;
 	tlv_ptr->tlv_type = htons(type);
 	tlv_ptr->tlv_len  = htons(tlv_len);
-	if (len && data)
-		memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
+	if (len && data) {
+		memcpy(TLV_DATA(tlv_ptr), data, len);
+		memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len);
+	}
 	return TLV_SPACE(len);
 }
 
@@ -405,8 +407,10 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
 	tcm_hdr->tcm_len   = htonl(msg_len);
 	tcm_hdr->tcm_type  = htons(cmd);
 	tcm_hdr->tcm_flags = htons(flags);
-	if (data_len && data)
+	if (data_len && data) {
 		memcpy(TCM_DATA(msg), data, data_len);
+		memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len);
+	}
 	return TCM_SPACE(data_len);
 }
 
-- 
2.20.1


From 53e081d0ab3481dbd5c3ddb5d09a855874dbe035 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 10 May 2019 10:40:08 -0500
Subject: [PATCH 22/33] net/mlx5: Allocate root ns memory using kzalloc to
 match kfree

[ Upstream commit 25fa506b70cadb580c1e9cbd836d6417276d4bcd ]

root ns is yet another fs core node which is freed using kfree() by
tree_put_node().
Rest of the other fs core objects are also allocated using kmalloc
variants.

However, root ns memory is allocated using kvzalloc().
Hence allocate root ns memory using kzalloc().

Fixes: 2530236303d9e ("net/mlx5_core: Flow steering tree initialization")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index a49fc4fab3fa..158b941ae911 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2247,7 +2247,7 @@ static struct mlx5_flow_root_namespace
 		cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type);
 
 	/* Create the root namespace */
-	root_ns = kvzalloc(sizeof(*root_ns), GFP_KERNEL);
+	root_ns = kzalloc(sizeof(*root_ns), GFP_KERNEL);
 	if (!root_ns)
 		return NULL;
 
-- 
2.20.1


From 28892bcc5014d9afee4364c05355791233104ad8 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 May 2019 12:55:10 -0700
Subject: [PATCH 23/33] net/mlx5e: Disable rxhash when CQE compress is enabled

[ Upstream commit c0194e2d0ef0e5ce5e21a35640d23a706827ae28 ]

When CQE compression is enabled (Multi-host systems), compressed CQEs
might arrive to the driver rx, compressed CQEs don't have a valid hash
offload and the driver already reports a hash value of 0 and invalid hash
type on the skb for compressed CQEs, but this is not good enough.

On a congested PCIe, where CQE compression will kick in aggressively,
gro will deliver lots of out of order packets due to the invalid hash
and this might cause a serious performance drop.

The only valid solution, is to disable rxhash offload at all when CQE
compression is favorable (Multi-host systems).

Fixes: 7219ab34f184 ("net/mlx5e: CQE compression")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2d269acdbc8e..631a600bec4d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3789,6 +3789,12 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
 	}
 
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
+		features &= ~NETIF_F_RXHASH;
+		if (netdev->features & NETIF_F_RXHASH)
+			netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n");
+	}
+
 	mutex_unlock(&priv->state_lock);
 
 	return features;
@@ -3915,6 +3921,9 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr)
 	memcpy(&priv->tstamp, &config, sizeof(config));
 	mutex_unlock(&priv->state_lock);
 
+	/* might need to fix some features */
+	netdev_update_features(priv->netdev);
+
 	return copy_to_user(ifr->ifr_data, &config,
 			    sizeof(config)) ? -EFAULT : 0;
 }
@@ -4744,6 +4753,10 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	if (!priv->channels.params.scatter_fcs_en)
 		netdev->features  &= ~NETIF_F_RXFCS;
 
+	/* prefere CQE compression over rxhash */
+	if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS))
+		netdev->features &= ~NETIF_F_RXHASH;
+
 #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f)
 	if (FT_CAP(flow_modify_en) &&
 	    FT_CAP(modify_root) &&
-- 
2.20.1


From fcf0ea5c2e979123ba7fa90b072ad7092ed56848 Mon Sep 17 00:00:00 2001
From: "Tan, Tee Min" <tee.min.tan@intel.com>
Date: Tue, 21 May 2019 12:55:42 +0800
Subject: [PATCH 24/33] net: stmmac: fix ethtool flow control not able to
 get/set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently ethtool was not able to get/set the flow control due to a
missing "!". It will always return -EOPNOTSUPP even the device is
flow control supported.

This patch fixes the condition check for ethtool flow control get/set
function for ETHTOOL_LINK_MODE_Asym_Pause_BIT.

Fixes: 3c1bcc8614db (“net: ethernet: Convert phydev advertize and supported from u32 to link mode”)
Signed-off-by: Tan, Tee Min <tee.min.tan@intel.com>
Reviewed-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon, Weifeng <weifeng.voon@intel.com@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 3c749c327cbd..e09522c5509a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -460,7 +460,7 @@ stmmac_get_pauseparam(struct net_device *netdev,
 	} else {
 		if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
 				       netdev->phydev->supported) ||
-		    linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+		    !linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
 				      netdev->phydev->supported))
 			return;
 	}
@@ -491,7 +491,7 @@ stmmac_set_pauseparam(struct net_device *netdev,
 	} else {
 		if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
 				       phy->supported) ||
-		    linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+		    !linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
 				      phy->supported))
 			return -EOPNOTSUPP;
 	}
-- 
2.20.1


From 0ea0923c7301a02933a7c4e12195eeaa7dd620f0 Mon Sep 17 00:00:00 2001
From: Weifeng Voon <weifeng.voon@intel.com>
Date: Tue, 21 May 2019 13:38:38 +0800
Subject: [PATCH 25/33] net: stmmac: dma channel control register need to be
 init first

stmmac_init_chan() needs to be called before stmmac_init_rx_chan() and
stmmac_init_tx_chan(). This is because if PBLx8 is to be used,
"DMA_CH(#i)_Control.PBLx8" needs to be set before programming
"DMA_CH(#i)_TX_Control.TxPBL" and "DMA_CH(#i)_RX_Control.RxPBL".

Fixes: 47f2a9ce527a ("net: stmmac: dma channel init prepared for multiple queues")
Reviewed-by: Zhang, Baoli <baoli.zhang@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Weifeng Voon <weifeng.voon@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f0e0593e54f3..8841c5de8979 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2190,6 +2190,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	if (priv->plat->axi)
 		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
 
+	/* DMA CSR Channel configuration */
+	for (chan = 0; chan < dma_csr_ch; chan++)
+		stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan);
+
 	/* DMA RX Channel Configuration */
 	for (chan = 0; chan < rx_channels_count; chan++) {
 		rx_q = &priv->rx_queue[chan];
@@ -2215,10 +2219,6 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 				       tx_q->tx_tail_addr, chan);
 	}
 
-	/* DMA CSR Channel configuration */
-	for (chan = 0; chan < dma_csr_ch; chan++)
-		stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan);
-
 	return ret;
 }
 
-- 
2.20.1


From 3c53a8e24dcdde49a7974a691d738fc3e6266fa1 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:54 -0400
Subject: [PATCH 26/33] bnxt_en: Fix aggregation buffer leak under OOM
 condition.

[ Upstream commit 296d5b54163964b7ae536b8b57dfbd21d4e868e1 ]

For every RX packet, the driver replenishes all buffers used for that
packet and puts them back into the RX ring and RX aggregation ring.
In one code path where the RX packet has one RX buffer and one or more
aggregation buffers, we missed recycling the aggregation buffer(s) if
we are unable to allocate a new SKB buffer.  This leads to the
aggregation ring slowly running out of buffers over time.  Fix it
by properly recycling the aggregation buffers.

Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")
Reported-by: Rakesh Hemnani <rhemnani@fb.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c6ddbc0e084e..d45b2f0e3063 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1636,6 +1636,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
 		skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr);
 		bnxt_reuse_rx_data(rxr, cons, data);
 		if (!skb) {
+			if (agg_bufs)
+				bnxt_reuse_rx_agg_bufs(cpr, cp_cons, agg_bufs);
 			rc = -ENOMEM;
 			goto next_rx;
 		}
-- 
2.20.1


From 0562573e7ed5e16445b7541c6519e5fc3213df69 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:55 -0400
Subject: [PATCH 27/33] bnxt_en: Fix possible BUG() condition when calling
 pci_disable_msix().

[ Upstream commit 1b3f0b75c39f534278a895c117282014e9d0ae1f ]

When making configuration changes, the driver calls bnxt_close_nic()
and then bnxt_open_nic() for the changes to take effect.  A parameter
irq_re_init is passed to the call sequence to indicate if IRQ
should be re-initialized.  This irq_re_init parameter needs to
be included in the bnxt_reserve_rings() call.  bnxt_reserve_rings()
can only call pci_disable_msix() if the irq_re_init parameter is
true, otherwise it may hit BUG() because some IRQs may not have been
freed yet.

Fixes: 41e8d7983752 ("bnxt_en: Modify the ring reservation functions for 57500 series chips.")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 13 +++++++------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c     |  2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index d45b2f0e3063..0229cb524326 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7506,22 +7506,23 @@ static void bnxt_clear_int_mode(struct bnxt *bp)
 	bp->flags &= ~BNXT_FLAG_USING_MSIX;
 }
 
-int bnxt_reserve_rings(struct bnxt *bp)
+int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init)
 {
 	int tcs = netdev_get_num_tc(bp->dev);
-	bool reinit_irq = false;
+	bool irq_cleared = false;
 	int rc;
 
 	if (!bnxt_need_reserve_rings(bp))
 		return 0;
 
-	if (BNXT_NEW_RM(bp) && (bnxt_get_num_msix(bp) != bp->total_irqs)) {
+	if (irq_re_init && BNXT_NEW_RM(bp) &&
+	    bnxt_get_num_msix(bp) != bp->total_irqs) {
 		bnxt_ulp_irq_stop(bp);
 		bnxt_clear_int_mode(bp);
-		reinit_irq = true;
+		irq_cleared = true;
 	}
 	rc = __bnxt_reserve_rings(bp);
-	if (reinit_irq) {
+	if (irq_cleared) {
 		if (!rc)
 			rc = bnxt_init_int_mode(bp);
 		bnxt_ulp_irq_restart(bp, rc);
@@ -8420,7 +8421,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 			return rc;
 		}
 	}
-	rc = bnxt_reserve_rings(bp);
+	rc = bnxt_reserve_rings(bp, irq_re_init);
 	if (rc)
 		return rc;
 	if ((bp->flags & BNXT_FLAG_RFS) &&
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 2fb653e0048d..4d8b89334472 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1776,7 +1776,7 @@ unsigned int bnxt_get_avail_stat_ctxs_for_en(struct bnxt *bp);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
 unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp);
 int bnxt_get_avail_msix(struct bnxt *bp, int num);
-int bnxt_reserve_rings(struct bnxt *bp);
+int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
 int bnxt_hwrm_set_pause(struct bnxt *);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index adabbe94a259..e1460e391952 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -788,7 +788,7 @@ static int bnxt_set_channels(struct net_device *dev,
 			 */
 		}
 	} else {
-		rc = bnxt_reserve_rings(bp);
+		rc = bnxt_reserve_rings(bp, true);
 	}
 
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index ea45a9b8179e..7dd3f445afb6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -150,7 +150,7 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
 			bnxt_close_nic(bp, true, false);
 			rc = bnxt_open_nic(bp, true, false);
 		} else {
-			rc = bnxt_reserve_rings(bp);
+			rc = bnxt_reserve_rings(bp, true);
 		}
 	}
 	if (rc) {
-- 
2.20.1


From 76740c5a915123bc91639f493f23bceed357eb8b Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:56 -0400
Subject: [PATCH 28/33] bnxt_en: Reduce memory usage when running in kdump
 kernel.

[ Upstream commit d629522e1d66561f38e5c8d4f52bb6d254ec0707 ]

Skip RDMA context memory allocations, reduce to 1 ring, and disable
TPA when running in the kdump kernel.  Without this patch, the driver
fails to initialize with memory allocation errors when running in a
typical kdump kernel.

Fixes: cf6daed098d1 ("bnxt_en: Increase context memory allocations on 57500 chips for RDMA.")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0229cb524326..300dbfdd4ae8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6338,7 +6338,7 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp)
 	if (!ctx || (ctx->flags & BNXT_CTX_FLAG_INITED))
 		return 0;
 
-	if (bp->flags & BNXT_FLAG_ROCE_CAP) {
+	if ((bp->flags & BNXT_FLAG_ROCE_CAP) && !is_kdump_kernel()) {
 		pg_lvl = 2;
 		extra_qps = 65536;
 		extra_srqs = 8192;
@@ -10279,7 +10279,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
 
 	if (sh)
 		bp->flags |= BNXT_FLAG_SHARED_RINGS;
-	dflt_rings = netif_get_num_default_rss_queues();
+	dflt_rings = is_kdump_kernel() ? 1 : netif_get_num_default_rss_queues();
 	/* Reduce default rings on multi-port cards so that total default
 	 * rings do not exceed CPU count.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 4d8b89334472..c09b20b08395 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -20,6 +20,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/rhashtable.h>
+#include <linux/crash_dump.h>
 #include <net/devlink.h>
 #include <net/dst_metadata.h>
 #include <net/switchdev.h>
@@ -1367,7 +1368,8 @@ struct bnxt {
 #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
 #define BNXT_RX_PAGE_MODE(bp)	((bp)->flags & BNXT_FLAG_RX_PAGE_MODE)
 #define BNXT_SUPPORTS_TPA(bp)	(!BNXT_CHIP_TYPE_NITRO_A0(bp) &&	\
-				 !(bp->flags & BNXT_FLAG_CHIP_P5))
+				 !(bp->flags & BNXT_FLAG_CHIP_P5) &&	\
+				 !is_kdump_kernel())
 
 /* Chip class phase 5 */
 #define BNXT_CHIP_P5(bp)			\
-- 
2.20.1


From 54417ce354f2709d467b230b2a14b5e99117d997 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:00 -0700
Subject: [PATCH 29/33] net/tls: avoid NULL-deref on resync during device
 removal

[ Upstream commit 38030d7cb77963ba84cdbe034806e2b81245339f ]

When netdev with active kTLS sockets in unregistered
notifier callback walks the offloaded sockets and
cleans up offload state.  RX data may still be processed,
however, and if resync was requested prior to device
removal we would hit a NULL pointer dereference on
ctx->netdev use.

Make sure resync is under the device offload lock
and NULL-check the netdev pointer.

This should be safe, because the pointer is set to
NULL either in the netdev notifier (under said lock)
or when socket is completely dead and no resync can
happen.

The other access to ctx->netdev in tls_validate_xmit_skb()
does not dereference the pointer, it just checks it against
other device pointer, so it should be pretty safe (perhaps
we can add a READ_ONCE/WRITE_ONCE there, if paranoid).

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 7d5136ecee78..445d996b7412 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -555,8 +555,8 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
+	struct net_device *netdev;
 	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
@@ -570,10 +570,15 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
-						      seq + TLS_HEADER_SIZE - 1,
-						      rcd_sn);
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+		seq += TLS_HEADER_SIZE - 1;
+		down_read(&device_offload_lock);
+		netdev = tls_ctx->netdev;
+		if (netdev)
+			netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq,
+							      rcd_sn);
+		up_read(&device_offload_lock);
+	}
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
-- 
2.20.1


From bf7d7f8c696a8c9cba289b91267fc35972051508 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:01 -0700
Subject: [PATCH 30/33] net/tls: fix state removal with feature flags off

[ Upstream commit 3686637e507b48525fcea6fb91e1988bdbc14530 ]

TLS offload drivers shouldn't (and currently don't) block
the TLS offload feature changes based on whether there are
active offloaded connections or not.

This seems to be a good idea, because we want the admin to
be able to disable the TLS offload at any time, and there
is no clean way of disabling it for active connections
(TX side is quite problematic).  So if features are cleared
existing connections will stay offloaded until they close,
and new connections will not attempt offload to a given
device.

However, the offload state removal handling is currently
broken if feature flags get cleared while there are
active TLS offloads.

RX side will completely bail from cleanup, even on normal
remove path, leaving device state dangling, potentially
causing issues when the 5-tuple is reused.  It will also
fail to release the netdev reference.

Remove the RX-side warning message, in next release cycle
it should be printed when features are disabled, rather
than when connection dies, but for that we need a more
efficient method of finding connection of a given netdev
(a'la BPF offload code).

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 445d996b7412..a7ea83c2c219 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -928,12 +928,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
 	if (!netdev)
 		goto out;
 
-	if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
-		pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
-				   __func__);
-		goto out;
-	}
-
 	netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
 					TLS_OFFLOAD_CTX_DIR_RX);
 
-- 
2.20.1


From a36805d932f99de94e8504e37ba0727c4b6ac065 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:02 -0700
Subject: [PATCH 31/33] net/tls: don't ignore netdev notifications if no TLS
 features

[ Upstream commit c3f4a6c39cf269a40d45f813c05fa830318ad875 ]

On device surprise removal path (the notifier) we can't
bail just because the features are disabled.  They may
have been enabled during the lifetime of the device.
This bug leads to leaking netdev references and
use-after-frees if there are active connections while
device features are cleared.

Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a7ea83c2c219..d43ef935cdef 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -986,7 +986,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
-	if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
+	if (!dev->tlsdev_ops &&
+	    !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
 		return NOTIFY_DONE;
 
 	switch (event) {
-- 
2.20.1


From 5d2b47bb912cb026498f7ee2eeb992564bc9d3d9 Mon Sep 17 00:00:00 2001
From: Vishal Kulkarni <vishal@chelsio.com>
Date: Thu, 23 May 2019 08:07:21 +0530
Subject: [PATCH 32/33] cxgb4: Revert "cxgb4: Remove SGE_HOST_PAGE_SIZE
 dependency on page size"

[ Upstream commit ab0610efabb4c4f419a531455708caf1dd29357e ]

This reverts commit 2391b0030e241386d710df10e53e2cfc3c5d4fc1 which has
introduced regression. Now SGE's BAR2 Doorbell/GTS Page Size is
interpreted correctly in the firmware itself by using actual host
page size. Hence previous commit needs to be reverted.

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 2b03f6187a24..29d3399c4995 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -7139,10 +7139,21 @@ int t4_fixup_host_params(struct adapter *adap, unsigned int page_size,
 			 unsigned int cache_line_size)
 {
 	unsigned int page_shift = fls(page_size) - 1;
+	unsigned int sge_hps = page_shift - 10;
 	unsigned int stat_len = cache_line_size > 64 ? 128 : 64;
 	unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size;
 	unsigned int fl_align_log = fls(fl_align) - 1;
 
+	t4_write_reg(adap, SGE_HOST_PAGE_SIZE_A,
+		     HOSTPAGESIZEPF0_V(sge_hps) |
+		     HOSTPAGESIZEPF1_V(sge_hps) |
+		     HOSTPAGESIZEPF2_V(sge_hps) |
+		     HOSTPAGESIZEPF3_V(sge_hps) |
+		     HOSTPAGESIZEPF4_V(sge_hps) |
+		     HOSTPAGESIZEPF5_V(sge_hps) |
+		     HOSTPAGESIZEPF6_V(sge_hps) |
+		     HOSTPAGESIZEPF7_V(sge_hps));
+
 	if (is_t4(adap->params.chip)) {
 		t4_set_reg_field(adap, SGE_CONTROL_A,
 				 INGPADBOUNDARY_V(INGPADBOUNDARY_M) |
-- 
2.20.1


From 9c140e4bd4f08b5e1db21fc9f40194fb9505e840 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 30 May 2019 18:01:21 -0400
Subject: [PATCH 33/33] net: correct zerocopy refcnt with udp MSG_MORE

[ Upstream commit 100f6d8e09905c59be45b6316f8f369c0be1b2d8 ]

TCP zerocopy takes a uarg reference for every skb, plus one for the
tcp_sendmsg_locked datapath temporarily, to avoid reaching refcnt zero
as it builds, sends and frees skbs inside its inner loop.

UDP and RAW zerocopy do not send inside the inner loop so do not need
the extra sock_zerocopy_get + sock_zerocopy_put pair. Commit
52900d22288ed ("udp: elide zerocopy operation in hot path") introduced
extra_uref to pass the initial reference taken in sock_zerocopy_alloc
to the first generated skb.

But, sock_zerocopy_realloc takes this extra reference at the start of
every call. With MSG_MORE, no new skb may be generated to attach the
extra_uref to, so refcnt is incorrectly 2 with only one skb.

Do not take the extra ref if uarg && !tcp, which implies MSG_MORE.
Update extra_uref accordingly.

This conditional assignment triggers a false positive may be used
uninitialized warning, so have to initialize extra_uref at define.

Changes v1->v2: fix typo in Fixes SHA1

Fixes: 52900d22288e7 ("udp: elide zerocopy operation in hot path")
Reported-by: syzbot <syzkaller@googlegroups.com>
Diagnosed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c     | 6 +++++-
 net/ipv4/ip_output.c  | 4 ++--
 net/ipv6/ip6_output.c | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 40796b8bf820..e5bfd42fd083 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1001,7 +1001,11 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 			uarg->len++;
 			uarg->bytelen = bytelen;
 			atomic_set(&sk->sk_zckey, ++next);
-			sock_zerocopy_get(uarg);
+
+			/* no extra ref when appending to datagram (MSG_MORE) */
+			if (sk->sk_type == SOCK_STREAM)
+				sock_zerocopy_get(uarg);
+
 			return uarg;
 		}
 	}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e8bb2e85c5a4..ac770940adb9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -883,7 +883,7 @@ static int __ip_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref;
+	bool paged, extra_uref = false;
 	u32 tskey = 0;
 
 	skb = skb_peek_tail(queue);
@@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
-		extra_uref = true;
+		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e71227390bec..de16c2e343ef 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1269,7 +1269,7 @@ static int __ip6_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref;
+	bool paged, extra_uref = false;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1338,7 +1338,7 @@ static int __ip6_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
-		extra_uref = true;
+		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
-- 
2.20.1


[-- Attachment #3: net_51.mbox --]
[-- Type: Application/Octet-Stream, Size: 91502 bytes --]

From 47b6c554d960b3703ebc65cd033792b0eaf0039d Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 24 May 2019 09:49:28 -0400
Subject: [PATCH 01/38] bonding/802.3ad: fix slave link initialization
 transition states

[ Upstream commit 334031219a84b9994594015aab85ed7754c80176 ]

Once in a while, with just the right timing, 802.3ad slaves will fail to
properly initialize, winding up in a weird state, with a partner system
mac address of 00:00:00:00:00:00. This started happening after a fix to
properly track link_failure_count tracking, where an 802.3ad slave that
reported itself as link up in the miimon code, but wasn't able to get a
valid speed/duplex, started getting set to BOND_LINK_FAIL instead of
BOND_LINK_DOWN. That was the proper thing to do for the general "my link
went down" case, but has created a link initialization race that can put
the interface in this odd state.

The simple fix is to instead set the slave link to BOND_LINK_DOWN again,
if the link has never been up (last_link_up == 0), so the link state
doesn't bounce from BOND_LINK_DOWN to BOND_LINK_FAIL -- it hasn't failed
in this case, it simply hasn't been up yet, and this prevents the
unnecessary state change from DOWN to FAIL and getting stuck in an init
failure w/o a partner mac.

Fixes: ea53abfab960 ("bonding/802.3ad: fix link_failure_count tracking")
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
CC: netdev@vger.kernel.org
Tested-by: Heesoon Kim <Heesoon.Kim@stratus.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ee610721098e..f96efa363d34 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3122,13 +3122,18 @@ static int bond_slave_netdev_event(unsigned long event,
 	case NETDEV_CHANGE:
 		/* For 802.3ad mode only:
 		 * Getting invalid Speed/Duplex values here will put slave
-		 * in weird state. So mark it as link-fail for the time
-		 * being and let link-monitoring (miimon) set it right when
-		 * correct speeds/duplex are available.
+		 * in weird state. Mark it as link-fail if the link was
+		 * previously up or link-down if it hasn't yet come up, and
+		 * let link-monitoring (miimon) set it right when correct
+		 * speeds/duplex are available.
 		 */
 		if (bond_update_speed_duplex(slave) &&
-		    BOND_MODE(bond) == BOND_MODE_8023AD)
-			slave->link = BOND_LINK_FAIL;
+		    BOND_MODE(bond) == BOND_MODE_8023AD) {
+			if (slave->last_link_up)
+				slave->link = BOND_LINK_FAIL;
+			else
+				slave->link = BOND_LINK_DOWN;
+		}
 
 		if (BOND_MODE(bond) == BOND_MODE_8023AD)
 			bond_3ad_adapter_speed_duplex_changed(slave);
-- 
2.20.1


From 7d197dd47a55efa3037a41ffd46c0af968398b40 Mon Sep 17 00:00:00 2001
From: Raju Rangoju <rajur@chelsio.com>
Date: Thu, 23 May 2019 20:41:44 +0530
Subject: [PATCH 02/38] cxgb4: offload VLAN flows regardless of VLAN ethtype

[ Upstream commit b5730061d1056abf317caea823b94d6e12b5b4f6 ]

VLAN flows never get offloaded unless ivlan_vld is set in filter spec.
It's not compulsory for vlan_ethtype to be set.

So, always enable ivlan_vld bit for offloading VLAN flows regardless of
vlan_ethtype is set or not.

Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan)
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 82a8d1970060..35462bccd91a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -197,6 +197,9 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 		fs->val.ivlan = vlan_tci;
 		fs->mask.ivlan = vlan_tci_mask;
 
+		fs->val.ivlan_vld = 1;
+		fs->mask.ivlan_vld = 1;
+
 		/* Chelsio adapters use ivlan_vld bit to match vlan packets
 		 * as 802.1Q. Also, when vlan tag is present in packets,
 		 * ethtype match is used then to match on ethtype of inner
@@ -207,8 +210,6 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 		 * ethtype value with ethtype of inner header.
 		 */
 		if (fs->val.ethtype == ETH_P_8021Q) {
-			fs->val.ivlan_vld = 1;
-			fs->mask.ivlan_vld = 1;
 			fs->val.ethtype = 0;
 			fs->mask.ethtype = 0;
 		}
-- 
2.20.1


From fd9d4072123e83904c263a03950293c1a6a64e97 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 30 May 2019 16:08:40 +0200
Subject: [PATCH 03/38] ethtool: Check for vlan etype or vlan tci when parsing
 flow_rule

[ Upstream commit b73484b2fc0d0ba84a13e9d86eb4adcae718163b ]

When parsing an ethtool flow spec to build a flow_rule, the code checks
if both the vlan etype and the vlan tci are specified by the user to add
a FLOW_DISSECTOR_KEY_VLAN match.

However, when the user only specified a vlan etype or a vlan tci, this
check silently ignores these parameters.

For example, the following rule :

ethtool -N eth0 flow-type udp4 vlan 0x0010 action -1 loc 0

will result in no error being issued, but the equivalent rule will be
created and passed to the NIC driver :

ethtool -N eth0 flow-type udp4 action -1 loc 0

In the end, neither the NIC driver using the rule nor the end user have
a way to know that these keys were dropped along the way, or that
incorrect parameters were entered.

This kind of check should be left to either the driver, or the ethtool
flow spec layer.

This commit makes so that ethtool parameters are forwarded as-is to the
NIC driver.

Since none of the users of ethtool_rx_flow_rule_create are using the
VLAN dissector, I don't think this qualifies as a regression.

Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator")
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Acked-by: Pablo Neira Ayuso <pablo@gnumonks.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 36ed619faf36..014dcd63b451 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -3008,11 +3008,12 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 		const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext;
 		const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext;
 
-		if (ext_m_spec->vlan_etype &&
-		    ext_m_spec->vlan_tci) {
+		if (ext_m_spec->vlan_etype) {
 			match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype;
 			match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype;
+		}
 
+		if (ext_m_spec->vlan_tci) {
 			match->key.vlan.vlan_id =
 				ntohs(ext_h_spec->vlan_tci) & 0x0fff;
 			match->mask.vlan.vlan_id =
@@ -3022,7 +3023,10 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 				(ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13;
 			match->mask.vlan.vlan_priority =
 				(ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13;
+		}
 
+		if (ext_m_spec->vlan_etype ||
+		    ext_m_spec->vlan_tci) {
 			match->dissector.used_keys |=
 				BIT(FLOW_DISSECTOR_KEY_VLAN);
 			match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
-- 
2.20.1


From 4f0b339fc413b59b5bde0c711cbb12a9f9a23a40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Mar 2019 12:40:33 -0700
Subject: [PATCH 04/38] inet: switch IP ID generator to siphash

[ Upstream commit df453700e8d81b1bdafdf684365ee2b9431fb702 ]

According to Amit Klein and Benny Pinkas, IP ID generation is too weak
and might be used by attackers.

Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
having 64bit key and Jenkins hash is risky.

It is time to switch to siphash and its 128bit keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/siphash.h  |  5 +++++
 include/net/netns/ipv4.h |  2 ++
 net/ipv4/route.c         | 12 +++++++-----
 net/ipv6/output_core.c   | 30 ++++++++++++++++--------------
 4 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index fa7a6b9cedbf..bf21591a9e5e 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -21,6 +21,11 @@ typedef struct {
 	u64 key[2];
 } siphash_key_t;
 
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+	return !(key->key[0] | key->key[1]);
+}
+
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
 #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 104a6669e344..7698460a3dd1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -9,6 +9,7 @@
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
 #include <linux/rcupdate.h>
+#include <linux/siphash.h>
 
 struct tcpm_hash_bucket;
 struct ctl_table_header;
@@ -217,5 +218,6 @@ struct netns_ipv4 {
 	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
 
 	atomic_t	rt_genid;
+	siphash_key_t	ip_id_key;
 };
 #endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6fdf1c195d8e..df6afb092936 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -500,15 +500,17 @@ EXPORT_SYMBOL(ip_idents_reserve);
 
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
-	static u32 ip_idents_hashrnd __read_mostly;
 	u32 hash, id;
 
-	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
 
-	hash = jhash_3words((__force u32)iph->daddr,
+	hash = siphash_3u32((__force u32)iph->daddr,
 			    (__force u32)iph->saddr,
-			    iph->protocol ^ net_hash_mix(net),
-			    ip_idents_hashrnd);
+			    iph->protocol,
+			    &net->ipv4.ip_id_key);
 	id = ip_idents_reserve(hash, segs);
 	iph->id = htons(id);
 }
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 4fe7c90962dd..868ae23dbae1 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -10,15 +10,25 @@
 #include <net/secure_seq.h>
 #include <linux/netfilter.h>
 
-static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
+static u32 __ipv6_select_ident(struct net *net,
 			       const struct in6_addr *dst,
 			       const struct in6_addr *src)
 {
+	const struct {
+		struct in6_addr dst;
+		struct in6_addr src;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.dst = *dst,
+		.src = *src,
+	};
 	u32 hash, id;
 
-	hash = __ipv6_addr_jhash(dst, hashrnd);
-	hash = __ipv6_addr_jhash(src, hash);
-	hash ^= net_hash_mix(net);
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
+
+	hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key);
 
 	/* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
 	 * set the hight order instead thus minimizing possible future
@@ -41,7 +51,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
  */
 __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 {
-	static u32 ip6_proxy_idents_hashrnd __read_mostly;
 	struct in6_addr buf[2];
 	struct in6_addr *addrs;
 	u32 id;
@@ -53,11 +62,7 @@ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 	if (!addrs)
 		return 0;
 
-	net_get_random_once(&ip6_proxy_idents_hashrnd,
-			    sizeof(ip6_proxy_idents_hashrnd));
-
-	id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
-				 &addrs[1], &addrs[0]);
+	id = __ipv6_select_ident(net, &addrs[1], &addrs[0]);
 	return htonl(id);
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
@@ -66,12 +71,9 @@ __be32 ipv6_select_ident(struct net *net,
 			 const struct in6_addr *daddr,
 			 const struct in6_addr *saddr)
 {
-	static u32 ip6_idents_hashrnd __read_mostly;
 	u32 id;
 
-	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
-
-	id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
+	id = __ipv6_select_ident(net, daddr, saddr);
 	return htonl(id);
 }
 EXPORT_SYMBOL(ipv6_select_ident);
-- 
2.20.1


From 566e70a1b2c05d50717d3e58e81e72f93bc1a0c1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 16:51:22 -0700
Subject: [PATCH 05/38] ipv4/igmp: fix another memory leak in
 igmpv3_del_delrec()

[ Upstream commit 3580d04aa674383c42de7b635d28e52a1e5bc72c ]

syzbot reported memory leaks [1] that I have back tracked to
a missing cleanup from igmpv3_del_delrec() when
(im->sfmode != MCAST_INCLUDE)

Add ip_sf_list_clear_all() and kfree_pmc() helpers to explicitely
handle the cleanups before freeing.

[1]

BUG: memory leak
unreferenced object 0xffff888123e32b00 (size 64):
  comm "softirq", pid 0, jiffies 4294942968 (age 8.010s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 e0 00 00 01 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<000000006105011b>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<000000006105011b>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<000000006105011b>] slab_alloc mm/slab.c:3326 [inline]
    [<000000006105011b>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553
    [<000000004bba8073>] kmalloc include/linux/slab.h:547 [inline]
    [<000000004bba8073>] kzalloc include/linux/slab.h:742 [inline]
    [<000000004bba8073>] ip_mc_add1_src net/ipv4/igmp.c:1961 [inline]
    [<000000004bba8073>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2085
    [<00000000a46a65a0>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2475
    [<000000005956ca89>] do_ip_setsockopt.isra.0+0x1795/0x1930 net/ipv4/ip_sockglue.c:957
    [<00000000848e2d2f>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1246
    [<00000000b9db185c>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616
    [<000000003028e438>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130
    [<0000000015b65589>] __sys_setsockopt+0x98/0x120 net/socket.c:2078
    [<00000000ac198ef0>] __do_sys_setsockopt net/socket.c:2089 [inline]
    [<00000000ac198ef0>] __se_sys_setsockopt net/socket.c:2086 [inline]
    [<00000000ac198ef0>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086
    [<000000000a770437>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<00000000d3adb93b>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 9c8bb163ae78 ("igmp, mld: Fix memory leak in igmpv3/mld_del_delrec()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hangbin Liu <liuhangbin@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6c2febc39dca..1a8d36dd49d4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -633,6 +633,24 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
 	}
 }
 
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+	struct ip_sf_list *next;
+
+	while (psf) {
+		next = psf->sf_next;
+		kfree(psf);
+		psf = next;
+	}
+}
+
+static void kfree_pmc(struct ip_mc_list *pmc)
+{
+	ip_sf_list_clear_all(pmc->sources);
+	ip_sf_list_clear_all(pmc->tomb);
+	kfree(pmc);
+}
+
 static void igmpv3_send_cr(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
@@ -669,7 +687,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
 			else
 				in_dev->mc_tomb = pmc_next;
 			in_dev_put(pmc->interface);
-			kfree(pmc);
+			kfree_pmc(pmc);
 		} else
 			pmc_prev = pmc;
 	}
@@ -1215,14 +1233,18 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 		im->interface = pmc->interface;
 		if (im->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
+			pmc->tomb = NULL;
+
 			im->sources = pmc->sources;
+			pmc->sources = NULL;
+
 			for (psf = im->sources; psf; psf = psf->sf_next)
 				psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		} else {
 			im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		}
 		in_dev_put(pmc->interface);
-		kfree(pmc);
+		kfree_pmc(pmc);
 	}
 	spin_unlock_bh(&im->lock);
 }
@@ -1243,21 +1265,18 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
 		nextpmc = pmc->next;
 		ip_mc_clear_src(pmc);
 		in_dev_put(pmc->interface);
-		kfree(pmc);
+		kfree_pmc(pmc);
 	}
 	/* clear dead sources, too */
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, pmc) {
-		struct ip_sf_list *psf, *psf_next;
+		struct ip_sf_list *psf;
 
 		spin_lock_bh(&pmc->lock);
 		psf = pmc->tomb;
 		pmc->tomb = NULL;
 		spin_unlock_bh(&pmc->lock);
-		for (; psf; psf = psf_next) {
-			psf_next = psf->sf_next;
-			kfree(psf);
-		}
+		ip_sf_list_clear_all(psf);
 	}
 	rcu_read_unlock();
 }
@@ -2123,7 +2142,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 
 static void ip_mc_clear_src(struct ip_mc_list *pmc)
 {
-	struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
+	struct ip_sf_list *tomb, *sources;
 
 	spin_lock_bh(&pmc->lock);
 	tomb = pmc->tomb;
@@ -2135,14 +2154,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
 	pmc->sfcount[MCAST_EXCLUDE] = 1;
 	spin_unlock_bh(&pmc->lock);
 
-	for (psf = tomb; psf; psf = nextpsf) {
-		nextpsf = psf->sf_next;
-		kfree(psf);
-	}
-	for (psf = sources; psf; psf = nextpsf) {
-		nextpsf = psf->sf_next;
-		kfree(psf);
-	}
+	ip_sf_list_clear_all(tomb);
+	ip_sf_list_clear_all(sources);
 }
 
 /* Join a multicast group
-- 
2.20.1


From 2b50768b91f8141ceffa611b90d767ff1c43cc8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 May 2019 18:35:16 -0700
Subject: [PATCH 06/38] ipv4/igmp: fix build error if !CONFIG_IP_MULTICAST

[ Upstream commit 903869bd10e6719b9df6718e785be7ec725df59f ]

ip_sf_list_clear_all() needs to be defined even if !CONFIG_IP_MULTICAST

Fixes: 3580d04aa674 ("ipv4/igmp: fix another memory leak in igmpv3_del_delrec()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1a8d36dd49d4..eb03153dfe12 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -188,6 +188,17 @@ static void ip_ma_put(struct ip_mc_list *im)
 	     pmc != NULL;					\
 	     pmc = rtnl_dereference(pmc->next_rcu))
 
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+	struct ip_sf_list *next;
+
+	while (psf) {
+		next = psf->sf_next;
+		kfree(psf);
+		psf = next;
+	}
+}
+
 #ifdef CONFIG_IP_MULTICAST
 
 /*
@@ -633,17 +644,6 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
 	}
 }
 
-static void ip_sf_list_clear_all(struct ip_sf_list *psf)
-{
-	struct ip_sf_list *next;
-
-	while (psf) {
-		next = psf->sf_next;
-		kfree(psf);
-		psf = next;
-	}
-}
-
 static void kfree_pmc(struct ip_mc_list *pmc)
 {
 	ip_sf_list_clear_all(pmc->sources);
-- 
2.20.1


From 4175cd4eae5a7de23da8c56d915c049316e5b805 Mon Sep 17 00:00:00 2001
From: Mike Manning <mmanning@vyatta.att-mail.com>
Date: Mon, 20 May 2019 19:57:17 +0100
Subject: [PATCH 07/38] ipv6: Consider sk_bound_dev_if when binding a raw
 socket to an address

[ Upstream commit 72f7cfab6f93a8ea825fab8ccfb016d064269f7f ]

IPv6 does not consider if the socket is bound to a device when binding
to an address. The result is that a socket can be bound to eth0 and
then bound to the address of eth1. If the device is a VRF, the result
is that a socket can only be bound to an address in the default VRF.

Resolve by considering the device if sk_bound_dev_if is set.

Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5a426226c762..5cb14eabfc65 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -287,7 +287,9 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 			/* Binding to link-local address requires an interface */
 			if (!sk->sk_bound_dev_if)
 				goto out_unlock;
+		}
 
+		if (sk->sk_bound_dev_if) {
 			err = -ENODEV;
 			dev = dev_get_by_index_rcu(sock_net(sk),
 						   sk->sk_bound_dev_if);
-- 
2.20.1


From 3f3433b3b4e186d8b1d8ec261485012d27a28db7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 22 May 2019 15:12:18 -0700
Subject: [PATCH 08/38] ipv6: Fix redirect with VRF

[ Upstream commit 31680ac265802397937d75461a2809a067b9fb93 ]

IPv6 redirect is broken for VRF. __ip6_route_redirect walks the FIB
entries looking for an exact match on ifindex. With VRF the flowi6_oif
is updated by l3mdev_update_flow to the l3mdev index and the
FLOWI_FLAG_SKIP_NH_OIF set in the flags to tell the lookup to skip the
device match. For redirects the device match is requires so use that
flag to know when the oif needs to be reset to the skb device index.

Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e470589fb93b..ab348489bd8a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2442,6 +2442,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	struct fib6_info *rt;
 	struct fib6_node *fn;
 
+	/* l3mdev_update_flow overrides oif if the device is enslaved; in
+	 * this case we must match on the real ingress device, so reset it
+	 */
+	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+		fl6->flowi6_oif = skb->dev->ifindex;
+
 	/* Get the "current" route for this destination and
 	 * check if the redirect has come from appropriate router.
 	 *
-- 
2.20.1


From 6d5d5472268a609cde0d399cf39cfce2193841c3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 May 2019 17:35:52 -0700
Subject: [PATCH 09/38] llc: fix skb leak in llc_build_and_send_ui_pkt()

[ Upstream commit 8fb44d60d4142cd2a440620cd291d346e23c131e ]

If llc_mac_hdr_init() returns an error, we must drop the skb
since no llc_build_and_send_ui_pkt() caller will take care of this.

BUG: memory leak
unreferenced object 0xffff8881202b6800 (size 2048):
  comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.590s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    1a 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
  backtrace:
    [<00000000e25b5abe>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<00000000e25b5abe>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<00000000e25b5abe>] slab_alloc mm/slab.c:3326 [inline]
    [<00000000e25b5abe>] __do_kmalloc mm/slab.c:3658 [inline]
    [<00000000e25b5abe>] __kmalloc+0x161/0x2c0 mm/slab.c:3669
    [<00000000a1ae188a>] kmalloc include/linux/slab.h:552 [inline]
    [<00000000a1ae188a>] sk_prot_alloc+0xd6/0x170 net/core/sock.c:1608
    [<00000000ded25bbe>] sk_alloc+0x35/0x2f0 net/core/sock.c:1662
    [<000000002ecae075>] llc_sk_alloc+0x35/0x170 net/llc/llc_conn.c:950
    [<00000000551f7c47>] llc_ui_create+0x7b/0x140 net/llc/af_llc.c:173
    [<0000000029027f0e>] __sock_create+0x164/0x250 net/socket.c:1430
    [<000000008bdec225>] sock_create net/socket.c:1481 [inline]
    [<000000008bdec225>] __sys_socket+0x69/0x110 net/socket.c:1523
    [<00000000b6439228>] __do_sys_socket net/socket.c:1532 [inline]
    [<00000000b6439228>] __se_sys_socket net/socket.c:1530 [inline]
    [<00000000b6439228>] __x64_sys_socket+0x1e/0x30 net/socket.c:1530
    [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

BUG: memory leak
unreferenced object 0xffff88811d750d00 (size 224):
  comm "syz-executor907", pid 7074, jiffies 4294943781 (age 8.600s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 f0 0c 24 81 88 ff ff 00 68 2b 20 81 88 ff ff  ...$.....h+ ....
  backtrace:
    [<0000000053026172>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<0000000053026172>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<0000000053026172>] slab_alloc_node mm/slab.c:3269 [inline]
    [<0000000053026172>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579
    [<00000000fa8f3c30>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198
    [<00000000d96fdafb>] alloc_skb include/linux/skbuff.h:1058 [inline]
    [<00000000d96fdafb>] alloc_skb_with_frags+0x5f/0x250 net/core/skbuff.c:5327
    [<000000000a34a2e7>] sock_alloc_send_pskb+0x269/0x2a0 net/core/sock.c:2225
    [<00000000ee39999b>] sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2242
    [<00000000e034d810>] llc_ui_sendmsg+0x10a/0x540 net/llc/af_llc.c:933
    [<00000000c0bc8445>] sock_sendmsg_nosec net/socket.c:652 [inline]
    [<00000000c0bc8445>] sock_sendmsg+0x54/0x70 net/socket.c:671
    [<000000003b687167>] __sys_sendto+0x148/0x1f0 net/socket.c:1964
    [<00000000922d78d9>] __do_sys_sendto net/socket.c:1976 [inline]
    [<00000000922d78d9>] __se_sys_sendto net/socket.c:1972 [inline]
    [<00000000922d78d9>] __x64_sys_sendto+0x2a/0x30 net/socket.c:1972
    [<00000000cec820c1>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301
    [<000000000c32554f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/llc_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
index 94425e421213..9e4b6bcf6920 100644
--- a/net/llc/llc_output.c
+++ b/net/llc/llc_output.c
@@ -72,6 +72,8 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
 	rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
 	if (likely(!rc))
 		rc = dev_queue_xmit(skb);
+	else
+		kfree_skb(skb);
 	return rc;
 }
 
-- 
2.20.1


From 22d0b646ee9a2cf1860c844d42dde6f7cd668e54 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 29 May 2019 10:59:44 +0300
Subject: [PATCH 10/38] mlxsw: spectrum_acl: Avoid warning after identical
 rules insertion

[ Upstream commit ef74422020aa8c224b00a927e3e47faac4d8fae3 ]

When identical rules are inserted, the latter one goes to C-TCAM. For
that, a second eRP with the same mask is created. These 2 eRPs by the
nature cannot be merged and also one cannot be parent of another.
Teach mlxsw_sp_acl_erp_delta_fill() about this possibility and handle it
gracefully.

Reported-by: Alex Kushnarov <alexanderk@mellanox.com>
Fixes: c22291f7cf45 ("mlxsw: spectrum: acl: Implement delta for ERP")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c    | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
index c1a9cc9a3292..4c98950380d5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
@@ -1171,13 +1171,12 @@ mlxsw_sp_acl_erp_delta_fill(const struct mlxsw_sp_acl_erp_key *parent_key,
 			return -EINVAL;
 	}
 	if (si == -1) {
-		/* The masks are the same, this cannot happen.
-		 * That means the caller is broken.
+		/* The masks are the same, this can happen in case eRPs with
+		 * the same mask were created in both A-TCAM and C-TCAM.
+		 * The only possible condition under which this can happen
+		 * is identical rule insertion. Delta is not possible here.
 		 */
-		WARN_ON(1);
-		*delta_start = 0;
-		*delta_mask = 0;
-		return 0;
+		return -EINVAL;
 	}
 	pmask = (unsigned char) parent_key->mask[__MASK_IDX(si)];
 	mask = (unsigned char) key->mask[__MASK_IDX(si)];
-- 
2.20.1


From d34ba53a6562ed79d84f3b7b4cd7e88d2b4e7248 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Date: Wed, 29 May 2019 07:02:11 +0000
Subject: [PATCH 11/38] net: dsa: mv88e6xxx: fix handling of upper half of
 STATS_TYPE_PORT

[ Upstream commit 84b3fd1fc9592d431e23b077e692fa4e3fd0f086 ]

Currently, the upper half of a 4-byte STATS_TYPE_PORT statistic ends
up in bits 47:32 of the return value, instead of bits 31:16 as they
should.

Fixes: 6e46e2d821bb ("net: dsa: mv88e6xxx: Fix u64 statistics")
Signed-off-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index f4e2db44ad91..720f1dde2c2d 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -910,7 +910,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
 			err = mv88e6xxx_port_read(chip, port, s->reg + 1, &reg);
 			if (err)
 				return U64_MAX;
-			high = reg;
+			low |= ((u32)reg) << 16;
 		}
 		break;
 	case STATS_TYPE_BANK1:
-- 
2.20.1


From 446fe83f2fbd59fc487b5acdb6a6588b305aef2d Mon Sep 17 00:00:00 2001
From: Andy Duan <fugang.duan@nxp.com>
Date: Thu, 23 May 2019 01:55:28 +0000
Subject: [PATCH 12/38] net: fec: fix the clk mismatch in failed_reset path

[ Upstream commit ce8d24f9a5965a58c588f9342689702a1024433c ]

Fix the clk mismatch in the error path "failed_reset" because
below error path will disable clk_ahb and clk_ipg directly, it
should use pm_runtime_put_noidle() instead of pm_runtime_put()
to avoid to call runtime resume callback.

Reported-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
Tested-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index a96ad20ee484..878ccce1dfcd 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3556,7 +3556,7 @@ fec_probe(struct platform_device *pdev)
 	if (fep->reg_phy)
 		regulator_disable(fep->reg_phy);
 failed_reset:
-	pm_runtime_put(&pdev->dev);
+	pm_runtime_put_noidle(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 failed_regulator:
 	clk_disable_unprepare(fep->clk_ahb);
-- 
2.20.1


From 6ca39eeb7cdff3cd130be36db9b9a3cb06da9ad8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 29 May 2019 15:36:10 -0700
Subject: [PATCH 13/38] net-gro: fix use-after-free read in napi_gro_frags()

[ Upstream commit a4270d6795b0580287453ea55974d948393e66ef ]

If a network driver provides to napi_gro_frags() an
skb with a page fragment of exactly 14 bytes, the call
to gro_pull_from_frag0() will 'consume' the fragment
by calling skb_frag_unref(skb, 0), and the page might
be freed and reused.

Reading eth->h_proto at the end of napi_frags_skb() might
read mangled data, or crash under specific debugging features.

BUG: KASAN: use-after-free in napi_frags_skb net/core/dev.c:5833 [inline]
BUG: KASAN: use-after-free in napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841
Read of size 2 at addr ffff88809366840c by task syz-executor599/8957

CPU: 1 PID: 8957 Comm: syz-executor599 Not tainted 5.2.0-rc1+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load_n_noabort+0xf/0x20 mm/kasan/generic_report.c:142
 napi_frags_skb net/core/dev.c:5833 [inline]
 napi_gro_frags+0xc6f/0xd10 net/core/dev.c:5841
 tun_get_user+0x2f3c/0x3ff0 drivers/net/tun.c:1991
 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2037
 call_write_iter include/linux/fs.h:1872 [inline]
 do_iter_readv_writev+0x5f8/0x8f0 fs/read_write.c:693
 do_iter_write fs/read_write.c:970 [inline]
 do_iter_write+0x184/0x610 fs/read_write.c:951
 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1015
 do_writev+0x15b/0x330 fs/read_write.c:1058

Fixes: a50e233c50db ("net-gro: restore frag0 optimization")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 255f99cb7c48..c6b2f6db0a9b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5804,7 +5804,6 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 	skb_reset_mac_header(skb);
 	skb_gro_reset_offset(skb);
 
-	eth = skb_gro_header_fast(skb, 0);
 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 		eth = skb_gro_header_slow(skb, hlen, 0);
 		if (unlikely(!eth)) {
@@ -5814,6 +5813,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 			return NULL;
 		}
 	} else {
+		eth = (const struct ethhdr *)skb->data;
 		gro_pull_from_frag0(skb, hlen);
 		NAPI_GRO_CB(skb)->frag0 += hlen;
 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
-- 
2.20.1


From 08aef1678d8405078133bed4176d53ffd2e5dae4 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Mon, 27 May 2019 11:04:17 +0000
Subject: [PATCH 14/38] net: mvneta: Fix err code path of probe

[ Upstream commit d484e06e25ebb937d841dac02ac1fe76ec7d4ddd ]

Fix below issues in err code path of probe:
1. we don't need to unregister_netdev() because the netdev isn't
registered.
2. when register_netdev() fails, we also need to destroy bm pool for
HWBM case.

Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management")
Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index c0a3718b2e2a..c7f4b72b3c07 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4674,7 +4674,7 @@ static int mvneta_probe(struct platform_device *pdev)
 	err = register_netdev(dev);
 	if (err < 0) {
 		dev_err(&pdev->dev, "failed to register\n");
-		goto err_free_stats;
+		goto err_netdev;
 	}
 
 	netdev_info(dev, "Using %s mac address %pM\n", mac_from,
@@ -4685,14 +4685,12 @@ static int mvneta_probe(struct platform_device *pdev)
 	return 0;
 
 err_netdev:
-	unregister_netdev(dev);
 	if (pp->bm_priv) {
 		mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_long, 1 << pp->id);
 		mvneta_bm_pool_destroy(pp->bm_priv, pp->pool_short,
 				       1 << pp->id);
 		mvneta_bm_put(pp->bm_priv);
 	}
-err_free_stats:
 	free_percpu(pp->stats);
 err_free_ports:
 	free_percpu(pp->ports);
-- 
2.20.1


From 8d2c1b21f2c15dd3f695d77ba7f4097a951fbcd6 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Wed, 29 May 2019 15:59:48 +0200
Subject: [PATCH 15/38] net: mvpp2: fix bad MVPP2_TXQ_SCHED_TOKEN_CNTR_REG
 queue value

[ Upstream commit 21808437214637952b61beaba6034d97880fbeb3 ]

MVPP2_TXQ_SCHED_TOKEN_CNTR_REG() expects the logical queue id but
the current code is passing the global tx queue offset, so it ends
up writing to unknown registers (between 0x8280 and 0x82fc, which
seemed to be unused by the hardware). This fixes the issue by using
the logical queue id instead.

Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 25fbed2b8d94..f4f076d7090e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -1455,7 +1455,7 @@ static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
 /* Set defaults to the MVPP2 port */
 static void mvpp2_defaults_set(struct mvpp2_port *port)
 {
-	int tx_port_num, val, queue, ptxq, lrxq;
+	int tx_port_num, val, queue, lrxq;
 
 	if (port->priv->hw_version == MVPP21) {
 		/* Update TX FIFO MIN Threshold */
@@ -1476,11 +1476,9 @@ static void mvpp2_defaults_set(struct mvpp2_port *port)
 	mvpp2_write(port->priv, MVPP2_TXP_SCHED_FIXED_PRIO_REG, 0);
 
 	/* Close bandwidth for all queues */
-	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) {
-		ptxq = mvpp2_txq_phys(port->id, queue);
+	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++)
 		mvpp2_write(port->priv,
-			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(ptxq), 0);
-	}
+			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(queue), 0);
 
 	/* Set refill period to 1 usec, refill tokens
 	 * and bucket size to maximum
@@ -2336,7 +2334,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
 	txq->descs_dma         = 0;
 
 	/* Set minimum bandwidth for disabled TXQs */
-	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0);
+	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->log_id), 0);
 
 	/* Set Tx descriptors queue starting address and size */
 	thread = mvpp2_cpu_to_thread(port->priv, get_cpu());
-- 
2.20.1


From c4d4b07c2dd7319873d964479ba82a05237e988e Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 28 May 2019 10:34:42 +0100
Subject: [PATCH 16/38] net: phy: marvell10g: report if the PHY fails to boot
 firmware

[ Upstream commit 3d3ced2ec5d71b99d72ae6910fbdf890bc2eccf0 ]

Some boards do not have the PHY firmware programmed in the 3310's flash,
which leads to the PHY not working as expected.  Warn the user when the
PHY fails to boot the firmware and refuse to initialise.

Fixes: 20b2af32ff3f ("net: phy: add Marvell Alaska X 88X3310 10Gigabit PHY support")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 100b401b1f4a..754cde873dde 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -31,6 +31,9 @@
 #define MV_PHY_ALASKA_NBT_QUIRK_REV	(MARVELL_PHY_ID_88X3310 | 0xa)
 
 enum {
+	MV_PMA_BOOT		= 0xc050,
+	MV_PMA_BOOT_FATAL	= BIT(0),
+
 	MV_PCS_BASE_T		= 0x0000,
 	MV_PCS_BASE_R		= 0x1000,
 	MV_PCS_1000BASEX	= 0x2000,
@@ -211,6 +214,16 @@ static int mv3310_probe(struct phy_device *phydev)
 	    (phydev->c45_ids.devices_in_package & mmd_mask) != mmd_mask)
 		return -ENODEV;
 
+	ret = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MV_PMA_BOOT);
+	if (ret < 0)
+		return ret;
+
+	if (ret & MV_PMA_BOOT_FATAL) {
+		dev_warn(&phydev->mdio.dev,
+			 "PHY failed to boot firmware, status=%04x\n", ret);
+		return -ENODEV;
+	}
+
 	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
-- 
2.20.1


From 676d1074d07b0d033f88c684b73c2ac75e92e0e4 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Thu, 23 May 2019 09:32:31 +0300
Subject: [PATCH 17/38] net: sched: don't use tc_action->order during action
 dump

[ Upstream commit 4097e9d250fb17958c1d9b94538386edd3f20144 ]

Function tcf_action_dump() relies on tc_action->order field when starting
nested nla to send action data to userspace. This approach breaks in
several cases:

- When multiple filters point to same shared action, tc_action->order field
  is overwritten each time it is attached to filter. This causes filter
  dump to output action with incorrect attribute for all filters that have
  the action in different position (different order) from the last set
  tc_action->order value.

- When action data is displayed using tc action API (RTM_GETACTION), action
  order is overwritten by tca_action_gd() according to its position in
  resulting array of nl attributes, which will break filter dump for all
  filters attached to that shared action that expect it to have different
  order value.

Don't rely on tc_action->order when dumping actions. Set nla according to
action position in resulting array of actions instead.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 5a87e271d35a..5b56b1cb2417 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -800,7 +800,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
 
 	for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
 		a = actions[i];
-		nest = nla_nest_start(skb, a->order);
+		nest = nla_nest_start(skb, i + 1);
 		if (nest == NULL)
 			goto nla_put_failure;
 		err = tcf_action_dump_1(skb, a, bind, ref);
@@ -1300,7 +1300,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 			ret = PTR_ERR(act);
 			goto err;
 		}
-		act->order = i;
 		attr_size += tcf_action_fill_size(act);
 		actions[i - 1] = act;
 	}
-- 
2.20.1


From 2635bfe75a6b05cb4238e5be03733eab5a9d8241 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Wed, 22 May 2019 10:05:09 +0000
Subject: [PATCH 18/38] net: stmmac: fix reset gpio free missing

[ Upstream commit 49ce881c0d4c4a7a35358d9dccd5f26d0e56fc61 ]

Commit 984203ceff27 ("net: stmmac: mdio: remove reset gpio free")
removed the reset gpio free, when the driver is unbinded or rmmod,
we miss the gpio free.

This patch uses managed API to request the reset gpio, so that the
gpio could be freed properly.

Fixes: 984203ceff27 ("net: stmmac: mdio: remove reset gpio free")
Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index bdd351597b55..093a223fe408 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -267,7 +267,8 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 			of_property_read_u32_array(np,
 				"snps,reset-delays-us", data->delays, 3);
 
-			if (gpio_request(data->reset_gpio, "mdio-reset"))
+			if (devm_gpio_request(priv->device, data->reset_gpio,
+					      "mdio-reset"))
 				return 0;
 		}
 
-- 
2.20.1


From 488a0ee6cd7215c89caa2c11bc8d9b3501bfb6f9 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 29 May 2019 07:44:01 +0200
Subject: [PATCH 19/38] r8169: fix MAC address being lost in PCI D3

[ Upstream commit 59715171fbd0172a579576f46821031800a63bc5 ]

(At least) RTL8168e forgets its MAC address in PCI D3. To fix this set
the MAC address when resuming. For resuming from runtime-suspend we
had this in place already, for resuming from S3/S5 it was missing.

The commit referenced as being fixed isn't wrong, it's just the first
one where the patch applies cleanly.

Fixes: 0f07bd850d36 ("r8169: use dev_get_drvdata where possible")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reported-by: Albert Astals Cid <aacid@kde.org>
Tested-by: Albert Astals Cid <aacid@kde.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index ed651dde6ef9..6d176be51a6b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6914,6 +6914,8 @@ static int rtl8169_resume(struct device *device)
 	struct net_device *dev = dev_get_drvdata(device);
 	struct rtl8169_private *tp = netdev_priv(dev);
 
+	rtl_rar_set(tp, dev->dev_addr);
+
 	clk_prepare_enable(tp->clk);
 
 	if (netif_running(dev))
@@ -6947,6 +6949,7 @@ static int rtl8169_runtime_resume(struct device *device)
 {
 	struct net_device *dev = dev_get_drvdata(device);
 	struct rtl8169_private *tp = netdev_priv(dev);
+
 	rtl_rar_set(tp, dev->dev_addr);
 
 	if (!tp->TxDescArray)
-- 
2.20.1


From 0dd0f90b8f9303d8e647394ee6816f488028e3b9 Mon Sep 17 00:00:00 2001
From: Kloetzke Jan <Jan.Kloetzke@preh.de>
Date: Tue, 21 May 2019 13:18:40 +0000
Subject: [PATCH 20/38] usbnet: fix kernel crash after disconnect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit ad70411a978d1e6e97b1e341a7bde9a79af0c93d ]

When disconnecting cdc_ncm the kernel sporadically crashes shortly
after the disconnect:

  [   57.868812] Unable to handle kernel NULL pointer dereference at virtual address 00000000
  ...
  [   58.006653] PC is at 0x0
  [   58.009202] LR is at call_timer_fn+0xec/0x1b4
  [   58.013567] pc : [<0000000000000000>] lr : [<ffffff80080f5130>] pstate: 00000145
  [   58.020976] sp : ffffff8008003da0
  [   58.024295] x29: ffffff8008003da0 x28: 0000000000000001
  [   58.029618] x27: 000000000000000a x26: 0000000000000100
  [   58.034941] x25: 0000000000000000 x24: ffffff8008003e68
  [   58.040263] x23: 0000000000000000 x22: 0000000000000000
  [   58.045587] x21: 0000000000000000 x20: ffffffc68fac1808
  [   58.050910] x19: 0000000000000100 x18: 0000000000000000
  [   58.056232] x17: 0000007f885aff8c x16: 0000007f883a9f10
  [   58.061556] x15: 0000000000000001 x14: 000000000000006e
  [   58.066878] x13: 0000000000000000 x12: 00000000000000ba
  [   58.072201] x11: ffffffc69ff1db30 x10: 0000000000000020
  [   58.077524] x9 : 8000100008001000 x8 : 0000000000000001
  [   58.082847] x7 : 0000000000000800 x6 : ffffff8008003e70
  [   58.088169] x5 : ffffffc69ff17a28 x4 : 00000000ffff138b
  [   58.093492] x3 : 0000000000000000 x2 : 0000000000000000
  [   58.098814] x1 : 0000000000000000 x0 : 0000000000000000
  ...
  [   58.205800] [<          (null)>]           (null)
  [   58.210521] [<ffffff80080f5298>] expire_timers+0xa0/0x14c
  [   58.215937] [<ffffff80080f542c>] run_timer_softirq+0xe8/0x128
  [   58.221702] [<ffffff8008081120>] __do_softirq+0x298/0x348
  [   58.227118] [<ffffff80080a6304>] irq_exit+0x74/0xbc
  [   58.232009] [<ffffff80080e17dc>] __handle_domain_irq+0x78/0xac
  [   58.237857] [<ffffff8008080cf4>] gic_handle_irq+0x80/0xac
  ...

The crash happens roughly 125..130ms after the disconnect. This
correlates with the 'delay' timer that is started on certain USB tx/rx
errors in the URB completion handler.

The problem is a race of usbnet_stop() with usbnet_start_xmit(). In
usbnet_stop() we call usbnet_terminate_urbs() to cancel all URBs in
flight. This only makes sense if no new URBs are submitted
concurrently, though. But the usbnet_start_xmit() can run at the same
time on another CPU which almost unconditionally submits an URB. The
error callback of the new URB will then schedule the timer after it was
already stopped.

The fix adds a check if the tx queue is stopped after the tx list lock
has been taken. This should reliably prevent the submission of new URBs
while usbnet_terminate_urbs() does its job. The same thing is done on
the rx side even though it might be safe due to other flags that are
checked there.

Signed-off-by: Jan Klötzke <Jan.Kloetzke@preh.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 504282af27e5..921cc0571bd0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -506,6 +506,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 
 	if (netif_running (dev->net) &&
 	    netif_device_present (dev->net) &&
+	    test_bit(EVENT_DEV_OPEN, &dev->flags) &&
 	    !test_bit (EVENT_RX_HALT, &dev->flags) &&
 	    !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) {
 		switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) {
@@ -1431,6 +1432,11 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		spin_unlock_irqrestore(&dev->txq.lock, flags);
 		goto drop;
 	}
+	if (netif_queue_stopped(net)) {
+		usb_autopm_put_interface_async(dev->intf);
+		spin_unlock_irqrestore(&dev->txq.lock, flags);
+		goto drop;
+	}
 
 #ifdef CONFIG_PM
 	/* if this triggers the device is still a sleep */
-- 
2.20.1


From 9f80d4f90aae159499761c2dbcadd205a9f338d1 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 10 May 2019 10:26:23 -0500
Subject: [PATCH 21/38] net/mlx5: Avoid double free in fs init error unwinding
 path

[ Upstream commit 9414277a5df3669c67e818708c0f881597e0118e ]

In below code flow, for ingress acl table root ns memory leads
to double free.

mlx5_init_fs
  init_ingress_acls_root_ns()
    init_ingress_acl_root_ns
       kfree(steering->esw_ingress_root_ns);
       /* steering->esw_ingress_root_ns is not marked NULL */
  mlx5_cleanup_fs
    cleanup_ingress_acls_root_ns
       steering->esw_ingress_root_ns non NULL check passes.
       kfree(steering->esw_ingress_root_ns);
       /* double free */

Similar issue exist for other tables.

Hence zero out the pointers to not process the table again.

Fixes: 9b93ab981e3bf ("net/mlx5: Separate ingress/egress namespaces for each vport")
Fixes: 40c3eebb49e51 ("net/mlx5: Add support in RDMA RX steering")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 581cc145795d..92df4c744e32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2429,6 +2429,7 @@ static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 
 	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
 }
 
 static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
@@ -2443,6 +2444,7 @@ static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 
 	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
 }
 
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
@@ -2611,6 +2613,7 @@ static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
 	for (i--; i >= 0; i--)
 		cleanup_root_ns(steering->esw_egress_root_ns[i]);
 	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
 	return err;
 }
 
@@ -2638,6 +2641,7 @@ static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
 	for (i--; i >= 0; i--)
 		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
 	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
 	return err;
 }
 
-- 
2.20.1


From 9f611399aecd43755ef63c55a61c75d592fec41e Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 20 May 2019 15:45:36 +1200
Subject: [PATCH 22/38] tipc: Avoid copying bytes beyond the supplied data

TLV_SET is called with a data pointer and a len parameter that tells us
how many bytes are pointed to by data. When invoking memcpy() we need
to careful to only copy len bytes.

Previously we would copy TLV_LENGTH(len) bytes which would copy an extra
4 bytes past the end of the data pointer which newer GCC versions
complain about.

 In file included from test.c:17:
 In function 'TLV_SET',
     inlined from 'test' at test.c:186:5:
 /usr/include/linux/tipc_config.h:317:3:
 warning: 'memcpy' forming offset [33, 36] is out of the bounds [0, 32]
 of object 'bearer_name' with type 'char[32]' [-Warray-bounds]
     memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 test.c: In function 'test':
 test.c::161:10: note:
 'bearer_name' declared here
     char bearer_name[TIPC_MAX_BEARER_NAME];
          ^~~~~~~~~~~

We still want to ensure any padding bytes at the end are initialised, do
this with a explicit memset() rather than copy bytes past the end of
data. Apply the same logic to TCM_SET.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_config.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 4b2c93b1934c..4955e1a9f1bc 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -307,8 +307,10 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
 	tlv_ptr = (struct tlv_desc *)tlv;
 	tlv_ptr->tlv_type = htons(type);
 	tlv_ptr->tlv_len  = htons(tlv_len);
-	if (len && data)
-		memcpy(TLV_DATA(tlv_ptr), data, tlv_len);
+	if (len && data) {
+		memcpy(TLV_DATA(tlv_ptr), data, len);
+		memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len);
+	}
 	return TLV_SPACE(len);
 }
 
@@ -405,8 +407,10 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
 	tcm_hdr->tcm_len   = htonl(msg_len);
 	tcm_hdr->tcm_type  = htons(cmd);
 	tcm_hdr->tcm_flags = htons(flags);
-	if (data_len && data)
+	if (data_len && data) {
 		memcpy(TCM_DATA(msg), data, data_len);
+		memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len);
+	}
 	return TCM_SPACE(data_len);
 }
 
-- 
2.20.1


From 790a757e76a49ced4531aa707755cbea0abe6b5e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Fri, 10 May 2019 10:40:08 -0500
Subject: [PATCH 23/38] net/mlx5: Allocate root ns memory using kzalloc to
 match kfree

[ Upstream commit 25fa506b70cadb580c1e9cbd836d6417276d4bcd ]

root ns is yet another fs core node which is freed using kfree() by
tree_put_node().
Rest of the other fs core objects are also allocated using kmalloc
variants.

However, root ns memory is allocated using kvzalloc().
Hence allocate root ns memory using kzalloc().

Fixes: 2530236303d9e ("net/mlx5_core: Flow steering tree initialization")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 92df4c744e32..e29e5beb239d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2286,7 +2286,7 @@ static struct mlx5_flow_root_namespace
 		cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type);
 
 	/* Create the root namespace */
-	root_ns = kvzalloc(sizeof(*root_ns), GFP_KERNEL);
+	root_ns = kzalloc(sizeof(*root_ns), GFP_KERNEL);
 	if (!root_ns)
 		return NULL;
 
-- 
2.20.1


From 2d2697386f2e4a11e896025ba9fa779cfe3d110d Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 May 2019 12:55:10 -0700
Subject: [PATCH 24/38] net/mlx5e: Disable rxhash when CQE compress is enabled

[ Upstream commit c0194e2d0ef0e5ce5e21a35640d23a706827ae28 ]

When CQE compression is enabled (Multi-host systems), compressed CQEs
might arrive to the driver rx, compressed CQEs don't have a valid hash
offload and the driver already reports a hash value of 0 and invalid hash
type on the skb for compressed CQEs, but this is not good enough.

On a congested PCIe, where CQE compression will kick in aggressively,
gro will deliver lots of out of order packets due to the invalid hash
and this might cause a serious performance drop.

The only valid solution, is to disable rxhash offload at all when CQE
compression is favorable (Multi-host systems).

Fixes: 7219ab34f184 ("net/mlx5e: CQE compression")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 46157e2a1e5a..1e2688e2ed47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3750,6 +3750,12 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
 	}
 
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
+		features &= ~NETIF_F_RXHASH;
+		if (netdev->features & NETIF_F_RXHASH)
+			netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n");
+	}
+
 	mutex_unlock(&priv->state_lock);
 
 	return features;
@@ -3875,6 +3881,9 @@ int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr)
 	memcpy(&priv->tstamp, &config, sizeof(config));
 	mutex_unlock(&priv->state_lock);
 
+	/* might need to fix some features */
+	netdev_update_features(priv->netdev);
+
 	return copy_to_user(ifr->ifr_data, &config,
 			    sizeof(config)) ? -EFAULT : 0;
 }
@@ -4734,6 +4743,10 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	if (!priv->channels.params.scatter_fcs_en)
 		netdev->features  &= ~NETIF_F_RXFCS;
 
+	/* prefere CQE compression over rxhash */
+	if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS))
+		netdev->features &= ~NETIF_F_RXHASH;
+
 #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f)
 	if (FT_CAP(flow_modify_en) &&
 	    FT_CAP(modify_root) &&
-- 
2.20.1


From b2f4e2d854863e216e5f484caf1e3b5b312d2fa2 Mon Sep 17 00:00:00 2001
From: "Tan, Tee Min" <tee.min.tan@intel.com>
Date: Tue, 21 May 2019 12:55:42 +0800
Subject: [PATCH 25/38] net: stmmac: fix ethtool flow control not able to
 get/set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently ethtool was not able to get/set the flow control due to a
missing "!". It will always return -EOPNOTSUPP even the device is
flow control supported.

This patch fixes the condition check for ethtool flow control get/set
function for ETHTOOL_LINK_MODE_Asym_Pause_BIT.

Fixes: 3c1bcc8614db (“net: ethernet: Convert phydev advertize and supported from u32 to link mode”)
Signed-off-by: Tan, Tee Min <tee.min.tan@intel.com>
Reviewed-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon, Weifeng <weifeng.voon@intel.com@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 3c749c327cbd..e09522c5509a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -460,7 +460,7 @@ stmmac_get_pauseparam(struct net_device *netdev,
 	} else {
 		if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
 				       netdev->phydev->supported) ||
-		    linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+		    !linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
 				      netdev->phydev->supported))
 			return;
 	}
@@ -491,7 +491,7 @@ stmmac_set_pauseparam(struct net_device *netdev,
 	} else {
 		if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
 				       phy->supported) ||
-		    linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+		    !linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
 				      phy->supported))
 			return -EOPNOTSUPP;
 	}
-- 
2.20.1


From fecbea67df3acd4958d77a49664bf16f0c35399e Mon Sep 17 00:00:00 2001
From: Weifeng Voon <weifeng.voon@intel.com>
Date: Tue, 21 May 2019 13:38:38 +0800
Subject: [PATCH 26/38] net: stmmac: dma channel control register need to be
 init first

stmmac_init_chan() needs to be called before stmmac_init_rx_chan() and
stmmac_init_tx_chan(). This is because if PBLx8 is to be used,
"DMA_CH(#i)_Control.PBLx8" needs to be set before programming
"DMA_CH(#i)_TX_Control.TxPBL" and "DMA_CH(#i)_RX_Control.RxPBL".

Fixes: 47f2a9ce527a ("net: stmmac: dma channel init prepared for multiple queues")
Reviewed-by: Zhang, Baoli <baoli.zhang@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Weifeng Voon <weifeng.voon@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 48712437d0da..3c409862c52e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2208,6 +2208,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	if (priv->plat->axi)
 		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
 
+	/* DMA CSR Channel configuration */
+	for (chan = 0; chan < dma_csr_ch; chan++)
+		stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan);
+
 	/* DMA RX Channel Configuration */
 	for (chan = 0; chan < rx_channels_count; chan++) {
 		rx_q = &priv->rx_queue[chan];
@@ -2233,10 +2237,6 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 				       tx_q->tx_tail_addr, chan);
 	}
 
-	/* DMA CSR Channel configuration */
-	for (chan = 0; chan < dma_csr_ch; chan++)
-		stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan);
-
 	return ret;
 }
 
-- 
2.20.1


From 4868f9b90ed3e87d526b758f1fa49b1083dd6a08 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:54 -0400
Subject: [PATCH 27/38] bnxt_en: Fix aggregation buffer leak under OOM
 condition.

[ Upstream commit 296d5b54163964b7ae536b8b57dfbd21d4e868e1 ]

For every RX packet, the driver replenishes all buffers used for that
packet and puts them back into the RX ring and RX aggregation ring.
In one code path where the RX packet has one RX buffer and one or more
aggregation buffers, we missed recycling the aggregation buffer(s) if
we are unable to allocate a new SKB buffer.  This leads to the
aggregation ring slowly running out of buffers over time.  Fix it
by properly recycling the aggregation buffers.

Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")
Reported-by: Rakesh Hemnani <rhemnani@fb.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 52ade133b57c..0519d79e500b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1640,6 +1640,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
 		skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr);
 		bnxt_reuse_rx_data(rxr, cons, data);
 		if (!skb) {
+			if (agg_bufs)
+				bnxt_reuse_rx_agg_bufs(cpr, cp_cons, agg_bufs);
 			rc = -ENOMEM;
 			goto next_rx;
 		}
-- 
2.20.1


From d6ae8d374bd217f186d72be2f8c82cd160bbebba Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:55 -0400
Subject: [PATCH 28/38] bnxt_en: Fix possible BUG() condition when calling
 pci_disable_msix().

[ Upstream commit 1b3f0b75c39f534278a895c117282014e9d0ae1f ]

When making configuration changes, the driver calls bnxt_close_nic()
and then bnxt_open_nic() for the changes to take effect.  A parameter
irq_re_init is passed to the call sequence to indicate if IRQ
should be re-initialized.  This irq_re_init parameter needs to
be included in the bnxt_reserve_rings() call.  bnxt_reserve_rings()
can only call pci_disable_msix() if the irq_re_init parameter is
true, otherwise it may hit BUG() because some IRQs may not have been
freed yet.

Fixes: 41e8d7983752 ("bnxt_en: Modify the ring reservation functions for 57500 series chips.")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 13 +++++++------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c     |  2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0519d79e500b..96567c7e8fc5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7514,22 +7514,23 @@ static void bnxt_clear_int_mode(struct bnxt *bp)
 	bp->flags &= ~BNXT_FLAG_USING_MSIX;
 }
 
-int bnxt_reserve_rings(struct bnxt *bp)
+int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init)
 {
 	int tcs = netdev_get_num_tc(bp->dev);
-	bool reinit_irq = false;
+	bool irq_cleared = false;
 	int rc;
 
 	if (!bnxt_need_reserve_rings(bp))
 		return 0;
 
-	if (BNXT_NEW_RM(bp) && (bnxt_get_num_msix(bp) != bp->total_irqs)) {
+	if (irq_re_init && BNXT_NEW_RM(bp) &&
+	    bnxt_get_num_msix(bp) != bp->total_irqs) {
 		bnxt_ulp_irq_stop(bp);
 		bnxt_clear_int_mode(bp);
-		reinit_irq = true;
+		irq_cleared = true;
 	}
 	rc = __bnxt_reserve_rings(bp);
-	if (reinit_irq) {
+	if (irq_cleared) {
 		if (!rc)
 			rc = bnxt_init_int_mode(bp);
 		bnxt_ulp_irq_restart(bp, rc);
@@ -8428,7 +8429,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 			return rc;
 		}
 	}
-	rc = bnxt_reserve_rings(bp);
+	rc = bnxt_reserve_rings(bp, irq_re_init);
 	if (rc)
 		return rc;
 	if ((bp->flags & BNXT_FLAG_RFS) &&
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index cf81ace7a6e6..2305e6c943b4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1778,7 +1778,7 @@ unsigned int bnxt_get_avail_stat_ctxs_for_en(struct bnxt *bp);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
 unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp);
 int bnxt_get_avail_msix(struct bnxt *bp, int num);
-int bnxt_reserve_rings(struct bnxt *bp);
+int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
 int bnxt_hwrm_set_pause(struct bnxt *);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index adabbe94a259..e1460e391952 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -788,7 +788,7 @@ static int bnxt_set_channels(struct net_device *dev,
 			 */
 		}
 	} else {
-		rc = bnxt_reserve_rings(bp);
+		rc = bnxt_reserve_rings(bp, true);
 	}
 
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index cf475873ce81..bfa342a98d08 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -147,7 +147,7 @@ static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
 			bnxt_close_nic(bp, true, false);
 			rc = bnxt_open_nic(bp, true, false);
 		} else {
-			rc = bnxt_reserve_rings(bp);
+			rc = bnxt_reserve_rings(bp, true);
 		}
 	}
 	if (rc) {
-- 
2.20.1


From 35f72a7d992b822c47026ffe4765a2339ae3f755 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Wed, 22 May 2019 19:12:56 -0400
Subject: [PATCH 29/38] bnxt_en: Reduce memory usage when running in kdump
 kernel.

[ Upstream commit d629522e1d66561f38e5c8d4f52bb6d254ec0707 ]

Skip RDMA context memory allocations, reduce to 1 ring, and disable
TPA when running in the kdump kernel.  Without this patch, the driver
fails to initialize with memory allocation errors when running in a
typical kdump kernel.

Fixes: cf6daed098d1 ("bnxt_en: Increase context memory allocations on 57500 chips for RDMA.")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 96567c7e8fc5..30cafe4cdb6e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6342,7 +6342,7 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp)
 	if (!ctx || (ctx->flags & BNXT_CTX_FLAG_INITED))
 		return 0;
 
-	if (bp->flags & BNXT_FLAG_ROCE_CAP) {
+	if ((bp->flags & BNXT_FLAG_ROCE_CAP) && !is_kdump_kernel()) {
 		pg_lvl = 2;
 		extra_qps = 65536;
 		extra_srqs = 8192;
@@ -10340,7 +10340,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
 
 	if (sh)
 		bp->flags |= BNXT_FLAG_SHARED_RINGS;
-	dflt_rings = netif_get_num_default_rss_queues();
+	dflt_rings = is_kdump_kernel() ? 1 : netif_get_num_default_rss_queues();
 	/* Reduce default rings on multi-port cards so that total default
 	 * rings do not exceed CPU count.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 2305e6c943b4..0fb93280ad4e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -20,6 +20,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/rhashtable.h>
+#include <linux/crash_dump.h>
 #include <net/devlink.h>
 #include <net/dst_metadata.h>
 #include <net/xdp.h>
@@ -1367,7 +1368,8 @@ struct bnxt {
 #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
 #define BNXT_RX_PAGE_MODE(bp)	((bp)->flags & BNXT_FLAG_RX_PAGE_MODE)
 #define BNXT_SUPPORTS_TPA(bp)	(!BNXT_CHIP_TYPE_NITRO_A0(bp) &&	\
-				 !(bp->flags & BNXT_FLAG_CHIP_P5))
+				 !(bp->flags & BNXT_FLAG_CHIP_P5) &&	\
+				 !is_kdump_kernel())
 
 /* Chip class phase 5 */
 #define BNXT_CHIP_P5(bp)			\
-- 
2.20.1


From 74c4f58233de47c9b3506ba5394626b99237b85d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 24 May 2019 10:34:30 -0700
Subject: [PATCH 30/38] net/tls: fix lowat calculation if some data came from
 previous record

[ Upstream commit 46a1695960d0600d58da7af33c65f24f3d839674 ]

If some of the data came from the previous record, i.e. from
the rx_list it had already been decrypted, so it's not counted
towards the "decrypted" variable, but the "copied" variable.
Take that into account when checking lowat.

When calculating lowat target we need to pass the original len.
E.g. if lowat is at 80, len is 100 and we had 30 bytes on rx_list
target would currently be incorrectly calculated as 70, even though
we only need 50 more bytes to make up the 80.

Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Tested-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 29d6af43dd24..467c68503799 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1685,13 +1685,12 @@ int tls_sw_recvmsg(struct sock *sk,
 		copied = err;
 	}
 
-	len = len - copied;
-	if (len) {
-		target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-	} else {
+	if (len <= copied)
 		goto recv_end;
-	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	len = len - copied;
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	do {
 		bool retain_skb = false;
@@ -1826,7 +1825,7 @@ int tls_sw_recvmsg(struct sock *sk,
 		}
 
 		/* If we have a new message from strparser, continue now. */
-		if (decrypted >= target && !ctx->recv_pkt)
+		if (decrypted + copied >= target && !ctx->recv_pkt)
 			break;
 	} while (len);
 
-- 
2.20.1


From f032178cfd5d7f7fa7dcb655d9999d31c21a34d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 24 May 2019 10:34:31 -0700
Subject: [PATCH 31/38] selftests/tls: test for lowat overshoot with multiple
 records

[ Upstream commit 7718a855cd7ae9fc27a2aa1532ee105d52eb7634 ]

Set SO_RCVLOWAT and test it gets respected when gathering
data from multiple records.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/tls.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 47ddfc154036..01efbcd2258c 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -575,6 +575,25 @@ TEST_F(tls, recv_peek_large_buf_mult_recs)
 	EXPECT_EQ(memcmp(test_str, buf, len), 0);
 }
 
+TEST_F(tls, recv_lowat)
+{
+	char send_mem[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+	char recv_mem[20];
+	int lowat = 8;
+
+	EXPECT_EQ(send(self->fd, send_mem, 10, 0), 10);
+	EXPECT_EQ(send(self->fd, send_mem, 5, 0), 5);
+
+	memset(recv_mem, 0, 20);
+	EXPECT_EQ(setsockopt(self->cfd, SOL_SOCKET, SO_RCVLOWAT,
+			     &lowat, sizeof(lowat)), 0);
+	EXPECT_EQ(recv(self->cfd, recv_mem, 1, MSG_WAITALL), 1);
+	EXPECT_EQ(recv(self->cfd, recv_mem + 1, 6, MSG_WAITALL), 6);
+	EXPECT_EQ(recv(self->cfd, recv_mem + 7, 10, 0), 8);
+
+	EXPECT_EQ(memcmp(send_mem, recv_mem, 10), 0);
+	EXPECT_EQ(memcmp(send_mem, recv_mem + 10, 5), 0);
+}
 
 TEST_F(tls, pollin)
 {
-- 
2.20.1


From 7046edfac25c54c0d1d68abdac08169bf2465d90 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 24 May 2019 10:34:32 -0700
Subject: [PATCH 32/38] net/tls: fix no wakeup on partial reads

[ Upstream commit 04b25a5411f966c2e586909a8496553b71876fae ]

When tls_sw_recvmsg() partially copies a record it pops that
record from ctx->recv_pkt and places it on rx_list.

Next iteration of tls_sw_recvmsg() reads from rx_list via
process_rx_list() before it enters the decryption loop.
If there is no more records to be read tls_wait_data()
will put the process on the wait queue and got to sleep.
This is incorrect, because some data was already copied
in process_rx_list().

In case of RPC connections process may never get woken up,
because peer also simply blocks in read().

I think this may also fix a similar issue when BPF is at
play, because after __tcp_bpf_recvmsg() returns some data
we subtract it from len and use continue to restart the
loop, but len could have just reached 0, so again we'd
sleep unnecessarily. That's added by:
commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling")

Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records")
Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Tested-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 467c68503799..d350ff73a391 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1692,7 +1692,7 @@ int tls_sw_recvmsg(struct sock *sk,
 	len = len - copied;
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
-	do {
+	while (len && (decrypted + copied < target || ctx->recv_pkt)) {
 		bool retain_skb = false;
 		bool zc = false;
 		int to_decrypt;
@@ -1823,11 +1823,7 @@ int tls_sw_recvmsg(struct sock *sk,
 		} else {
 			break;
 		}
-
-		/* If we have a new message from strparser, continue now. */
-		if (decrypted + copied >= target && !ctx->recv_pkt)
-			break;
-	} while (len);
+	}
 
 recv_end:
 	if (num_async) {
-- 
2.20.1


From 2d47f6eb72072125b62c53eadfeb386bd73f97a0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 24 May 2019 10:34:33 -0700
Subject: [PATCH 33/38] selftests/tls: add test for sleeping even though there
 is data

[ Upstream commit 043556d0917a1a5ea58795fe1656a2bce06d2649 ]

Add a test which sends 15 bytes of data, and then tries
to read 10 byes twice.  Previously the second read would
sleep indifinitely, since the record was already decrypted
and there is only 5 bytes left, not full 10.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/tls.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 01efbcd2258c..278c86134556 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -442,6 +442,21 @@ TEST_F(tls, multiple_send_single_recv)
 	EXPECT_EQ(memcmp(send_mem, recv_mem + send_len, send_len), 0);
 }
 
+TEST_F(tls, single_send_multiple_recv_non_align)
+{
+	const unsigned int total_len = 15;
+	const unsigned int recv_len = 10;
+	char recv_mem[recv_len * 2];
+	char send_mem[total_len];
+
+	EXPECT_GE(send(self->fd, send_mem, total_len, 0), 0);
+	memset(recv_mem, 0, total_len);
+
+	EXPECT_EQ(recv(self->cfd, recv_mem, recv_len, 0), recv_len);
+	EXPECT_EQ(recv(self->cfd, recv_mem + recv_len, recv_len, 0), 5);
+	EXPECT_EQ(memcmp(send_mem, recv_mem, total_len), 0);
+}
+
 TEST_F(tls, recv_partial)
 {
 	char const *test_str = "test_read_partial";
-- 
2.20.1


From 8ca102363de971c09f27b0d09d28128bf0bb66f0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:00 -0700
Subject: [PATCH 34/38] net/tls: avoid NULL-deref on resync during device
 removal

[ Upstream commit 38030d7cb77963ba84cdbe034806e2b81245339f ]

When netdev with active kTLS sockets in unregistered
notifier callback walks the offloaded sockets and
cleans up offload state.  RX data may still be processed,
however, and if resync was requested prior to device
removal we would hit a NULL pointer dereference on
ctx->netdev use.

Make sure resync is under the device offload lock
and NULL-check the netdev pointer.

This should be safe, because the pointer is set to
NULL either in the netdev notifier (under said lock)
or when socket is completely dead and no resync can
happen.

The other access to ctx->netdev in tls_validate_xmit_skb()
does not dereference the pointer, it just checks it against
other device pointer, so it should be pretty safe (perhaps
we can add a READ_ONCE/WRITE_ONCE there, if paranoid).

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 14dedb24fa7b..c9c1891e8201 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -573,8 +573,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
+	struct net_device *netdev;
 	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
@@ -588,10 +588,15 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
-						      seq + TLS_HEADER_SIZE - 1,
-						      rcd_sn);
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+		seq += TLS_HEADER_SIZE - 1;
+		down_read(&device_offload_lock);
+		netdev = tls_ctx->netdev;
+		if (netdev)
+			netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq,
+							      rcd_sn);
+		up_read(&device_offload_lock);
+	}
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
-- 
2.20.1


From 9cc57a32209531dfae3120dddf14c83e71cbd1cb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:01 -0700
Subject: [PATCH 35/38] net/tls: fix state removal with feature flags off

[ Upstream commit 3686637e507b48525fcea6fb91e1988bdbc14530 ]

TLS offload drivers shouldn't (and currently don't) block
the TLS offload feature changes based on whether there are
active offloaded connections or not.

This seems to be a good idea, because we want the admin to
be able to disable the TLS offload at any time, and there
is no clean way of disabling it for active connections
(TX side is quite problematic).  So if features are cleared
existing connections will stay offloaded until they close,
and new connections will not attempt offload to a given
device.

However, the offload state removal handling is currently
broken if feature flags get cleared while there are
active TLS offloads.

RX side will completely bail from cleanup, even on normal
remove path, leaving device state dangling, potentially
causing issues when the 5-tuple is reused.  It will also
fail to release the netdev reference.

Remove the RX-side warning message, in next release cycle
it should be printed when features are disabled, rather
than when connection dies, but for that we need a more
efficient method of finding connection of a given netdev
(a'la BPF offload code).

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index c9c1891e8201..e9547322e24a 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -948,12 +948,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk)
 	if (!netdev)
 		goto out;
 
-	if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
-		pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
-				   __func__);
-		goto out;
-	}
-
 	netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
 					TLS_OFFLOAD_CTX_DIR_RX);
 
-- 
2.20.1


From 0e69f58a18093ddd2f4f6486974ff6f092e9cbb7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 21 May 2019 19:02:02 -0700
Subject: [PATCH 36/38] net/tls: don't ignore netdev notifications if no TLS
 features

[ Upstream commit c3f4a6c39cf269a40d45f813c05fa830318ad875 ]

On device surprise removal path (the notifier) we can't
bail just because the features are disabled.  They may
have been enabled during the lifetime of the device.
This bug leads to leaking netdev references and
use-after-frees if there are active connections while
device features are cleared.

Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index e9547322e24a..b443840e82b8 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1006,7 +1006,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
-	if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
+	if (!dev->tlsdev_ops &&
+	    !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
 		return NOTIFY_DONE;
 
 	switch (event) {
-- 
2.20.1


From 132f56f8ec29e95495f7c86fee6d09aec12c770b Mon Sep 17 00:00:00 2001
From: Vishal Kulkarni <vishal@chelsio.com>
Date: Thu, 23 May 2019 08:07:21 +0530
Subject: [PATCH 37/38] cxgb4: Revert "cxgb4: Remove SGE_HOST_PAGE_SIZE
 dependency on page size"

[ Upstream commit ab0610efabb4c4f419a531455708caf1dd29357e ]

This reverts commit 2391b0030e241386d710df10e53e2cfc3c5d4fc1 which has
introduced regression. Now SGE's BAR2 Doorbell/GTS Page Size is
interpreted correctly in the firmware itself by using actual host
page size. Hence previous commit needs to be reverted.

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index a3544041ad32..8d63eed628d7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -7206,10 +7206,21 @@ int t4_fixup_host_params(struct adapter *adap, unsigned int page_size,
 			 unsigned int cache_line_size)
 {
 	unsigned int page_shift = fls(page_size) - 1;
+	unsigned int sge_hps = page_shift - 10;
 	unsigned int stat_len = cache_line_size > 64 ? 128 : 64;
 	unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size;
 	unsigned int fl_align_log = fls(fl_align) - 1;
 
+	t4_write_reg(adap, SGE_HOST_PAGE_SIZE_A,
+		     HOSTPAGESIZEPF0_V(sge_hps) |
+		     HOSTPAGESIZEPF1_V(sge_hps) |
+		     HOSTPAGESIZEPF2_V(sge_hps) |
+		     HOSTPAGESIZEPF3_V(sge_hps) |
+		     HOSTPAGESIZEPF4_V(sge_hps) |
+		     HOSTPAGESIZEPF5_V(sge_hps) |
+		     HOSTPAGESIZEPF6_V(sge_hps) |
+		     HOSTPAGESIZEPF7_V(sge_hps));
+
 	if (is_t4(adap->params.chip)) {
 		t4_set_reg_field(adap, SGE_CONTROL_A,
 				 INGPADBOUNDARY_V(INGPADBOUNDARY_M) |
-- 
2.20.1


From a56bf38d03a5c14f31c9f314e2ab909d2c5d5f81 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 30 May 2019 18:01:21 -0400
Subject: [PATCH 38/38] net: correct zerocopy refcnt with udp MSG_MORE

[ Upstream commit 100f6d8e09905c59be45b6316f8f369c0be1b2d8 ]

TCP zerocopy takes a uarg reference for every skb, plus one for the
tcp_sendmsg_locked datapath temporarily, to avoid reaching refcnt zero
as it builds, sends and frees skbs inside its inner loop.

UDP and RAW zerocopy do not send inside the inner loop so do not need
the extra sock_zerocopy_get + sock_zerocopy_put pair. Commit
52900d22288ed ("udp: elide zerocopy operation in hot path") introduced
extra_uref to pass the initial reference taken in sock_zerocopy_alloc
to the first generated skb.

But, sock_zerocopy_realloc takes this extra reference at the start of
every call. With MSG_MORE, no new skb may be generated to attach the
extra_uref to, so refcnt is incorrectly 2 with only one skb.

Do not take the extra ref if uarg && !tcp, which implies MSG_MORE.
Update extra_uref accordingly.

This conditional assignment triggers a false positive may be used
uninitialized warning, so have to initialize extra_uref at define.

Changes v1->v2: fix typo in Fixes SHA1

Fixes: 52900d22288e7 ("udp: elide zerocopy operation in hot path")
Reported-by: syzbot <syzkaller@googlegroups.com>
Diagnosed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c     | 6 +++++-
 net/ipv4/ip_output.c  | 4 ++--
 net/ipv6/ip6_output.c | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 40796b8bf820..e5bfd42fd083 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1001,7 +1001,11 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 			uarg->len++;
 			uarg->bytelen = bytelen;
 			atomic_set(&sk->sk_zckey, ++next);
-			sock_zerocopy_get(uarg);
+
+			/* no extra ref when appending to datagram (MSG_MORE) */
+			if (sk->sk_type == SOCK_STREAM)
+				sock_zerocopy_get(uarg);
+
 			return uarg;
 		}
 	}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e8bb2e85c5a4..ac770940adb9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -883,7 +883,7 @@ static int __ip_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref;
+	bool paged, extra_uref = false;
 	u32 tskey = 0;
 
 	skb = skb_peek_tail(queue);
@@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
-		extra_uref = true;
+		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e51f3c648b09..b5e0c85bcd57 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1275,7 +1275,7 @@ static int __ip6_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref;
+	bool paged, extra_uref = false;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1344,7 +1344,7 @@ static int __ip6_append_data(struct sock *sk,
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
-		extra_uref = true;
+		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
 		if (rt->dst.dev->features & NETIF_F_SG &&
 		    csummode == CHECKSUM_PARTIAL) {
 			paged = true;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2018-08-21  5:36 ` Greg KH
@ 2018-08-23  7:04   ` Greg KH
  0 siblings, 0 replies; 10+ messages in thread
From: Greg KH @ 2018-08-23  7:04 UTC (permalink / raw)
  To: David Miller; +Cc: stable

On Tue, Aug 21, 2018 at 07:36:22AM +0200, Greg KH wrote:
> On Mon, Aug 20, 2018 at 05:59:37PM -0700, David Miller wrote:
> > 
> > Please queue up the following networking bug fixes for v4.17
> > and v4.18 -stable, respectively.
> 
> All now queued up, thanks!

Also, I want to mark 4.17.y end-of-life this week, so no need to make
patches up for that tree anymore.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2018-08-21  0:59 David Miller
@ 2018-08-21  5:36 ` Greg KH
  2018-08-23  7:04   ` Greg KH
  0 siblings, 1 reply; 10+ messages in thread
From: Greg KH @ 2018-08-21  5:36 UTC (permalink / raw)
  To: David Miller; +Cc: stable

On Mon, Aug 20, 2018 at 05:59:37PM -0700, David Miller wrote:
> 
> Please queue up the following networking bug fixes for v4.17
> and v4.18 -stable, respectively.

All now queued up, thanks!

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [GIT] Networking
@ 2018-08-21  0:59 David Miller
  2018-08-21  5:36 ` Greg KH
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2018-08-21  0:59 UTC (permalink / raw)
  To: stable

[-- Attachment #1: Type: Text/Plain, Size: 107 bytes --]


Please queue up the following networking bug fixes for v4.17
and v4.18 -stable, respectively.

Thank you!

[-- Attachment #2: net_417.mbox --]
[-- Type: Application/Octet-Stream, Size: 18959 bytes --]

From 34795e4f64d15bb43a594ab928786fffc54357d8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 15 Aug 2018 12:14:05 -0700
Subject: [PATCH 1/8] isdn: Disable IIOCDBGVAR

[ Upstream commit 5e22002aa8809e2efab2da95855f73f63e14a36c ]

It was possible to directly leak the kernel address where the isdn_dev
structure pointer was stored. This is a kernel ASLR bypass for anyone
with access to the ioctl. The code had been present since the beginning
of git history, though this shouldn't ever be needed for normal operation,
therefore remove it.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Karsten Keil <isdn@linux-pingi.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/i4l/isdn_common.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 7c6f3f5d9d9a..66ac7fa6e034 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1640,13 +1640,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg)
 			} else
 				return -EINVAL;
 		case IIOCDBGVAR:
-			if (arg) {
-				if (copy_to_user(argp, &dev, sizeof(ulong)))
-					return -EFAULT;
-				return 0;
-			} else
-				return -EINVAL;
-			break;
+			return -EINVAL;
 		default:
 			if ((cmd & IIOCDRVCTL) == IIOCDRVCTL)
 				cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK;
-- 
2.17.1


From 58f98f69b856c4aed59a73d5408877451e1ed55f Mon Sep 17 00:00:00 2001
From: Jeremy Cline <jcline@redhat.com>
Date: Mon, 13 Aug 2018 22:23:13 +0000
Subject: [PATCH 2/8] net: sock_diag: Fix spectre v1 gadget in
 __sock_diag_cmd()

[ Upstream commit 66b51b0a0341fd42ce657739bdae0561b0410a85 ]

req->sdiag_family is a user-controlled value that's used as an array
index. Sanitize it after the bounds check to avoid speculative
out-of-bounds array access.

This also protects the sock_is_registered() call, so this removes the
sanitize call there.

Fixes: e978de7a6d38 ("net: socket: Fix potential spectre v1 gadget in sock_is_registered")
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: konrad.wilk@oracle.com
Cc: jamie.iles@oracle.com
Cc: liran.alon@oracle.com
Cc: stable@vger.kernel.org
Signed-off-by: Jeremy Cline <jcline@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_diag.c | 2 ++
 net/socket.c         | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..3312a5849a97 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/tcp.h>
 #include <linux/workqueue.h>
+#include <linux/nospec.h>
 
 #include <linux/inet_diag.h>
 #include <linux/sock_diag.h>
@@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	if (req->sdiag_family >= AF_MAX)
 		return -EINVAL;
+	req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
 
 	if (sock_diag_handlers[req->sdiag_family] == NULL)
 		sock_load_diag_module(req->sdiag_family, 0);
diff --git a/net/socket.c b/net/socket.c
index 6a6aa84b64c1..0316b380389e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2694,8 +2694,7 @@ EXPORT_SYMBOL(sock_unregister);
 
 bool sock_is_registered(int family)
 {
-	return family < NPROTO &&
-		rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]);
+	return family < NPROTO && rcu_access_pointer(net_families[family]);
 }
 
 static int __init sock_init(void)
-- 
2.17.1


From a490d91b1f86978092efa4e3e22ae723499620e2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 12 Aug 2018 13:26:26 +0200
Subject: [PATCH 3/8] r8169: don't use MSI-X on RTL8168g

[ Upstream commit 7c53a722459c1d6ffb0f5b2058c06ca8980b8600 ]

There have been two reports that network doesn't come back on resume
from suspend when using MSI-X. Both cases affect the same chip version
(RTL8168g - version 40), on different systems. Falling back to MSI
fixes the issue.
Even though we don't really have a proof yet that the network chip
version is to blame, let's disable MSI-X for this version.

Reported-by: Steve Dodd <steved424@gmail.com>
Reported-by: Lou Reed <gogen@disroot.org>
Tested-by: Steve Dodd <steved424@gmail.com>
Tested-by: Lou Reed <gogen@disroot.org>
Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 764b25fa470c..d19065857afe 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8066,6 +8066,11 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
 		flags = PCI_IRQ_LEGACY;
+	} else if (tp->mac_version == RTL_GIGA_MAC_VER_40) {
+		/* This version was reported to have issues with resume
+		 * from suspend when using MSI-X
+		 */
+		flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
 	} else {
 		flags = PCI_IRQ_ALL_TYPES;
 	}
-- 
2.17.1


From e70b2dd0f11e3050cf3919c43960033504678392 Mon Sep 17 00:00:00 2001
From: Jian-Hong Pan <jian-hong@endlessm.com>
Date: Fri, 17 Aug 2018 13:07:35 +0800
Subject: [PATCH 4/8] r8169: don't use MSI-X on RTL8106e

[ Upstream commit 7bb05b85bc2d1a1b647b91424b2ed4a18e6ecd81 ]

Found the ethernet network on ASUS X441UAR doesn't come back on resume
from suspend when using MSI-X.  The chip is RTL8106e - version 39.

[   21.848357] libphy: r8169: probed
[   21.848473] r8169 0000:02:00.0 eth0: RTL8106e, 0c:9d:92:32:67:b4, XID
44900000, IRQ 127
[   22.518860] r8169 0000:02:00.0 enp2s0: renamed from eth0
[   29.458041] Generic PHY r8169-200:00: attached PHY driver [Generic
PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
[   63.227398] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
flow control off
[  124.514648] Generic PHY r8169-200:00: attached PHY driver [Generic
PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)

Here is the ethernet controller in detail:

02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd.
RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136]
(rev 07)
	Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast
Ethernet controller [1043:200f]
	Flags: bus master, fast devsel, latency 0, IRQ 16
	I/O ports at e000 [size=256]
	Memory at ef100000 (64-bit, non-prefetchable) [size=4K]
	Memory at e0000000 (64-bit, prefetchable) [size=16K]
	Capabilities: <access denied>
	Kernel driver in use: r8169
	Kernel modules: r8169

Falling back to MSI fixes the issue.

Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index d19065857afe..d3c6ce074571 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8061,17 +8061,20 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
 {
 	unsigned int flags;
 
-	if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
 		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
 		flags = PCI_IRQ_LEGACY;
-	} else if (tp->mac_version == RTL_GIGA_MAC_VER_40) {
+		break;
+	case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40:
 		/* This version was reported to have issues with resume
 		 * from suspend when using MSI-X
 		 */
 		flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
-	} else {
+		break;
+	default:
 		flags = PCI_IRQ_ALL_TYPES;
 	}
 
-- 
2.17.1


From e531c9cd4d87ae8cc3b1b2d02e65f5a2ada2bc4f Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Sun, 19 Aug 2018 15:05:04 +0800
Subject: [PATCH 5/8] ip_vti: fix a null pointer deferrence when create vti
 fallback tunnel

[ Upstream commit cd1aa9c2c665cafbd05b83507d3f1096f3912aa4 ]

After set fb_tunnels_only_for_init_net to 1, the itn->fb_tunnel_dev will
be NULL and will cause following crash:

[ 2742.849298] BUG: unable to handle kernel NULL pointer dereference at 0000000000000941
[ 2742.851380] PGD 800000042c21a067 P4D 800000042c21a067 PUD 42aaed067 PMD 0
[ 2742.852818] Oops: 0002 [#1] SMP PTI
[ 2742.853570] CPU: 7 PID: 2484 Comm: unshare Kdump: loaded Not tainted 4.18.0-rc8+ #2
[ 2742.855163] Hardware name: Fedora Project OpenStack Nova, BIOS seabios-1.7.5-11.el7 04/01/2014
[ 2742.856970] RIP: 0010:vti_init_net+0x3a/0x50 [ip_vti]
[ 2742.858034] Code: 90 83 c0 48 c7 c2 20 a1 83 c0 48 89 fb e8 6e 3b f6 ff 85 c0 75 22 8b 0d f4 19 00 00 48 8b 93 00 14 00 00 48 8b 14 ca 48 8b 12 <c6> 82 41 09 00 00 04 c6 82 38 09 00 00 45 5b c3 66 0f 1f 44 00 00
[ 2742.861940] RSP: 0018:ffff9be28207fde0 EFLAGS: 00010246
[ 2742.863044] RAX: 0000000000000000 RBX: ffff8a71ebed4980 RCX: 0000000000000013
[ 2742.864540] RDX: 0000000000000000 RSI: 0000000000000013 RDI: ffff8a71ebed4980
[ 2742.866020] RBP: ffff8a71ea717000 R08: ffffffffc083903c R09: ffff8a71ea717000
[ 2742.867505] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a71ebed4980
[ 2742.868987] R13: 0000000000000013 R14: ffff8a71ea5b49c0 R15: 0000000000000000
[ 2742.870473] FS:  00007f02266c9740(0000) GS:ffff8a71ffdc0000(0000) knlGS:0000000000000000
[ 2742.872143] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2742.873340] CR2: 0000000000000941 CR3: 000000042bc20006 CR4: 00000000001606e0
[ 2742.874821] Call Trace:
[ 2742.875358]  ops_init+0x38/0xf0
[ 2742.876078]  setup_net+0xd9/0x1f0
[ 2742.876789]  copy_net_ns+0xb7/0x130
[ 2742.877538]  create_new_namespaces+0x11a/0x1d0
[ 2742.878525]  unshare_nsproxy_namespaces+0x55/0xa0
[ 2742.879526]  ksys_unshare+0x1a7/0x330
[ 2742.880313]  __x64_sys_unshare+0xe/0x20
[ 2742.881131]  do_syscall_64+0x5b/0x180
[ 2742.881933]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reproduce:
echo 1 > /proc/sys/net/core/fb_tunnels_only_for_init_net
modprobe ip_vti
unshare -n

Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces")
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f091ccad9af..f38cb21d773d 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net)
 	if (err)
 		return err;
 	itn = net_generic(net, vti_net_id);
-	vti_fb_tunnel_init(itn->fb_tunnel_dev);
+	if (itn->fb_tunnel_dev)
+		vti_fb_tunnel_init(itn->fb_tunnel_dev);
 	return 0;
 }
 
-- 
2.17.1


From 01781418870fe6a2350d6a41031b9d2d82f983ad Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 14 Aug 2018 17:28:26 +0800
Subject: [PATCH 6/8] cls_matchall: fix tcf_unbind_filter missing

[ Upstream commit a51c76b4dfb30496dc65396a957ef0f06af7fb22 ]

Fix tcf_unbind_filter missing in cls_matchall as this will trigger
WARN_ON() in cbq_destroy_class().

Fixes: fd62d9f5c575f ("net/sched: matchall: Fix configuration race")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2ba721a590a7..a74b4d6ee186 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -122,6 +122,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	if (!head)
 		return;
 
+	tcf_unbind_filter(tp, &head->res);
+
 	if (!tc_skip_hw(head->flags))
 		mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
 
-- 
2.17.1


From 13b1c6926e686f195e4039f62f4938757fd68ee1 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 18 Jul 2018 18:10:50 +0200
Subject: [PATCH 7/8] net: ethernet: mvneta: Fix napi structure mixup on armada
 3700

[ Upstream commit 7a86f05faf112463cfbbdfd222012e247de461a1 ]

The mvneta Ethernet driver is used on a few different Marvell SoCs.
Some SoCs have per cpu interrupts for Ethernet events. Some SoCs have
a single interrupt, independent of the CPU. The driver handles this by
having a per CPU napi structure when there are per CPU interrupts, and
a global napi structure when there is a single interrupt.

When the napi core calls mvneta_poll(), it passes the napi
instance. This was not being propagated through the call chain, and
instead the per-cpu napi instance was passed to napi_gro_receive()
call. This breaks when there is a single global napi instance.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Fixes: 2636ac3cc2b4 ("net: mvneta: Add network support for Armada 3700 SoC")
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 0ad2f3f7da85..ec84db47d82d 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1901,10 +1901,10 @@ static void mvneta_rxq_drop_pkts(struct mvneta_port *pp,
 }
 
 /* Main rx processing when using software buffer management */
-static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_swbm(struct napi_struct *napi,
+			  struct mvneta_port *pp, int rx_todo,
 			  struct mvneta_rx_queue *rxq)
 {
-	struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
 	struct net_device *dev = pp->dev;
 	int rx_done;
 	u32 rcvd_pkts = 0;
@@ -1959,7 +1959,7 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 
 			skb->protocol = eth_type_trans(skb, dev);
 			mvneta_rx_csum(pp, rx_status, skb);
-			napi_gro_receive(&port->napi, skb);
+			napi_gro_receive(napi, skb);
 
 			rcvd_pkts++;
 			rcvd_bytes += rx_bytes;
@@ -2001,7 +2001,7 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 
 		mvneta_rx_csum(pp, rx_status, skb);
 
-		napi_gro_receive(&port->napi, skb);
+		napi_gro_receive(napi, skb);
 	}
 
 	if (rcvd_pkts) {
@@ -2020,10 +2020,10 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 }
 
 /* Main rx processing when using hardware buffer management */
-static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_hwbm(struct napi_struct *napi,
+			  struct mvneta_port *pp, int rx_todo,
 			  struct mvneta_rx_queue *rxq)
 {
-	struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
 	struct net_device *dev = pp->dev;
 	int rx_done;
 	u32 rcvd_pkts = 0;
@@ -2085,7 +2085,7 @@ static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
 
 			skb->protocol = eth_type_trans(skb, dev);
 			mvneta_rx_csum(pp, rx_status, skb);
-			napi_gro_receive(&port->napi, skb);
+			napi_gro_receive(napi, skb);
 
 			rcvd_pkts++;
 			rcvd_bytes += rx_bytes;
@@ -2129,7 +2129,7 @@ static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
 
 		mvneta_rx_csum(pp, rx_status, skb);
 
-		napi_gro_receive(&port->napi, skb);
+		napi_gro_receive(napi, skb);
 	}
 
 	if (rcvd_pkts) {
@@ -2722,9 +2722,11 @@ static int mvneta_poll(struct napi_struct *napi, int budget)
 	if (rx_queue) {
 		rx_queue = rx_queue - 1;
 		if (pp->bm_priv)
-			rx_done = mvneta_rx_hwbm(pp, budget, &pp->rxqs[rx_queue]);
+			rx_done = mvneta_rx_hwbm(napi, pp, budget,
+						 &pp->rxqs[rx_queue]);
 		else
-			rx_done = mvneta_rx_swbm(pp, budget, &pp->rxqs[rx_queue]);
+			rx_done = mvneta_rx_swbm(napi, pp, budget,
+						 &pp->rxqs[rx_queue]);
 	}
 
 	if (rx_done < budget) {
-- 
2.17.1


From ba648c3d43bd7a255e4ad24e967735e0c3254c02 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Fri, 10 Aug 2018 11:36:27 +0800
Subject: [PATCH 8/8] net: mvneta: fix mvneta_config_rss on armada 3700

[ Upstream commit 0f5c6c30a0f8c629b92ecdaef61b315c43fde10a ]

The mvneta Ethernet driver is used on a few different Marvell SoCs.
Some SoCs have per cpu interrupts for Ethernet events, the driver uses
a per CPU napi structure for this case. Some SoCs such as armada 3700
have a single interrupt for Ethernet events, the driver uses a global
napi structure for this case.

Current mvneta_config_rss() always operates the per cpu napi structure.
Fix it by operating a global napi for "single interrupt" case, and per
cpu napi structure for remaining cases.

Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Fixes: 2636ac3cc2b4 ("net: mvneta: Add network support for Armada 3700 SoC")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 31 +++++++++++++++++----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index ec84db47d82d..82ac1d10f239 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4020,13 +4020,18 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 
 	on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
 
-	/* We have to synchronise on the napi of each CPU */
-	for_each_online_cpu(cpu) {
-		struct mvneta_pcpu_port *pcpu_port =
-			per_cpu_ptr(pp->ports, cpu);
+	if (!pp->neta_armada3700) {
+		/* We have to synchronise on the napi of each CPU */
+		for_each_online_cpu(cpu) {
+			struct mvneta_pcpu_port *pcpu_port =
+				per_cpu_ptr(pp->ports, cpu);
 
-		napi_synchronize(&pcpu_port->napi);
-		napi_disable(&pcpu_port->napi);
+			napi_synchronize(&pcpu_port->napi);
+			napi_disable(&pcpu_port->napi);
+		}
+	} else {
+		napi_synchronize(&pp->napi);
+		napi_disable(&pp->napi);
 	}
 
 	pp->rxq_def = pp->indir[0];
@@ -4043,12 +4048,16 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 	mvneta_percpu_elect(pp);
 	spin_unlock(&pp->lock);
 
-	/* We have to synchronise on the napi of each CPU */
-	for_each_online_cpu(cpu) {
-		struct mvneta_pcpu_port *pcpu_port =
-			per_cpu_ptr(pp->ports, cpu);
+	if (!pp->neta_armada3700) {
+		/* We have to synchronise on the napi of each CPU */
+		for_each_online_cpu(cpu) {
+			struct mvneta_pcpu_port *pcpu_port =
+				per_cpu_ptr(pp->ports, cpu);
 
-		napi_enable(&pcpu_port->napi);
+			napi_enable(&pcpu_port->napi);
+		}
+	} else {
+		napi_enable(&pp->napi);
 	}
 
 	netif_tx_start_all_queues(pp->dev);
-- 
2.17.1


[-- Attachment #3: net_418.mbox --]
[-- Type: Application/Octet-Stream, Size: 20483 bytes --]

From e1e20a5d810037330ecc4dbdcf40df4e7293d351 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 15 Aug 2018 12:14:05 -0700
Subject: [PATCH 1/9] isdn: Disable IIOCDBGVAR

[ Upstream commit 5e22002aa8809e2efab2da95855f73f63e14a36c ]

It was possible to directly leak the kernel address where the isdn_dev
structure pointer was stored. This is a kernel ASLR bypass for anyone
with access to the ioctl. The code had been present since the beginning
of git history, though this shouldn't ever be needed for normal operation,
therefore remove it.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Karsten Keil <isdn@linux-pingi.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/i4l/isdn_common.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 7a501dbe7123..6a5b3f00f9ad 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1640,13 +1640,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg)
 			} else
 				return -EINVAL;
 		case IIOCDBGVAR:
-			if (arg) {
-				if (copy_to_user(argp, &dev, sizeof(ulong)))
-					return -EFAULT;
-				return 0;
-			} else
-				return -EINVAL;
-			break;
+			return -EINVAL;
 		default:
 			if ((cmd & IIOCDRVCTL) == IIOCDRVCTL)
 				cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK;
-- 
2.17.1


From e2570d35e5f35b8936731567992f77f2723f066b Mon Sep 17 00:00:00 2001
From: Jeremy Cline <jcline@redhat.com>
Date: Mon, 13 Aug 2018 22:23:13 +0000
Subject: [PATCH 2/9] net: sock_diag: Fix spectre v1 gadget in
 __sock_diag_cmd()

[ Upstream commit 66b51b0a0341fd42ce657739bdae0561b0410a85 ]

req->sdiag_family is a user-controlled value that's used as an array
index. Sanitize it after the bounds check to avoid speculative
out-of-bounds array access.

This also protects the sock_is_registered() call, so this removes the
sanitize call there.

Fixes: e978de7a6d38 ("net: socket: Fix potential spectre v1 gadget in sock_is_registered")
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: konrad.wilk@oracle.com
Cc: jamie.iles@oracle.com
Cc: liran.alon@oracle.com
Cc: stable@vger.kernel.org
Signed-off-by: Jeremy Cline <jcline@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_diag.c | 2 ++
 net/socket.c         | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..3312a5849a97 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/tcp.h>
 #include <linux/workqueue.h>
+#include <linux/nospec.h>
 
 #include <linux/inet_diag.h>
 #include <linux/sock_diag.h>
@@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	if (req->sdiag_family >= AF_MAX)
 		return -EINVAL;
+	req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
 
 	if (sock_diag_handlers[req->sdiag_family] == NULL)
 		sock_load_diag_module(req->sdiag_family, 0);
diff --git a/net/socket.c b/net/socket.c
index 8c24d5dc4bc8..4ac3b834cce9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2690,8 +2690,7 @@ EXPORT_SYMBOL(sock_unregister);
 
 bool sock_is_registered(int family)
 {
-	return family < NPROTO &&
-		rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]);
+	return family < NPROTO && rcu_access_pointer(net_families[family]);
 }
 
 static int __init sock_init(void)
-- 
2.17.1


From 3d9aef8e75052df02480d1cd3eb9438e16b6f570 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 14 Aug 2018 19:10:50 +0200
Subject: [PATCH 3/9] hv/netvsc: Fix NULL dereference at single queue mode
 fallback

[ Upstream commit b19b46346f483ae055fa027cb2d5c2ca91484b91 ]

The recent commit 916c5e1413be ("hv/netvsc: fix handling of fallback
to single queue mode") tried to fix the fallback behavior to a single
queue mode, but it changed the function to return zero incorrectly,
while the function should return an object pointer.  Eventually this
leads to a NULL dereference at the callers that expect non-NULL
value.

Fix it by returning the proper net_device object.

Fixes: 916c5e1413be ("hv/netvsc: fix handling of fallback to single queue mode")
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 408ece27131c..2a5209f23f29 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1338,7 +1338,7 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	/* setting up multiple channels failed */
 	net_device->max_chn = 1;
 	net_device->num_chn = 1;
-	return 0;
+	return net_device;
 
 err_dev_remv:
 	rndis_filter_device_remove(dev, net_device);
-- 
2.17.1


From e4740e8da5288c7297e8f42f2a776f19cb36ba65 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 12 Aug 2018 13:26:26 +0200
Subject: [PATCH 4/9] r8169: don't use MSI-X on RTL8168g

[ Upstream commit 7c53a722459c1d6ffb0f5b2058c06ca8980b8600 ]

There have been two reports that network doesn't come back on resume
from suspend when using MSI-X. Both cases affect the same chip version
(RTL8168g - version 40), on different systems. Falling back to MSI
fixes the issue.
Even though we don't really have a proof yet that the network chip
version is to blame, let's disable MSI-X for this version.

Reported-by: Steve Dodd <steved424@gmail.com>
Reported-by: Lou Reed <gogen@disroot.org>
Tested-by: Steve Dodd <steved424@gmail.com>
Tested-by: Lou Reed <gogen@disroot.org>
Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index eaedc11ed686..a0311fe1a457 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7544,6 +7544,11 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
 		flags = PCI_IRQ_LEGACY;
+	} else if (tp->mac_version == RTL_GIGA_MAC_VER_40) {
+		/* This version was reported to have issues with resume
+		 * from suspend when using MSI-X
+		 */
+		flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
 	} else {
 		flags = PCI_IRQ_ALL_TYPES;
 	}
-- 
2.17.1


From 0938ae849a8ac940a512769159a49655c0096b57 Mon Sep 17 00:00:00 2001
From: Jian-Hong Pan <jian-hong@endlessm.com>
Date: Fri, 17 Aug 2018 13:07:35 +0800
Subject: [PATCH 5/9] r8169: don't use MSI-X on RTL8106e

[ Upstream commit 7bb05b85bc2d1a1b647b91424b2ed4a18e6ecd81 ]

Found the ethernet network on ASUS X441UAR doesn't come back on resume
from suspend when using MSI-X.  The chip is RTL8106e - version 39.

[   21.848357] libphy: r8169: probed
[   21.848473] r8169 0000:02:00.0 eth0: RTL8106e, 0c:9d:92:32:67:b4, XID
44900000, IRQ 127
[   22.518860] r8169 0000:02:00.0 enp2s0: renamed from eth0
[   29.458041] Generic PHY r8169-200:00: attached PHY driver [Generic
PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)
[   63.227398] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full -
flow control off
[  124.514648] Generic PHY r8169-200:00: attached PHY driver [Generic
PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE)

Here is the ethernet controller in detail:

02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd.
RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136]
(rev 07)
	Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast
Ethernet controller [1043:200f]
	Flags: bus master, fast devsel, latency 0, IRQ 16
	I/O ports at e000 [size=256]
	Memory at ef100000 (64-bit, non-prefetchable) [size=4K]
	Memory at e0000000 (64-bit, prefetchable) [size=16K]
	Capabilities: <access denied>
	Kernel driver in use: r8169
	Kernel modules: r8169

Falling back to MSI fixes the issue.

Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index a0311fe1a457..9ceb34bac3a9 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7539,17 +7539,20 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
 {
 	unsigned int flags;
 
-	if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
 		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
 		flags = PCI_IRQ_LEGACY;
-	} else if (tp->mac_version == RTL_GIGA_MAC_VER_40) {
+		break;
+	case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40:
 		/* This version was reported to have issues with resume
 		 * from suspend when using MSI-X
 		 */
 		flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
-	} else {
+		break;
+	default:
 		flags = PCI_IRQ_ALL_TYPES;
 	}
 
-- 
2.17.1


From 9c4d61b26c72a100da8e77bab02717ab8c582dde Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Sun, 19 Aug 2018 15:05:04 +0800
Subject: [PATCH 6/9] ip_vti: fix a null pointer deferrence when create vti
 fallback tunnel

[ Upstream commit cd1aa9c2c665cafbd05b83507d3f1096f3912aa4 ]

After set fb_tunnels_only_for_init_net to 1, the itn->fb_tunnel_dev will
be NULL and will cause following crash:

[ 2742.849298] BUG: unable to handle kernel NULL pointer dereference at 0000000000000941
[ 2742.851380] PGD 800000042c21a067 P4D 800000042c21a067 PUD 42aaed067 PMD 0
[ 2742.852818] Oops: 0002 [#1] SMP PTI
[ 2742.853570] CPU: 7 PID: 2484 Comm: unshare Kdump: loaded Not tainted 4.18.0-rc8+ #2
[ 2742.855163] Hardware name: Fedora Project OpenStack Nova, BIOS seabios-1.7.5-11.el7 04/01/2014
[ 2742.856970] RIP: 0010:vti_init_net+0x3a/0x50 [ip_vti]
[ 2742.858034] Code: 90 83 c0 48 c7 c2 20 a1 83 c0 48 89 fb e8 6e 3b f6 ff 85 c0 75 22 8b 0d f4 19 00 00 48 8b 93 00 14 00 00 48 8b 14 ca 48 8b 12 <c6> 82 41 09 00 00 04 c6 82 38 09 00 00 45 5b c3 66 0f 1f 44 00 00
[ 2742.861940] RSP: 0018:ffff9be28207fde0 EFLAGS: 00010246
[ 2742.863044] RAX: 0000000000000000 RBX: ffff8a71ebed4980 RCX: 0000000000000013
[ 2742.864540] RDX: 0000000000000000 RSI: 0000000000000013 RDI: ffff8a71ebed4980
[ 2742.866020] RBP: ffff8a71ea717000 R08: ffffffffc083903c R09: ffff8a71ea717000
[ 2742.867505] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a71ebed4980
[ 2742.868987] R13: 0000000000000013 R14: ffff8a71ea5b49c0 R15: 0000000000000000
[ 2742.870473] FS:  00007f02266c9740(0000) GS:ffff8a71ffdc0000(0000) knlGS:0000000000000000
[ 2742.872143] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2742.873340] CR2: 0000000000000941 CR3: 000000042bc20006 CR4: 00000000001606e0
[ 2742.874821] Call Trace:
[ 2742.875358]  ops_init+0x38/0xf0
[ 2742.876078]  setup_net+0xd9/0x1f0
[ 2742.876789]  copy_net_ns+0xb7/0x130
[ 2742.877538]  create_new_namespaces+0x11a/0x1d0
[ 2742.878525]  unshare_nsproxy_namespaces+0x55/0xa0
[ 2742.879526]  ksys_unshare+0x1a7/0x330
[ 2742.880313]  __x64_sys_unshare+0xe/0x20
[ 2742.881131]  do_syscall_64+0x5b/0x180
[ 2742.881933]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reproduce:
echo 1 > /proc/sys/net/core/fb_tunnels_only_for_init_net
modprobe ip_vti
unshare -n

Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces")
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f091ccad9af..f38cb21d773d 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net)
 	if (err)
 		return err;
 	itn = net_generic(net, vti_net_id);
-	vti_fb_tunnel_init(itn->fb_tunnel_dev);
+	if (itn->fb_tunnel_dev)
+		vti_fb_tunnel_init(itn->fb_tunnel_dev);
 	return 0;
 }
 
-- 
2.17.1


From 5b3ab6016c3abdd045538c613d6c2a933a254028 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 18 Jul 2018 18:10:50 +0200
Subject: [PATCH 7/9] net: ethernet: mvneta: Fix napi structure mixup on armada
 3700

[ Upstream commit 7a86f05faf112463cfbbdfd222012e247de461a1 ]

The mvneta Ethernet driver is used on a few different Marvell SoCs.
Some SoCs have per cpu interrupts for Ethernet events. Some SoCs have
a single interrupt, independent of the CPU. The driver handles this by
having a per CPU napi structure when there are per CPU interrupts, and
a global napi structure when there is a single interrupt.

When the napi core calls mvneta_poll(), it passes the napi
instance. This was not being propagated through the call chain, and
instead the per-cpu napi instance was passed to napi_gro_receive()
call. This breaks when there is a single global napi instance.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Fixes: 2636ac3cc2b4 ("net: mvneta: Add network support for Armada 3700 SoC")
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 0ad2f3f7da85..ec84db47d82d 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1901,10 +1901,10 @@ static void mvneta_rxq_drop_pkts(struct mvneta_port *pp,
 }
 
 /* Main rx processing when using software buffer management */
-static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_swbm(struct napi_struct *napi,
+			  struct mvneta_port *pp, int rx_todo,
 			  struct mvneta_rx_queue *rxq)
 {
-	struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
 	struct net_device *dev = pp->dev;
 	int rx_done;
 	u32 rcvd_pkts = 0;
@@ -1959,7 +1959,7 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 
 			skb->protocol = eth_type_trans(skb, dev);
 			mvneta_rx_csum(pp, rx_status, skb);
-			napi_gro_receive(&port->napi, skb);
+			napi_gro_receive(napi, skb);
 
 			rcvd_pkts++;
 			rcvd_bytes += rx_bytes;
@@ -2001,7 +2001,7 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 
 		mvneta_rx_csum(pp, rx_status, skb);
 
-		napi_gro_receive(&port->napi, skb);
+		napi_gro_receive(napi, skb);
 	}
 
 	if (rcvd_pkts) {
@@ -2020,10 +2020,10 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
 }
 
 /* Main rx processing when using hardware buffer management */
-static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_hwbm(struct napi_struct *napi,
+			  struct mvneta_port *pp, int rx_todo,
 			  struct mvneta_rx_queue *rxq)
 {
-	struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
 	struct net_device *dev = pp->dev;
 	int rx_done;
 	u32 rcvd_pkts = 0;
@@ -2085,7 +2085,7 @@ static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
 
 			skb->protocol = eth_type_trans(skb, dev);
 			mvneta_rx_csum(pp, rx_status, skb);
-			napi_gro_receive(&port->napi, skb);
+			napi_gro_receive(napi, skb);
 
 			rcvd_pkts++;
 			rcvd_bytes += rx_bytes;
@@ -2129,7 +2129,7 @@ static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
 
 		mvneta_rx_csum(pp, rx_status, skb);
 
-		napi_gro_receive(&port->napi, skb);
+		napi_gro_receive(napi, skb);
 	}
 
 	if (rcvd_pkts) {
@@ -2722,9 +2722,11 @@ static int mvneta_poll(struct napi_struct *napi, int budget)
 	if (rx_queue) {
 		rx_queue = rx_queue - 1;
 		if (pp->bm_priv)
-			rx_done = mvneta_rx_hwbm(pp, budget, &pp->rxqs[rx_queue]);
+			rx_done = mvneta_rx_hwbm(napi, pp, budget,
+						 &pp->rxqs[rx_queue]);
 		else
-			rx_done = mvneta_rx_swbm(pp, budget, &pp->rxqs[rx_queue]);
+			rx_done = mvneta_rx_swbm(napi, pp, budget,
+						 &pp->rxqs[rx_queue]);
 	}
 
 	if (rx_done < budget) {
-- 
2.17.1


From 6d0ca4da21a9107c075b8492de33d31751e8b8f6 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Fri, 10 Aug 2018 11:36:27 +0800
Subject: [PATCH 8/9] net: mvneta: fix mvneta_config_rss on armada 3700

[ Upstream commit 0f5c6c30a0f8c629b92ecdaef61b315c43fde10a ]

The mvneta Ethernet driver is used on a few different Marvell SoCs.
Some SoCs have per cpu interrupts for Ethernet events, the driver uses
a per CPU napi structure for this case. Some SoCs such as armada 3700
have a single interrupt for Ethernet events, the driver uses a global
napi structure for this case.

Current mvneta_config_rss() always operates the per cpu napi structure.
Fix it by operating a global napi for "single interrupt" case, and per
cpu napi structure for remaining cases.

Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Fixes: 2636ac3cc2b4 ("net: mvneta: Add network support for Armada 3700 SoC")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 31 +++++++++++++++++----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index ec84db47d82d..82ac1d10f239 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4020,13 +4020,18 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 
 	on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
 
-	/* We have to synchronise on the napi of each CPU */
-	for_each_online_cpu(cpu) {
-		struct mvneta_pcpu_port *pcpu_port =
-			per_cpu_ptr(pp->ports, cpu);
+	if (!pp->neta_armada3700) {
+		/* We have to synchronise on the napi of each CPU */
+		for_each_online_cpu(cpu) {
+			struct mvneta_pcpu_port *pcpu_port =
+				per_cpu_ptr(pp->ports, cpu);
 
-		napi_synchronize(&pcpu_port->napi);
-		napi_disable(&pcpu_port->napi);
+			napi_synchronize(&pcpu_port->napi);
+			napi_disable(&pcpu_port->napi);
+		}
+	} else {
+		napi_synchronize(&pp->napi);
+		napi_disable(&pp->napi);
 	}
 
 	pp->rxq_def = pp->indir[0];
@@ -4043,12 +4048,16 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 	mvneta_percpu_elect(pp);
 	spin_unlock(&pp->lock);
 
-	/* We have to synchronise on the napi of each CPU */
-	for_each_online_cpu(cpu) {
-		struct mvneta_pcpu_port *pcpu_port =
-			per_cpu_ptr(pp->ports, cpu);
+	if (!pp->neta_armada3700) {
+		/* We have to synchronise on the napi of each CPU */
+		for_each_online_cpu(cpu) {
+			struct mvneta_pcpu_port *pcpu_port =
+				per_cpu_ptr(pp->ports, cpu);
 
-		napi_enable(&pcpu_port->napi);
+			napi_enable(&pcpu_port->napi);
+		}
+	} else {
+		napi_enable(&pp->napi);
 	}
 
 	netif_tx_start_all_queues(pp->dev);
-- 
2.17.1


From a71409b7c7d2c9654af6eb791776670598541303 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 14 Aug 2018 17:28:26 +0800
Subject: [PATCH 9/9] cls_matchall: fix tcf_unbind_filter missing

[ Upstream commit a51c76b4dfb30496dc65396a957ef0f06af7fb22 ]

Fix tcf_unbind_filter missing in cls_matchall as this will trigger
WARN_ON() in cbq_destroy_class().

Fixes: fd62d9f5c575f ("net/sched: matchall: Fix configuration race")
Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 47b207ef7762..7ad65daf66a4 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -111,6 +111,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	if (!head)
 		return;
 
+	tcf_unbind_filter(tp, &head->res);
+
 	if (!tc_skip_hw(head->flags))
 		mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [GIT] Networking
  2018-05-22 17:49 David Miller
@ 2018-05-22 18:13 ` Greg KH
  0 siblings, 0 replies; 10+ messages in thread
From: Greg KH @ 2018-05-22 18:13 UTC (permalink / raw)
  To: David Miller; +Cc: stable

On Tue, May 22, 2018 at 01:49:52PM -0400, David Miller wrote:
> 
> Please queue up the following networking bug fixes for v4.14 and v4.16
> -stable, respectively.

All now queued up, thanks!

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [GIT] Networking
@ 2018-05-22 17:49 David Miller
  2018-05-22 18:13 ` Greg KH
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2018-05-22 17:49 UTC (permalink / raw)
  To: stable

[-- Attachment #1: Type: Text/Plain, Size: 107 bytes --]


Please queue up the following networking bug fixes for v4.14 and v4.16
-stable, respectively.

Thank you.

[-- Attachment #2: net_414.mbox --]
[-- Type: Application/Octet-Stream, Size: 109943 bytes --]

From a30dc9a846f6af8b8d8cd9eca1e347a7be0eb801 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 17 May 2018 14:50:44 -0700
Subject: [PATCH 01/34] net: Fix a bug in removing queues from XPS map

[ Upstream commit 6358d49ac23995fdfe157cc8747ab0f274d3954b ]

While removing queues from the XPS map, the individual CPU ID
alone was used to index the CPUs map, this should be changed to also
factor in the traffic class mapping for the CPU-to-queue lookup.

Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index e7d56c5adde6..6ca771f2f25b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2058,7 +2058,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 		int i, j;
 
 		for (i = count, j = offset; i--; j++) {
-			if (!remove_xps_queue(dev_maps, cpu, j))
+			if (!remove_xps_queue(dev_maps, tci, j))
 				break;
 		}
 
-- 
2.14.3


From ec63acc4955ad649175b4585a4c1d45e2db8c5da Mon Sep 17 00:00:00 2001
From: Tarick Bedeir <tarick@google.com>
Date: Sun, 13 May 2018 16:38:45 -0700
Subject: [PATCH 02/34] net/mlx4_core: Fix error handling in
 mlx4_init_port_info.

[ Upstream commit 57f6f99fdad9984801cde05c1db68fe39b474a10 ]

Avoid exiting the function with a lingering sysfs file (if the first
call to device_create_file() fails while the second succeeds), and avoid
calling devlink_port_unregister() twice.

In other words, either mlx4_init_port_info() succeeds and returns zero, or
it fails, returns non-zero, and requires no cleanup.

Fixes: 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB ports")
Signed-off-by: Tarick Bedeir <tarick@google.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index e61c99ef741d..c273a3ebb8e8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3007,6 +3007,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 		mlx4_err(dev, "Failed to create file for port %d\n", port);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
 	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
@@ -3028,9 +3029,10 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 				   &info->port_attr);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
-	return err;
+	return 0;
 }
 
 static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
-- 
2.14.3


From ffb5f2263becaaf888f5a186e5525d3699d13ac6 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 16 May 2018 12:54:29 +0200
Subject: [PATCH 03/34] net/sched: fix refcnt leak in the error path of
 tcf_vlan_init()

[ Upstream commit 5a4931ae0193f8a4a97e8260fd0df1d705d83299 ]

Similarly to what was done with commit a52956dfc503 ("net sched actions:
fix refcnt leak in skbmod"), fix the error path of tcf_vlan_init() to avoid
refcnt leaks when wrong value of TCA_VLAN_PUSH_VLAN_PROTOCOL is given.

Fixes: 5026c9b1bafc ("net sched: vlan action fix late binding")
CC: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_vlan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 16eb067a8d8f..5c10a0fce35b 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -154,6 +154,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			case htons(ETH_P_8021AD):
 				break;
 			default:
+				if (exists)
+					tcf_idr_release(*a, bind);
 				return -EPROTONOSUPPORT;
 			}
 		} else {
-- 
2.14.3


From 49a1a21859a30f6623426667fa768994dcdc8a40 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 18 May 2018 14:51:44 +0200
Subject: [PATCH 04/34] net: sched: red: avoid hashing NULL child

[ Upstream commit 44a63b137f7b6e4c7bd6c9cc21615941cb36509d ]

Hangbin reported an Oops triggered by the syzkaller qdisc rules:

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault: 0000 [#1] SMP KASAN PTI
 Modules linked in: sch_red
 CPU: 0 PID: 28699 Comm: syz-executor5 Not tainted 4.17.0-rc4.kcov #1
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:qdisc_hash_add+0x26/0xa0
 RSP: 0018:ffff8800589cf470 EFLAGS: 00010203
 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff824ad971
 RDX: 0000000000000007 RSI: ffffc9000ce9f000 RDI: 000000000000003c
 RBP: 0000000000000001 R08: ffffed000b139ea2 R09: ffff8800589cf4f0
 R10: ffff8800589cf50f R11: ffffed000b139ea2 R12: ffff880054019fc0
 R13: ffff880054019fb4 R14: ffff88005c0af600 R15: ffff880054019fb0
 FS:  00007fa6edcb1700(0000) GS:ffff88005ce00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000020000740 CR3: 000000000fc16000 CR4: 00000000000006f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  red_change+0x2d2/0xed0 [sch_red]
  qdisc_create+0x57e/0xef0
  tc_modify_qdisc+0x47f/0x14e0
  rtnetlink_rcv_msg+0x6a8/0x920
  netlink_rcv_skb+0x2a2/0x3c0
  netlink_unicast+0x511/0x740
  netlink_sendmsg+0x825/0xc30
  sock_sendmsg+0xc5/0x100
  ___sys_sendmsg+0x778/0x8e0
  __sys_sendmsg+0xf5/0x1b0
  do_syscall_64+0xbd/0x3b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x450869
 RSP: 002b:00007fa6edcb0c48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007fa6edcb16b4 RCX: 0000000000450869
 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000013
 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
 R13: 0000000000008778 R14: 0000000000702838 R15: 00007fa6edcb1700
 Code: e9 0b fe ff ff 0f 1f 44 00 00 55 53 48 89 fb 89 f5 e8 3f 07 f3 fe 48 8d 7b 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 51
 RIP: qdisc_hash_add+0x26/0xa0 RSP: ffff8800589cf470

When a red qdisc is updated with a 0 limit, the child qdisc is left
unmodified, no additional scheduler is created in red_change(),
the 'child' local variable is rightfully NULL and must not add it
to the hash table.

This change addresses the above issue moving qdisc_hash_add() right
after the child qdisc creation. It additionally removes unneeded checks
for noop_qdisc.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the dump")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 5 +++--
 net/sched/sch_tbf.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d87c41e82917..c453b8d81c9e 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -191,10 +191,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
-	}
 
-	if (child != &noop_qdisc)
+		/* child is fifo, no need to check for noop_qdisc */
 		qdisc_hash_add(child, true);
+	}
+
 	sch_tree_lock(sch);
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 120f4f365967..b36ecb58aa6e 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -388,6 +388,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 			err = PTR_ERR(child);
 			goto done;
 		}
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
 	sch_tree_lock(sch);
@@ -396,8 +399,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
-		if (child != &noop_qdisc)
-			qdisc_hash_add(child, true);
 	}
 	q->limit = qopt->limit;
 	if (tb[TCA_TBF_PBURST])
-- 
2.14.3


From 763ffb6694d18183dd9f0830b2db488e8256bc85 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 May 2018 17:01:30 -0700
Subject: [PATCH 05/34] net/smc: check for missing nlattrs in SMC_PNETID
 messages

[ Upstream commit d49baa7e12ee70c0a7b821d088a770c94c02e494 ]

It's possible to crash the kernel in several different ways by sending
messages to the SMC_PNETID generic netlink family that are missing the
expected attributes:

- Missing SMC_PNETID_NAME => null pointer dereference when comparing
  names.
- Missing SMC_PNETID_ETHNAME => null pointer dereference accessing
  smc_pnetentry::ndev.
- Missing SMC_PNETID_IBNAME => null pointer dereference accessing
  smc_pnetentry::smcibdev.
- Missing SMC_PNETID_IBPORT => out of bounds array access to
  smc_ib_device::pattr[-1].

Fix it by validating that all expected attributes are present and that
SMC_PNETID_IBPORT is nonzero.

Reported-by: syzbot+5cd61039dc9b8bfa6e47@syzkaller.appspotmail.com
Fixes: 6812baabf24d ("smc: establish pnet table management")
Cc: <stable@vger.kernel.org> # v4.11+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_pnet.c | 71 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 74568cdbca70..d7b88b2d1b22 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -245,40 +245,45 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
 static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
 			       struct nlattr *tb[])
 {
-	char *string, *ibname = NULL;
-	int rc = 0;
+	char *string, *ibname;
+	int rc;
 
 	memset(pnetelem, 0, sizeof(*pnetelem));
 	INIT_LIST_HEAD(&pnetelem->list);
-	if (tb[SMC_PNETID_NAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
-		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_ETHNAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
-		pnetelem->ndev = dev_get_by_name(net, string);
-		if (!pnetelem->ndev)
-			return -ENOENT;
-	}
-	if (tb[SMC_PNETID_IBNAME]) {
-		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
-		ibname = strim(ibname);
-		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
-		if (!pnetelem->smcibdev) {
-			rc = -ENOENT;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_IBPORT]) {
-		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
-		if (pnetelem->ib_port > SMC_MAX_PORTS) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_NAME])
+		goto error;
+	string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+	if (!smc_pnetid_valid(string, pnetelem->pnet_name))
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_ETHNAME])
+		goto error;
+	rc = -ENOENT;
+	string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+	pnetelem->ndev = dev_get_by_name(net, string);
+	if (!pnetelem->ndev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBNAME])
+		goto error;
+	rc = -ENOENT;
+	ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+	ibname = strim(ibname);
+	pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+	if (!pnetelem->smcibdev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBPORT])
+		goto error;
+	pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+	if (pnetelem->ib_port < 1 || pnetelem->ib_port > SMC_MAX_PORTS)
+		goto error;
+
 	return 0;
 
 error:
@@ -307,6 +312,8 @@ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
 	void *hdr;
 	int rc;
 
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	pnetelem = smc_pnet_find_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 	if (!pnetelem)
@@ -359,6 +366,8 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
 
 static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
 {
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	return smc_pnet_remove_by_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 }
-- 
2.14.3


From d7d22f21f5e5ab3252a3d9847f370f8debb7493e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 17 May 2018 13:13:29 -0400
Subject: [PATCH 06/34] net: test tailroom before appending to linear skb

[ Upstream commit 113f99c3358564a0647d444c2ae34e8b1abfd5b9 ]

Device features may change during transmission. In particular with
corking, a device may toggle scatter-gather in between allocating
and writing to an skb.

Do not unconditionally assume that !NETIF_F_SG at write time implies
that the same held at alloc time and thus the skb has sufficient
tailroom.

This issue predates git history.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e8e675be60ec..63d5d66e040a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1040,7 +1040,8 @@ static int __ip_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ffbb81609016..0f2d74885bcb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1488,7 +1488,8 @@ static int __ip6_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
-- 
2.14.3


From dbfd57ed60f451aa32e5fcd4901bc20ef95239d3 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 11 May 2018 13:24:25 -0400
Subject: [PATCH 07/34] packet: in packet_snd start writing at link layer
 allocation

[ Upstream commit b84bbaf7a6c8cca24f8acf25a2c8e46913a947ba ]

Packet sockets allow construction of packets shorter than
dev->hard_header_len to accommodate protocols with variable length
link layer headers. These packets are padded to dev->hard_header_len,
because some device drivers interpret that as a minimum packet size.

packet_snd reserves dev->hard_header_len bytes on allocation.
SOCK_DGRAM sockets call skb_push in dev_hard_header() to ensure that
link layer headers are stored in the reserved range. SOCK_RAW sockets
do the same in tpacket_snd, but not in packet_snd.

Syzbot was able to send a zero byte packet to a device with massive
116B link layer header, causing padding to cross over into skb_shinfo.
Fix this by writing from the start of the llheader reserved range also
in the case of packet_snd/SOCK_RAW.

Update skb_set_network_header to the new offset. This also corrects
it for SOCK_DGRAM, where it incorrectly double counted reserve due to
the skb_push in dev_hard_header.

Fixes: 9ed988cd5915 ("packet: validate variable length ll headers")
Reported-by: syzbot+71d74a5406d02057d559@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3994b71f8197..8351faabba62 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2912,13 +2912,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (skb == NULL)
 		goto out_unlock;
 
-	skb_set_network_header(skb, reserve);
+	skb_reset_network_header(skb);
 
 	err = -EINVAL;
 	if (sock->type == SOCK_DGRAM) {
 		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
 		if (unlikely(offset < 0))
 			goto out_free;
+	} else if (reserve) {
+		skb_push(skb, reserve);
 	}
 
 	/* Returns -EFAULT on error */
-- 
2.14.3


From b7af3c4a36c17bdd262761e8b6ac640304215105 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 May 2018 04:47:55 -0700
Subject: [PATCH 08/34] sock_diag: fix use-after-free read in __sk_free

[ Upstream commit 9709020c86f6bf8439ca3effc58cfca49a5de192 ]

We must not call sock_diag_has_destroy_listeners(sk) on a socket
that has no reference on net structure.

BUG: KASAN: use-after-free in sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
BUG: KASAN: use-after-free in __sk_free+0x329/0x340 net/core/sock.c:1609
Read of size 8 at addr ffff88018a02e3a0 by task swapper/1/0

CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.17.0-rc5+ #54
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
 __sk_free+0x329/0x340 net/core/sock.c:1609
 sk_free+0x42/0x50 net/core/sock.c:1623
 sock_put include/net/sock.h:1664 [inline]
 reqsk_free include/net/request_sock.h:116 [inline]
 reqsk_put include/net/request_sock.h:124 [inline]
 inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:672 [inline]
 reqsk_timer_handler+0xe27/0x10e0 net/ipv4/inet_connection_sock.c:739
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
 </IRQ>
RIP: 0010:native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:54
RSP: 0018:ffff8801d9ae7c38 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: dffffc0000000000 RBX: 1ffff1003b35cf8a RCX: 0000000000000000
RDX: 1ffffffff11a30d0 RSI: 0000000000000001 RDI: ffffffff88d18680
RBP: ffff8801d9ae7c38 R08: ffffed003b5e46c3 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffff8801d9ae7cf0 R14: ffffffff897bef20 R15: 0000000000000000
 arch_safe_halt arch/x86/include/asm/paravirt.h:94 [inline]
 default_idle+0xc2/0x440 arch/x86/kernel/process.c:354
 arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:345
 default_idle_call+0x6d/0x90 kernel/sched/idle.c:93
 cpuidle_idle_call kernel/sched/idle.c:153 [inline]
 do_idle+0x395/0x560 kernel/sched/idle.c:262
 cpu_startup_entry+0x104/0x120 kernel/sched/idle.c:368
 start_secondary+0x426/0x5b0 arch/x86/kernel/smpboot.c:269
 secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242

Allocated by task 4557:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 kmem_cache_zalloc include/linux/slab.h:691 [inline]
 net_alloc net/core/net_namespace.c:383 [inline]
 copy_net_ns+0x159/0x4c0 net/core/net_namespace.c:423
 create_new_namespaces+0x69d/0x8f0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc3/0x1f0 kernel/nsproxy.c:206
 ksys_unshare+0x708/0xf90 kernel/fork.c:2408
 __do_sys_unshare kernel/fork.c:2476 [inline]
 __se_sys_unshare kernel/fork.c:2474 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2474
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 69:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 net_free net/core/net_namespace.c:399 [inline]
 net_drop_ns.part.14+0x11a/0x130 net/core/net_namespace.c:406
 net_drop_ns net/core/net_namespace.c:405 [inline]
 cleanup_net+0x6a1/0xb20 net/core/net_namespace.c:541
 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
 kthread+0x345/0x410 kernel/kthread.c:240
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412

The buggy address belongs to the object at ffff88018a02c140
 which belongs to the cache net_namespace of size 8832
The buggy address is located 8800 bytes inside of
 8832-byte region [ffff88018a02c140, ffff88018a02e3c0)
The buggy address belongs to the page:
page:ffffea0006280b00 count:1 mapcount:0 mapping:ffff88018a02c140 index:0x0 compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffff88018a02c140 0000000000000000 0000000100000001
raw: ffffea00062a1320 ffffea0006268020 ffff8801d9bdde40 0000000000000000
page dumped because: kasan: bad access detected

Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Craig Gallek <kraig@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ec6eb546b228..68d08ed5521e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1595,7 +1595,7 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
-	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
 		sock_diag_broadcast_destroy(sk);
 	else
 		sk_destruct(sk);
-- 
2.14.3


From 16a487ac303dbaefeca8459816ba471b56da3c89 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 May 2018 21:14:26 -0700
Subject: [PATCH 09/34] tcp: purge write queue in tcp_connect_init()

[ Upstream commit 7f582b248d0a86bae5788c548d7bb5bca6f7691a ]

syzkaller found a reliable way to crash the host, hitting a BUG()
in __tcp_retransmit_skb()

Malicous MSG_FASTOPEN is the root cause. We need to purge write queue
in tcp_connect_init() at the point we init snd_una/write_seq.

This patch also replaces the BUG() by a less intrusive WARN_ON_ONCE()

kernel BUG at net/ipv4/tcp_output.c:2837!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 5276 Comm: syz-executor0 Not tainted 4.17.0-rc3+ #51
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__tcp_retransmit_skb+0x2992/0x2eb0 net/ipv4/tcp_output.c:2837
RSP: 0000:ffff8801dae06ff8 EFLAGS: 00010206
RAX: ffff8801b9fe61c0 RBX: 00000000ffc18a16 RCX: ffffffff864e1a49
RDX: 0000000000000100 RSI: ffffffff864e2e12 RDI: 0000000000000005
RBP: ffff8801dae073a0 R08: ffff8801b9fe61c0 R09: ffffed0039c40dd2
R10: ffffed0039c40dd2 R11: ffff8801ce206e93 R12: 00000000421eeaad
R13: ffff8801ce206d4e R14: ffff8801ce206cc0 R15: ffff8801cd4f4a80
FS:  0000000000000000(0000) GS:ffff8801dae00000(0063) knlGS:00000000096bc900
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020000000 CR3: 00000001c47b6000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 tcp_retransmit_skb+0x2e/0x250 net/ipv4/tcp_output.c:2923
 tcp_retransmit_timer+0xc50/0x3060 net/ipv4/tcp_timer.c:488
 tcp_write_timer_handler+0x339/0x960 net/ipv4/tcp_timer.c:573
 tcp_write_timer+0x111/0x1d0 net/ipv4/tcp_timer.c:593
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863

Fixes: cf60af03ca4e ("net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 83d11cd2eb65..abae5196cd3a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2814,8 +2814,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		return -EBUSY;
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
-		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			BUG();
+		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 			return -ENOMEM;
 	}
@@ -3312,6 +3314,7 @@ static void tcp_connect_init(struct sock *sk)
 	sock_reset_flag(sk, SOCK_DONE);
 	tp->snd_wnd = 0;
 	tcp_init_wl(tp, 0);
+	tcp_write_queue_purge(sk);
 	tp->snd_una = tp->write_seq;
 	tp->snd_sml = tp->write_seq;
 	tp->snd_up = tp->write_seq;
-- 
2.14.3


From 52370239e7a32c669d183beaf45831fd91d5faa4 Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:34 -0400
Subject: [PATCH 10/34] vmxnet3: set the DMA mask before the first DMA map
 operation

[ Upstream commit 61aeecea40afb2b89933e27cd4adb10fc2e75cfd ]

The DMA mask must be set before, not after, the first DMA map operation, or
the first DMA map operation could in theory fail on some systems.

Fixes: b0eb57cb97e78 ("VMXNET3: Add support for virtual IOMMU")
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index cf95290b160c..23cd3a3c4ab3 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2675,7 +2675,7 @@ vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
 /* ==================== initialization and cleanup routines ============ */
 
 static int
-vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter)
 {
 	int err;
 	unsigned long mmio_start, mmio_len;
@@ -2687,30 +2687,12 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 		return err;
 	}
 
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
-		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_consistent_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = true;
-	} else {
-		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = false;
-	}
-
 	err = pci_request_selected_regions(pdev, (1 << 2) - 1,
 					   vmxnet3_driver_name);
 	if (err) {
 		dev_err(&pdev->dev,
 			"Failed to request region for adapter: error %d\n", err);
-		goto err_set_mask;
+		goto err_enable_device;
 	}
 
 	pci_set_master(pdev);
@@ -2738,7 +2720,7 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 	iounmap(adapter->hw_addr0);
 err_ioremap:
 	pci_release_selected_regions(pdev, (1 << 2) - 1);
-err_set_mask:
+err_enable_device:
 	pci_disable_device(pdev);
 	return err;
 }
@@ -3243,7 +3225,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 #endif
 	};
 	int err;
-	bool dma64 = false; /* stupid gcc */
+	bool dma64;
 	u32 ver;
 	struct net_device *netdev;
 	struct vmxnet3_adapter *adapter;
@@ -3289,6 +3271,24 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	adapter->rx_ring_size = VMXNET3_DEF_RX_RING_SIZE;
 	adapter->rx_ring2_size = VMXNET3_DEF_RX_RING2_SIZE;
 
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_consistent_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = true;
+	} else {
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = false;
+	}
+
 	spin_lock_init(&adapter->cmd_lock);
 	adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter,
 					     sizeof(struct vmxnet3_adapter),
@@ -3296,7 +3296,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	if (dma_mapping_error(&adapter->pdev->dev, adapter->adapter_pa)) {
 		dev_err(&pdev->dev, "Failed to map dma\n");
 		err = -EFAULT;
-		goto err_dma_map;
+		goto err_set_mask;
 	}
 	adapter->shared = dma_alloc_coherent(
 				&adapter->pdev->dev,
@@ -3347,7 +3347,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	}
 #endif /* VMXNET3_RSS */
 
-	err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+	err = vmxnet3_alloc_pci_resources(adapter);
 	if (err < 0)
 		goto err_alloc_pci;
 
@@ -3493,7 +3493,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 err_alloc_shared:
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
-err_dma_map:
+err_set_mask:
 	free_netdev(netdev);
 	return err;
 }
-- 
2.14.3


From 9e7b1da219e1a6b266dc07d7960ffb866ce143e2 Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:49 -0400
Subject: [PATCH 11/34] vmxnet3: use DMA memory barriers where required

[ Upstream commit f3002c1374fb2367c9d8dbb28852791ef90d2bac ]

The gen bits must be read first from (resp. written last to) DMA memory.
The proper way to enforce this on Linux is to call dma_rmb() (resp.
dma_wmb()).

Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 23cd3a3c4ab3..3628fd7e606f 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -369,6 +369,11 @@ vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq,
 
 	gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
 	while (VMXNET3_TCD_GET_GEN(&gdesc->tcd) == tq->comp_ring.gen) {
+		/* Prevent any &gdesc->tcd field from being (speculatively)
+		 * read before (&gdesc->tcd)->gen is read.
+		 */
+		dma_rmb();
+
 		completed += vmxnet3_unmap_pkt(VMXNET3_TCD_GET_TXIDX(
 					       &gdesc->tcd), tq, adapter->pdev,
 					       adapter);
@@ -1099,6 +1104,11 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 		gdesc->txd.tci = skb_vlan_tag_get(skb);
 	}
 
+	/* Ensure that the write to (&gdesc->txd)->gen will be observed after
+	 * all other writes to &gdesc->txd.
+	 */
+	dma_wmb();
+
 	/* finally flips the GEN bit of the SOP desc. */
 	gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^
 						  VMXNET3_TXD_GEN);
@@ -1286,6 +1296,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			 */
 			break;
 		}
+
+		/* Prevent any rcd field from being (speculatively) read before
+		 * rcd->gen is read.
+		 */
+		dma_rmb();
+
 		BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2 &&
 		       rcd->rqID != rq->dataRingQid);
 		idx = rcd->rxdIdx;
@@ -1515,6 +1531,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		ring->next2comp = idx;
 		num_to_alloc = vmxnet3_cmd_ring_desc_avail(ring);
 		ring = rq->rx_ring + ring_idx;
+
+		/* Ensure that the writes to rxd->gen bits will be observed
+		 * after all other writes to rxd objects.
+		 */
+		dma_wmb();
+
 		while (num_to_alloc) {
 			vmxnet3_getRxDesc(rxd, &ring->base[ring->next2fill].rxd,
 					  &rxCmdDesc);
-- 
2.14.3


From 30bbd3783da7f41d9df1e81232b932765bbcffb4 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:00 -0700
Subject: [PATCH 12/34] hv_netvsc: Fix the real number of queues of non-vRSS
 cases

[ Commit 6450f8f269a9271985e4a8c13920b7e4cf21c0f3 upstream. ]

For older hosts without multi-channel (vRSS) support, and some error
cases, we still need to set the real number of queues to one.
This patch adds this missing setting.

Fixes: 8195b1396ec8 ("hv_netvsc: fix deadlock on hotplug")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 444e560d928b..dff93f682435 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1932,6 +1932,12 @@ static int netvsc_probe(struct hv_device *dev,
 	/* We always need headroom for rndis header */
 	net->needed_headroom = RNDIS_AND_PPI_SIZE;
 
+	/* Initialize the number of queues to be 1, we may change it if more
+	 * channels are offered later.
+	 */
+	netif_set_real_num_tx_queues(net, 1);
+	netif_set_real_num_rx_queues(net, 1);
+
 	/* Notify the netvsc driver of the new device */
 	memset(&device_info, 0, sizeof(device_info));
 	device_info.ring_size = ring_size;
-- 
2.14.3


From 5ee4a89b6afc668941ed6ba692929dd10904c00b Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:01 -0700
Subject: [PATCH 13/34] hv_netvsc: Rename ind_table to rx_table

[ Commit 47371300dfc269dd8d150e5b872bdbbda98ba809 upstream. ]

Rename this variable because it is the Receive indirection
table.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/netvsc_drv.c   | 4 ++--
 drivers/net/hyperv/rndis_filter.c | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 4f3afcf92a7c..8144b938460f 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -179,7 +179,7 @@ struct rndis_device {
 
 	u8 hw_mac_adr[ETH_ALEN];
 	u8 rss_key[NETVSC_HASH_KEYLEN];
-	u16 ind_table[ITAB_NUM];
+	u16 rx_table[ITAB_NUM];
 };
 
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index dff93f682435..25276fbf31cd 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1378,7 +1378,7 @@ static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 	rndis_dev = ndev->extension;
 	if (indir) {
 		for (i = 0; i < ITAB_NUM; i++)
-			indir[i] = rndis_dev->ind_table[i];
+			indir[i] = rndis_dev->rx_table[i];
 	}
 
 	if (key)
@@ -1408,7 +1408,7 @@ static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
 				return -EINVAL;
 
 		for (i = 0; i < ITAB_NUM; i++)
-			rndis_dev->ind_table[i] = indir[i];
+			rndis_dev->rx_table[i] = indir[i];
 	}
 
 	if (!key) {
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 065b204d8e17..addf9f69c58c 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -759,7 +759,7 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
 	/* Set indirection table entries */
 	itab = (u32 *)(rssp + 1);
 	for (i = 0; i < ITAB_NUM; i++)
-		itab[i] = rdev->ind_table[i];
+		itab[i] = rdev->rx_table[i];
 
 	/* Set hask key values */
 	keyp = (u8 *)((unsigned long)rssp + rssp->kashkey_offset);
@@ -1284,8 +1284,8 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	net_device->num_chn = min(net_device->max_chn, device_info->num_chn);
 
 	for (i = 0; i < ITAB_NUM; i++)
-		rndis_device->ind_table[i] = ethtool_rxfh_indir_default(i,
-							net_device->num_chn);
+		rndis_device->rx_table[i] = ethtool_rxfh_indir_default(
+						i, net_device->num_chn);
 
 	atomic_set(&net_device->open_chn, 1);
 	vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open);
-- 
2.14.3


From 2e3aa36833a83600d4176cdded58c7bac7222456 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:02 -0700
Subject: [PATCH 14/34] hv_netvsc: Rename tx_send_table to tx_table

[ Commit 39e91cfbf6f5fb26ba64cc2e8874372baf1671e7 upstream. ]

Simplify the variable name: tx_send_table

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h | 2 +-
 drivers/net/hyperv/netvsc.c     | 2 +-
 drivers/net/hyperv/netvsc_drv.c | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 8144b938460f..94de7ca1e52e 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -734,7 +734,7 @@ struct net_device_context {
 
 	u32 tx_checksum_mask;
 
-	u32 tx_send_table[VRSS_SEND_TAB_SIZE];
+	u32 tx_table[VRSS_SEND_TAB_SIZE];
 
 	/* Ethtool settings */
 	bool udp4_l4_hash;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index a6bafcf55776..03b44ec805db 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1107,7 +1107,7 @@ static void netvsc_send_table(struct hv_device *hdev,
 		      nvmsg->msg.v5_msg.send_table.offset);
 
 	for (i = 0; i < count; i++)
-		net_device_ctx->tx_send_table[i] = tab[i];
+		net_device_ctx->tx_table[i] = tab[i];
 }
 
 static void netvsc_send_vf(struct net_device_context *net_device_ctx,
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 25276fbf31cd..f7cc2bb1f3e5 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -234,8 +234,8 @@ static inline int netvsc_get_tx_queue(struct net_device *ndev,
 	struct sock *sk = skb->sk;
 	int q_idx;
 
-	q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) &
-				   (VRSS_SEND_TAB_SIZE - 1)];
+	q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
+			      (VRSS_SEND_TAB_SIZE - 1)];
 
 	/* If queue index changed record the new value */
 	if (q_idx != old_idx &&
-- 
2.14.3


From 2177e024a644a030108cdf614f17a20367cba6f6 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:03 -0700
Subject: [PATCH 15/34] hv_netvsc: Add initialization of tx_table in
 netvsc_device_add()

[ Commit 6b0cbe315868d613123cf387052ccda5f09d49ea upstream. ]

tx_table is part of the private data of kernel net_device. It is only
zero-ed out when allocating net_device.

We may recreate netvsc_device w/o recreating net_device, so the private
netdev data, including tx_table, are not zeroed. It may contain channel
numbers for the older netvsc_device.

This patch adds initialization of tx_table each time we recreate
netvsc_device.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 03b44ec805db..73999214d444 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1252,6 +1252,9 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 	if (!net_device)
 		return ERR_PTR(-ENOMEM);
 
+	for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
+		net_device_ctx->tx_table[i] = 0;
+
 	net_device->ring_size = ring_size;
 
 	/* Because the device uses NAPI, all the interrupt batching and
-- 
2.14.3


From 2f3e80cfe6c047c4f200d8675b1516f947345b9e Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:04 -0700
Subject: [PATCH 16/34] hv_netvsc: Set tx_table to equal weight after
 subchannels open

[ Commit a6fb6aa3cfa9047b62653dbcfc9bcde6e2272b41 upstream. ]

In some cases, like internal vSwitch, the host doesn't provide
send indirection table updates. This patch sets the table to be
equal weight after subchannels are all open. Otherwise, all workload
will be on one TX channel.

As tested, this patch has largely increased the throughput over
internal vSwitch.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index addf9f69c58c..0648eebda829 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1114,6 +1114,9 @@ void rndis_set_subchannel(struct work_struct *w)
 	netif_set_real_num_tx_queues(ndev, nvdev->num_chn);
 	netif_set_real_num_rx_queues(ndev, nvdev->num_chn);
 
+	for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
+		ndev_ctx->tx_table[i] = i % nvdev->num_chn;
+
 	rtnl_unlock();
 	return;
 
-- 
2.14.3


From 4589a89827152b565808fef026f0fca7528e88b2 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 14 May 2018 15:32:05 -0700
Subject: [PATCH 17/34] hv_netvsc: netvsc_teardown_gpadl() split

[ Commit 0cf737808ae7cb25e952be619db46b9147a92f46 upstream. ]

It was found that in some cases host refuses to teardown GPADL for send/
receive buffers (probably when some work with these buffere is scheduled or
ongoing). Change the teardown logic to be:
1) Send NVSP_MSG1_TYPE_REVOKE_* messages
2) Close the channel
3) Teardown GPADLs.
This seems to work reliably.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 69 +++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 73999214d444..4bc8a1d529d9 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -100,12 +100,11 @@ static void free_netvsc_device_rcu(struct netvsc_device *nvdev)
 	call_rcu(&nvdev->rcu, free_netvsc_device);
 }
 
-static void netvsc_destroy_buf(struct hv_device *device)
+static void netvsc_revoke_buf(struct hv_device *device,
+			      struct netvsc_device *net_device)
 {
 	struct nvsp_message *revoke_packet;
 	struct net_device *ndev = hv_get_drvdata(device);
-	struct net_device_context *ndc = netdev_priv(ndev);
-	struct netvsc_device *net_device = rtnl_dereference(ndc->nvdev);
 	int ret;
 
 	/*
@@ -148,28 +147,6 @@ static void netvsc_destroy_buf(struct hv_device *device)
 		net_device->recv_section_cnt = 0;
 	}
 
-	/* Teardown the gpadl on the vsp end */
-	if (net_device->recv_buf_gpadl_handle) {
-		ret = vmbus_teardown_gpadl(device->channel,
-					   net_device->recv_buf_gpadl_handle);
-
-		/* If we failed here, we might as well return and have a leak
-		 * rather than continue and a bugchk
-		 */
-		if (ret != 0) {
-			netdev_err(ndev,
-				   "unable to teardown receive buffer's gpadl\n");
-			return;
-		}
-		net_device->recv_buf_gpadl_handle = 0;
-	}
-
-	if (net_device->recv_buf) {
-		/* Free up the receive buffer */
-		vfree(net_device->recv_buf);
-		net_device->recv_buf = NULL;
-	}
-
 	/* Deal with the send buffer we may have setup.
 	 * If we got a  send section size, it means we received a
 	 * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent
@@ -210,7 +187,35 @@ static void netvsc_destroy_buf(struct hv_device *device)
 		}
 		net_device->send_section_cnt = 0;
 	}
-	/* Teardown the gpadl on the vsp end */
+}
+
+static void netvsc_teardown_gpadl(struct hv_device *device,
+				  struct netvsc_device *net_device)
+{
+	struct net_device *ndev = hv_get_drvdata(device);
+	int ret;
+
+	if (net_device->recv_buf_gpadl_handle) {
+		ret = vmbus_teardown_gpadl(device->channel,
+					   net_device->recv_buf_gpadl_handle);
+
+		/* If we failed here, we might as well return and have a leak
+		 * rather than continue and a bugchk
+		 */
+		if (ret != 0) {
+			netdev_err(ndev,
+				   "unable to teardown receive buffer's gpadl\n");
+			return;
+		}
+		net_device->recv_buf_gpadl_handle = 0;
+	}
+
+	if (net_device->recv_buf) {
+		/* Free up the receive buffer */
+		vfree(net_device->recv_buf);
+		net_device->recv_buf = NULL;
+	}
+
 	if (net_device->send_buf_gpadl_handle) {
 		ret = vmbus_teardown_gpadl(device->channel,
 					   net_device->send_buf_gpadl_handle);
@@ -425,7 +430,8 @@ static int netvsc_init_buf(struct hv_device *device,
 	goto exit;
 
 cleanup:
-	netvsc_destroy_buf(device);
+	netvsc_revoke_buf(device, net_device);
+	netvsc_teardown_gpadl(device, net_device);
 
 exit:
 	return ret;
@@ -544,11 +550,6 @@ static int netvsc_connect_vsp(struct hv_device *device,
 	return ret;
 }
 
-static void netvsc_disconnect_vsp(struct hv_device *device)
-{
-	netvsc_destroy_buf(device);
-}
-
 /*
  * netvsc_device_remove - Callback when the root bus device is removed
  */
@@ -562,7 +563,7 @@ void netvsc_device_remove(struct hv_device *device)
 
 	cancel_work_sync(&net_device->subchan_work);
 
-	netvsc_disconnect_vsp(device);
+	netvsc_revoke_buf(device, net_device);
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
 
@@ -575,6 +576,8 @@ void netvsc_device_remove(struct hv_device *device)
 	/* Now, we can close the channel safely */
 	vmbus_close(device->channel);
 
+	netvsc_teardown_gpadl(device, net_device);
+
 	/* And dissassociate NAPI context from device */
 	for (i = 0; i < net_device->num_chn; i++)
 		netif_napi_del(&net_device->chan_table[i].napi);
-- 
2.14.3


From 2d0e4490de199d74d50381dc75727c6d0445051a Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 14 May 2018 15:32:06 -0700
Subject: [PATCH 18/34] hv_netvsc: preserve hw_features on
 mtu/channels/ringparam changes

[ Commit aefd80e874e98a864915df5b7d90824a4340b450 upstream. ]

rndis_filter_device_add() is called both from netvsc_probe() when we
initially create the device and from set channels/mtu/ringparam
routines where we basically remove the device and add it back.

hw_features is reset in rndis_filter_device_add() and filled with
host data. However, we lose all additional flags which are set outside
of the driver, e.g. register_netdevice() adds NETIF_F_SOFT_FEATURES and
many others.

Unfortunately, calls to rndis_{query_hwcaps(), _set_offload_params()}
calls cannot be avoided on every RNDIS reset: host expects us to set
required features explicitly. Moreover, in theory hardware capabilities
can change and we need to reflect the change in hw_features.

Reset net->hw_features bits according to host data in
rndis_netdev_set_hwcaps(), clear corresponding feature bits
from net->features in case some features went missing (will never happen
in real life I guess but let's be consistent).

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   |   4 ++
 drivers/net/hyperv/netvsc_drv.c   |   2 +-
 drivers/net/hyperv/rndis_filter.c | 136 ++++++++++++++++++++++----------------
 3 files changed, 83 insertions(+), 59 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 94de7ca1e52e..a3f628c3c9ed 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -659,6 +659,10 @@ struct nvsp_message {
 #define NETVSC_RECEIVE_BUFFER_ID		0xcafe
 #define NETVSC_SEND_BUFFER_ID			0
 
+#define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \
+				      NETIF_F_TSO | NETIF_F_IPV6_CSUM | \
+				      NETIF_F_TSO6)
+
 #define VRSS_SEND_TAB_SIZE 16  /* must be power of 2 */
 #define VRSS_CHANNEL_MAX 64
 #define VRSS_CHANNEL_DEFAULT 8
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f7cc2bb1f3e5..717899993933 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1956,7 +1956,7 @@ static int netvsc_probe(struct hv_device *dev,
 
 	memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
 
-	/* hw_features computed in rndis_filter_device_add */
+	/* hw_features computed in rndis_netdev_set_hwcaps() */
 	net->features = net->hw_features |
 		NETIF_F_HIGHDMA | NETIF_F_SG |
 		NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 0648eebda829..be57639bee29 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1131,69 +1131,20 @@ void rndis_set_subchannel(struct work_struct *w)
 	rtnl_unlock();
 }
 
-struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
-				      struct netvsc_device_info *device_info)
+static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device,
+				   struct netvsc_device *nvdev)
 {
-	struct net_device *net = hv_get_drvdata(dev);
+	struct net_device *net = rndis_device->ndev;
 	struct net_device_context *net_device_ctx = netdev_priv(net);
-	struct netvsc_device *net_device;
-	struct rndis_device *rndis_device;
 	struct ndis_offload hwcaps;
 	struct ndis_offload_params offloads;
-	struct ndis_recv_scale_cap rsscap;
-	u32 rsscap_size = sizeof(struct ndis_recv_scale_cap);
 	unsigned int gso_max_size = GSO_MAX_SIZE;
-	u32 mtu, size;
-	const struct cpumask *node_cpu_mask;
-	u32 num_possible_rss_qs;
-	int i, ret;
-
-	rndis_device = get_rndis_device();
-	if (!rndis_device)
-		return ERR_PTR(-ENODEV);
-
-	/*
-	 * Let the inner driver handle this first to create the netvsc channel
-	 * NOTE! Once the channel is created, we may get a receive callback
-	 * (RndisFilterOnReceive()) before this call is completed
-	 */
-	net_device = netvsc_device_add(dev, device_info);
-	if (IS_ERR(net_device)) {
-		kfree(rndis_device);
-		return net_device;
-	}
-
-	/* Initialize the rndis device */
-	net_device->max_chn = 1;
-	net_device->num_chn = 1;
-
-	net_device->extension = rndis_device;
-	rndis_device->ndev = net;
-
-	/* Send the rndis initialization message */
-	ret = rndis_filter_init_device(rndis_device, net_device);
-	if (ret != 0)
-		goto err_dev_remv;
-
-	/* Get the MTU from the host */
-	size = sizeof(u32);
-	ret = rndis_filter_query_device(rndis_device, net_device,
-					RNDIS_OID_GEN_MAXIMUM_FRAME_SIZE,
-					&mtu, &size);
-	if (ret == 0 && size == sizeof(u32) && mtu < net->mtu)
-		net->mtu = mtu;
-
-	/* Get the mac address */
-	ret = rndis_filter_query_device_mac(rndis_device, net_device);
-	if (ret != 0)
-		goto err_dev_remv;
-
-	memcpy(device_info->mac_adr, rndis_device->hw_mac_adr, ETH_ALEN);
+	int ret;
 
 	/* Find HW offload capabilities */
-	ret = rndis_query_hwcaps(rndis_device, net_device, &hwcaps);
+	ret = rndis_query_hwcaps(rndis_device, nvdev, &hwcaps);
 	if (ret != 0)
-		goto err_dev_remv;
+		return ret;
 
 	/* A value of zero means "no change"; now turn on what we want. */
 	memset(&offloads, 0, sizeof(struct ndis_offload_params));
@@ -1201,8 +1152,12 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	/* Linux does not care about IP checksum, always does in kernel */
 	offloads.ip_v4_csum = NDIS_OFFLOAD_PARAMETERS_TX_RX_DISABLED;
 
+	/* Reset previously set hw_features flags */
+	net->hw_features &= ~NETVSC_SUPPORTED_HW_FEATURES;
+	net_device_ctx->tx_checksum_mask = 0;
+
 	/* Compute tx offload settings based on hw capabilities */
-	net->hw_features = NETIF_F_RXCSUM;
+	net->hw_features |= NETIF_F_RXCSUM;
 
 	if ((hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_ALL_TCP4) == NDIS_TXCSUM_ALL_TCP4) {
 		/* Can checksum TCP */
@@ -1246,10 +1201,75 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 		}
 	}
 
+	/* In case some hw_features disappeared we need to remove them from
+	 * net->features list as they're no longer supported.
+	 */
+	net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features;
+
 	netif_set_gso_max_size(net, gso_max_size);
 
-	ret = rndis_filter_set_offload_params(net, net_device, &offloads);
-	if (ret)
+	ret = rndis_filter_set_offload_params(net, nvdev, &offloads);
+
+	return ret;
+}
+
+struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
+				      struct netvsc_device_info *device_info)
+{
+	struct net_device *net = hv_get_drvdata(dev);
+	struct netvsc_device *net_device;
+	struct rndis_device *rndis_device;
+	struct ndis_recv_scale_cap rsscap;
+	u32 rsscap_size = sizeof(struct ndis_recv_scale_cap);
+	u32 mtu, size;
+	const struct cpumask *node_cpu_mask;
+	u32 num_possible_rss_qs;
+	int i, ret;
+
+	rndis_device = get_rndis_device();
+	if (!rndis_device)
+		return ERR_PTR(-ENODEV);
+
+	/* Let the inner driver handle this first to create the netvsc channel
+	 * NOTE! Once the channel is created, we may get a receive callback
+	 * (RndisFilterOnReceive()) before this call is completed
+	 */
+	net_device = netvsc_device_add(dev, device_info);
+	if (IS_ERR(net_device)) {
+		kfree(rndis_device);
+		return net_device;
+	}
+
+	/* Initialize the rndis device */
+	net_device->max_chn = 1;
+	net_device->num_chn = 1;
+
+	net_device->extension = rndis_device;
+	rndis_device->ndev = net;
+
+	/* Send the rndis initialization message */
+	ret = rndis_filter_init_device(rndis_device, net_device);
+	if (ret != 0)
+		goto err_dev_remv;
+
+	/* Get the MTU from the host */
+	size = sizeof(u32);
+	ret = rndis_filter_query_device(rndis_device, net_device,
+					RNDIS_OID_GEN_MAXIMUM_FRAME_SIZE,
+					&mtu, &size);
+	if (ret == 0 && size == sizeof(u32) && mtu < net->mtu)
+		net->mtu = mtu;
+
+	/* Get the mac address */
+	ret = rndis_filter_query_device_mac(rndis_device, net_device);
+	if (ret != 0)
+		goto err_dev_remv;
+
+	memcpy(device_info->mac_adr, rndis_device->hw_mac_adr, ETH_ALEN);
+
+	/* Query and set hardware capabilities */
+	ret = rndis_netdev_set_hwcaps(rndis_device, net_device);
+	if (ret != 0)
 		goto err_dev_remv;
 
 	rndis_filter_query_device_link_status(rndis_device, net_device);
-- 
2.14.3


From 4d31526c79851ddda0ac9f4bad4b21167805da33 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:07 -0700
Subject: [PATCH 19/34] hv_netvsc: empty current transmit aggregation if flow
 blocked

[ Commit cfd8afd986cdb59ea9adac873c5082498a1eb7c0 upstream. ]

If the transmit queue is known full, then don't keep aggregating
data. And the cp_partial flag which indicates that the current
aggregation buffer is full can be folded in to avoid more
conditionals.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   |  2 +-
 drivers/net/hyperv/netvsc.c       | 36 +++++++++++++++++++++---------------
 drivers/net/hyperv/netvsc_drv.c   |  2 +-
 drivers/net/hyperv/rndis_filter.c |  3 +--
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index a3f628c3c9ed..fd51a329e36e 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -192,7 +192,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 					const struct netvsc_device_info *info);
 int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx);
 void netvsc_device_remove(struct hv_device *device);
-int netvsc_send(struct net_device_context *ndc,
+int netvsc_send(struct net_device *net,
 		struct hv_netvsc_packet *packet,
 		struct rndis_message *rndis_msg,
 		struct hv_page_buffer *page_buffer,
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 4bc8a1d529d9..22da6399b37a 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -700,13 +700,13 @@ static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
 	return NETVSC_INVALID_INDEX;
 }
 
-static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
-				   unsigned int section_index,
-				   u32 pend_size,
-				   struct hv_netvsc_packet *packet,
-				   struct rndis_message *rndis_msg,
-				   struct hv_page_buffer *pb,
-				   struct sk_buff *skb)
+static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
+				    unsigned int section_index,
+				    u32 pend_size,
+				    struct hv_netvsc_packet *packet,
+				    struct rndis_message *rndis_msg,
+				    struct hv_page_buffer *pb,
+				    bool xmit_more)
 {
 	char *start = net_device->send_buf;
 	char *dest = start + (section_index * net_device->send_section_size)
@@ -719,7 +719,8 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 		packet->page_buf_cnt;
 
 	/* Add padding */
-	if (skb->xmit_more && remain && !packet->cp_partial) {
+	remain = packet->total_data_buflen & (net_device->pkt_align - 1);
+	if (xmit_more && remain) {
 		padding = net_device->pkt_align - remain;
 		rndis_msg->msg_len += padding;
 		packet->total_data_buflen += padding;
@@ -739,8 +740,6 @@ static u32 netvsc_copy_to_send_buf(struct netvsc_device *net_device,
 		memset(dest, 0, padding);
 		msg_size += padding;
 	}
-
-	return msg_size;
 }
 
 static inline int netvsc_send_pkt(
@@ -828,12 +827,13 @@ static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
 }
 
 /* RCU already held by caller */
-int netvsc_send(struct net_device_context *ndev_ctx,
+int netvsc_send(struct net_device *ndev,
 		struct hv_netvsc_packet *packet,
 		struct rndis_message *rndis_msg,
 		struct hv_page_buffer *pb,
 		struct sk_buff *skb)
 {
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
 	struct netvsc_device *net_device
 		= rcu_dereference_bh(ndev_ctx->nvdev);
 	struct hv_device *device = ndev_ctx->device_ctx;
@@ -844,8 +844,7 @@ int netvsc_send(struct net_device_context *ndev_ctx,
 	struct multi_send_data *msdp;
 	struct hv_netvsc_packet *msd_send = NULL, *cur_send = NULL;
 	struct sk_buff *msd_skb = NULL;
-	bool try_batch;
-	bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
+	bool try_batch, xmit_more;
 
 	/* If device is rescinded, return error and packet will get dropped. */
 	if (unlikely(!net_device || net_device->destroy))
@@ -896,10 +895,17 @@ int netvsc_send(struct net_device_context *ndev_ctx,
 		}
 	}
 
+	/* Keep aggregating only if stack says more data is coming
+	 * and not doing mixed modes send and not flow blocked
+	 */
+	xmit_more = skb->xmit_more &&
+		!packet->cp_partial &&
+		!netif_xmit_stopped(netdev_get_tx_queue(ndev, packet->q_idx));
+
 	if (section_index != NETVSC_INVALID_INDEX) {
 		netvsc_copy_to_send_buf(net_device,
 					section_index, msd_len,
-					packet, rndis_msg, pb, skb);
+					packet, rndis_msg, pb, xmit_more);
 
 		packet->send_buf_index = section_index;
 
@@ -919,7 +925,7 @@ int netvsc_send(struct net_device_context *ndev_ctx,
 		if (msdp->skb)
 			dev_consume_skb_any(msdp->skb);
 
-		if (xmit_more && !packet->cp_partial) {
+		if (xmit_more) {
 			msdp->skb = skb;
 			msdp->pkt = packet;
 			msdp->count++;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 717899993933..cc5bf9544d57 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -614,7 +614,7 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
 	/* timestamp packet in software */
 	skb_tx_timestamp(skb);
 
-	ret = netvsc_send(net_device_ctx, packet, rndis_msg, pb, skb);
+	ret = netvsc_send(net, packet, rndis_msg, pb, skb);
 	if (likely(ret == 0))
 		return NETDEV_TX_OK;
 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index be57639bee29..0c99d9926085 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -217,7 +217,6 @@ static int rndis_filter_send_request(struct rndis_device *dev,
 	struct hv_netvsc_packet *packet;
 	struct hv_page_buffer page_buf[2];
 	struct hv_page_buffer *pb = page_buf;
-	struct net_device_context *net_device_ctx = netdev_priv(dev->ndev);
 	int ret;
 
 	/* Setup the packet to send it */
@@ -245,7 +244,7 @@ static int rndis_filter_send_request(struct rndis_device *dev,
 	}
 
 	rcu_read_lock_bh();
-	ret = netvsc_send(net_device_ctx, packet, NULL, pb, NULL);
+	ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL);
 	rcu_read_unlock_bh();
 
 	return ret;
-- 
2.14.3


From 4962bce5dce852b55d48d93bc0ac6f115db6a4a0 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 14 May 2018 15:32:08 -0700
Subject: [PATCH 20/34] hv_netvsc: Use the num_online_cpus() for channel limit

[ Commit 25a39f7f975c3c26a0052fbf9b59201c06744332 upstream. ]

Since we no longer localize channel/CPU affiliation within one NUMA
node, num_online_cpus() is used as the number of channel cap, instead of
the number of processors in a NUMA node.

This patch allows a bigger range for tuning the number of channels.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 0c99d9926085..05109ed5377c 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1221,7 +1221,6 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	struct ndis_recv_scale_cap rsscap;
 	u32 rsscap_size = sizeof(struct ndis_recv_scale_cap);
 	u32 mtu, size;
-	const struct cpumask *node_cpu_mask;
 	u32 num_possible_rss_qs;
 	int i, ret;
 
@@ -1290,14 +1289,8 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 	if (ret || rsscap.num_recv_que < 2)
 		goto out;
 
-	/*
-	 * We will limit the VRSS channels to the number CPUs in the NUMA node
-	 * the primary channel is currently bound to.
-	 *
-	 * This also guarantees that num_possible_rss_qs <= num_online_cpus
-	 */
-	node_cpu_mask = cpumask_of_node(cpu_to_node(dev->channel->target_cpu));
-	num_possible_rss_qs = min_t(u32, cpumask_weight(node_cpu_mask),
+	/* This guarantees that num_possible_rss_qs <= num_online_cpus */
+	num_possible_rss_qs = min_t(u32, num_online_cpus(),
 				    rsscap.num_recv_que);
 
 	net_device->max_chn = min_t(u32, VRSS_CHANNEL_MAX, num_possible_rss_qs);
-- 
2.14.3


From 28d5dfff66caaf92342cbebdc5b4631639c7da9d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:09 -0700
Subject: [PATCH 21/34] hv_netvsc: avoid retry on send during shutdown

[ Commit 12f69661a49446840d742d8feb593ace022d9f66 upstream. ]

Change the initialization order so that the device is ready to transmit
(ie connect vsp is completed) before setting the internal reference
to the device with RCU.

This avoids any races on initialization and prevents retry issues
on shutdown.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 22da6399b37a..839b9d6ecb41 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -850,13 +850,6 @@ int netvsc_send(struct net_device *ndev,
 	if (unlikely(!net_device || net_device->destroy))
 		return -ENODEV;
 
-	/* We may race with netvsc_connect_vsp()/netvsc_init_buf() and get
-	 * here before the negotiation with the host is finished and
-	 * send_section_map may not be allocated yet.
-	 */
-	if (unlikely(!net_device->send_section_map))
-		return -EAGAIN;
-
 	nvchan = &net_device->chan_table[packet->q_idx];
 	packet->send_buf_index = NETVSC_INVALID_INDEX;
 	packet->cp_partial = false;
@@ -864,10 +857,8 @@ int netvsc_send(struct net_device *ndev,
 	/* Send control message directly without accessing msd (Multi-Send
 	 * Data) field which may be changed during data packet processing.
 	 */
-	if (!skb) {
-		cur_send = packet;
-		goto send_now;
-	}
+	if (!skb)
+		return netvsc_send_pkt(device, packet, net_device, pb, skb);
 
 	/* batch packets in send buffer if possible */
 	msdp = &nvchan->msd;
@@ -951,7 +942,6 @@ int netvsc_send(struct net_device *ndev,
 		}
 	}
 
-send_now:
 	if (cur_send)
 		ret = netvsc_send_pkt(device, cur_send, net_device, pb, skb);
 
@@ -1308,11 +1298,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 
 	napi_enable(&net_device->chan_table[0].napi);
 
-	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
-	 * populated.
-	 */
-	rcu_assign_pointer(net_device_ctx->nvdev, net_device);
-
 	/* Connect with the NetVsp */
 	ret = netvsc_connect_vsp(device, net_device, device_info);
 	if (ret != 0) {
@@ -1321,6 +1306,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 		goto close;
 	}
 
+	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
+	 * populated.
+	 */
+	rcu_assign_pointer(net_device_ctx->nvdev, net_device);
+
 	return net_device;
 
 close:
-- 
2.14.3


From 69650bf0c2c870a8e026e7647e6f6229beea4c83 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:10 -0700
Subject: [PATCH 22/34] hv_netvsc: only wake transmit queue if link is up

[ Commit f4950e4586dfc957e0a28226eeb992ddc049b5a2 upstream. ]

Don't wake transmit queues if link is not up yet.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index cc5bf9544d57..40a45f11550c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -88,12 +88,11 @@ static int netvsc_open(struct net_device *net)
 		return ret;
 	}
 
-	netif_tx_wake_all_queues(net);
-
 	rdev = nvdev->extension;
-
-	if (!rdev->link_state)
+	if (!rdev->link_state) {
 		netif_carrier_on(net);
+		netif_tx_wake_all_queues(net);
+	}
 
 	if (vf_netdev) {
 		/* Setting synthetic device up transparently sets
-- 
2.14.3


From a4f292fe7931e588b452cf692c30cf86bf802f14 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:11 -0700
Subject: [PATCH 23/34] hv_netvsc: fix error unwind handling if vmbus_open
 fails

[ Commit fcfb4a00d1e514e8313277a01ef919de1113025b upstream. ]

Need to delete NAPI association if vmbus_open fails.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 839b9d6ecb41..b7720b65f5d6 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1288,7 +1288,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 			 net_device->chan_table);
 
 	if (ret != 0) {
-		netif_napi_del(&net_device->chan_table[0].napi);
 		netdev_err(ndev, "unable to open channel: %d\n", ret);
 		goto cleanup;
 	}
@@ -1321,6 +1320,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 	vmbus_close(device->channel);
 
 cleanup:
+	netif_napi_del(&net_device->chan_table[0].napi);
 	free_netvsc_device(&net_device->rcu);
 
 	return ERR_PTR(ret);
-- 
2.14.3


From e5bcc664f1c76a2c58b8f4f1be25db30a44ec047 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:12 -0700
Subject: [PATCH 24/34] hv_netvsc: cancel subchannel setup before halting
 device

[ Commit a7483ec0267c69b34e818738da60b392623da94b upstream. ]

Block setup of multiple channels earlier in the teardown
process. This avoids possible races between halt and subchannel
initialization.

Suggested-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 05109ed5377c..5a312b2d5a7b 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1340,6 +1340,9 @@ void rndis_filter_device_remove(struct hv_device *dev,
 {
 	struct rndis_device *rndis_dev = net_dev->extension;
 
+	/* Don't try and setup sub channels if about to halt */
+	cancel_work_sync(&net_dev->subchan_work);
+
 	/* Halt and release the rndis device */
 	rndis_filter_halt_device(rndis_dev);
 
-- 
2.14.3


From eef4a12b6ca0951e1a0c77fa36bb6a82b3dead87 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:13 -0700
Subject: [PATCH 25/34] hv_netvsc: fix race in napi poll when rescheduling

[ Commit d64e38ae690e3337db0d38d9b149a193a1646c4b upstream. ]

There is a race between napi_reschedule and re-enabling interrupts
which could lead to missed host interrrupts.  This occurs when
interrupts are re-enabled (hv_end_read) and vmbus irq callback
(netvsc_channel_cb) has already scheduled NAPI.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index b7720b65f5d6..7e0eb99571af 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1205,9 +1205,10 @@ int netvsc_poll(struct napi_struct *napi, int budget)
 	if (send_recv_completions(ndev, net_device, nvchan) == 0 &&
 	    work_done < budget &&
 	    napi_complete_done(napi, work_done) &&
-	    hv_end_read(&channel->inbound)) {
+	    hv_end_read(&channel->inbound) &&
+	    napi_schedule_prep(napi)) {
 		hv_begin_read(&channel->inbound);
-		napi_reschedule(napi);
+		__napi_schedule(napi);
 	}
 
 	/* Driver may overshoot since multiple packets per descriptor */
-- 
2.14.3


From f9edde6377b4ab0ca487ae3be024d3dd59269623 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:14 -0700
Subject: [PATCH 26/34] hv_netvsc: defer queue selection to VF

[ Commit b3bf5666a51068ad5ddd89a76ed877101ef3bc16 upstream. ]

When VF is used for accelerated networking it will likely have
more queues (and different policy) than the synthetic NIC.
This patch defers the queue policy to the VF so that all the
queues can be used. This impacts workloads like local generate UDP.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 40a45f11550c..67f37744eebe 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -283,8 +283,19 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	rcu_read_lock();
 	vf_netdev = rcu_dereference(ndc->vf_netdev);
 	if (vf_netdev) {
-		txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
-		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;
+
+		if (vf_ops->ndo_select_queue)
+			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
+						       accel_priv, fallback);
+		else
+			txq = fallback(vf_netdev, skb);
+
+		/* Record the queue selected by VF so that it can be
+		 * used for common case where VF has more queues than
+		 * the synthetic device.
+		 */
+		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
 	} else {
 		txq = netvsc_pick_tx(ndev, skb);
 	}
-- 
2.14.3


From b4768b645a5741e5d689a99435e582381705b8c1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:15 -0700
Subject: [PATCH 27/34] hv_netvsc: disable NAPI before channel close

[ Commit 8348e0460ab1473f06c8b824699dd2eed3c1979d upstream. ]

This makes sure that no CPU is still process packets when
the channel is closed.

Fixes: 76bb5db5c749 ("netvsc: fix use after free on module removal")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 7e0eb99571af..a8789dc4fc9b 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -567,6 +567,10 @@ void netvsc_device_remove(struct hv_device *device)
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
 
+	/* And disassociate NAPI context from device */
+	for (i = 0; i < net_device->num_chn; i++)
+		netif_napi_del(&net_device->chan_table[i].napi);
+
 	/*
 	 * At this point, no one should be accessing net_device
 	 * except in here
@@ -578,10 +582,6 @@ void netvsc_device_remove(struct hv_device *device)
 
 	netvsc_teardown_gpadl(device, net_device);
 
-	/* And dissassociate NAPI context from device */
-	for (i = 0; i < net_device->num_chn; i++)
-		netif_napi_del(&net_device->chan_table[i].napi);
-
 	/* Release all resources */
 	free_netvsc_device_rcu(net_device);
 }
-- 
2.14.3


From 8cc24ca4ef46616fc93b139addd25ce95739dc6e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:16 -0700
Subject: [PATCH 28/34] hv_netvsc: use RCU to fix concurrent rx and queue
 changes

[ Commit 02400fcee2542ee334a2394e0d9f6efd969fe782 upstream. ]

The receive processing may continue to happen while the
internal network device state is in RCU grace period.
The internal RNDIS structure is associated with the
internal netvsc_device structure; both have the same
RCU lifetime.

Defer freeing all associated parts until after grace
period.

Fixes: 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c       | 17 +++++------------
 drivers/net/hyperv/rndis_filter.c | 39 +++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 34 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index a8789dc4fc9b..03328a60377e 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -89,6 +89,11 @@ static void free_netvsc_device(struct rcu_head *head)
 		= container_of(head, struct netvsc_device, rcu);
 	int i;
 
+	kfree(nvdev->extension);
+	vfree(nvdev->recv_buf);
+	vfree(nvdev->send_buf);
+	kfree(nvdev->send_section_map);
+
 	for (i = 0; i < VRSS_CHANNEL_MAX; i++)
 		vfree(nvdev->chan_table[i].mrc.slots);
 
@@ -210,12 +215,6 @@ static void netvsc_teardown_gpadl(struct hv_device *device,
 		net_device->recv_buf_gpadl_handle = 0;
 	}
 
-	if (net_device->recv_buf) {
-		/* Free up the receive buffer */
-		vfree(net_device->recv_buf);
-		net_device->recv_buf = NULL;
-	}
-
 	if (net_device->send_buf_gpadl_handle) {
 		ret = vmbus_teardown_gpadl(device->channel,
 					   net_device->send_buf_gpadl_handle);
@@ -230,12 +229,6 @@ static void netvsc_teardown_gpadl(struct hv_device *device,
 		}
 		net_device->send_buf_gpadl_handle = 0;
 	}
-	if (net_device->send_buf) {
-		/* Free up the send buffer */
-		vfree(net_device->send_buf);
-		net_device->send_buf = NULL;
-	}
-	kfree(net_device->send_section_map);
 }
 
 int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx)
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 5a312b2d5a7b..056499014a1f 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -266,13 +266,23 @@ static void rndis_set_link_state(struct rndis_device *rdev,
 	}
 }
 
-static void rndis_filter_receive_response(struct rndis_device *dev,
-				       struct rndis_message *resp)
+static void rndis_filter_receive_response(struct net_device *ndev,
+					  struct netvsc_device *nvdev,
+					  const struct rndis_message *resp)
 {
+	struct rndis_device *dev = nvdev->extension;
 	struct rndis_request *request = NULL;
 	bool found = false;
 	unsigned long flags;
-	struct net_device *ndev = dev->ndev;
+
+	/* This should never happen, it means control message
+	 * response received after device removed.
+	 */
+	if (dev->state == RNDIS_DEV_UNINITIALIZED) {
+		netdev_err(ndev,
+			   "got rndis message uninitialized\n");
+		return;
+	}
 
 	spin_lock_irqsave(&dev->request_lock, flags);
 	list_for_each_entry(request, &dev->req_list, list_ent) {
@@ -353,7 +363,7 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type)
 }
 
 static int rndis_filter_receive_data(struct net_device *ndev,
-				     struct rndis_device *dev,
+				     struct netvsc_device *nvdev,
 				     struct rndis_message *msg,
 				     struct vmbus_channel *channel,
 				     void *data, u32 data_buflen)
@@ -373,7 +383,7 @@ static int rndis_filter_receive_data(struct net_device *ndev,
 	 * should be the data packet size plus the trailer padding size
 	 */
 	if (unlikely(data_buflen < rndis_pkt->data_len)) {
-		netdev_err(dev->ndev, "rndis message buffer "
+		netdev_err(ndev, "rndis message buffer "
 			   "overflow detected (got %u, min %u)"
 			   "...dropping this message!\n",
 			   data_buflen, rndis_pkt->data_len);
@@ -401,34 +411,20 @@ int rndis_filter_receive(struct net_device *ndev,
 			 void *data, u32 buflen)
 {
 	struct net_device_context *net_device_ctx = netdev_priv(ndev);
-	struct rndis_device *rndis_dev = net_dev->extension;
 	struct rndis_message *rndis_msg = data;
 
-	/* Make sure the rndis device state is initialized */
-	if (unlikely(!rndis_dev)) {
-		netif_err(net_device_ctx, rx_err, ndev,
-			  "got rndis message but no rndis device!\n");
-		return NVSP_STAT_FAIL;
-	}
-
-	if (unlikely(rndis_dev->state == RNDIS_DEV_UNINITIALIZED)) {
-		netif_err(net_device_ctx, rx_err, ndev,
-			  "got rndis message uninitialized\n");
-		return NVSP_STAT_FAIL;
-	}
-
 	if (netif_msg_rx_status(net_device_ctx))
 		dump_rndis_message(dev, rndis_msg);
 
 	switch (rndis_msg->ndis_msg_type) {
 	case RNDIS_MSG_PACKET:
-		return rndis_filter_receive_data(ndev, rndis_dev, rndis_msg,
+		return rndis_filter_receive_data(ndev, net_dev, rndis_msg,
 						 channel, data, buflen);
 	case RNDIS_MSG_INIT_C:
 	case RNDIS_MSG_QUERY_C:
 	case RNDIS_MSG_SET_C:
 		/* completion msgs */
-		rndis_filter_receive_response(rndis_dev, rndis_msg);
+		rndis_filter_receive_response(ndev, net_dev, rndis_msg);
 		break;
 
 	case RNDIS_MSG_INDICATE:
@@ -1349,7 +1345,6 @@ void rndis_filter_device_remove(struct hv_device *dev,
 	net_dev->extension = NULL;
 
 	netvsc_device_remove(dev);
-	kfree(rndis_dev);
 }
 
 int rndis_filter_open(struct netvsc_device *nvdev)
-- 
2.14.3


From 9970bd6486f01b70d0b34cdacd78b4f943e93041 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:17 -0700
Subject: [PATCH 29/34] hv_netvsc: change GPAD teardown order on older versions

[ Commit 0ef58b0a05c127762f975c3dfe8b922e4aa87a29 upstream. ]

On older versions of Windows, the host ignores messages after
vmbus channel is closed.

Workaround this by doing what Windows does and send the teardown
before close on older versions of NVSP protocol.

Reported-by: Mohammed Gamal <mgamal@redhat.com>
Fixes: 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 03328a60377e..3f3c6489ade2 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -570,10 +570,15 @@ void netvsc_device_remove(struct hv_device *device)
 	 */
 	netdev_dbg(ndev, "net device safe to remove\n");
 
+	/* older versions require that buffer be revoked before close */
+	if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_4)
+		netvsc_teardown_gpadl(device, net_device);
+
 	/* Now, we can close the channel safely */
 	vmbus_close(device->channel);
 
-	netvsc_teardown_gpadl(device, net_device);
+	if (net_device->nvsp_version >= NVSP_PROTOCOL_VERSION_4)
+		netvsc_teardown_gpadl(device, net_device);
 
 	/* Release all resources */
 	free_netvsc_device_rcu(net_device);
-- 
2.14.3


From 2bbbffc869db77aecdf70ac6dcf117f346cf6a3c Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 14 May 2018 15:32:18 -0700
Subject: [PATCH 30/34] hv_netvsc: common detach logic

[ Commit 7b2ee50c0cd513a176a26a71f2989facdd75bfea upstream. ]

Make common function for detaching internals of device
during changes to MTU and RSS. Make sure no more packets
are transmitted and all packets have been received before
doing device teardown.

Change the wait logic to be common and use usleep_range().

Changes transmit enabling logic so that transmit queues are disabled
during the period when lower device is being changed. And enabled
only after sub channels are setup. This avoids issue where it could
be that a packet was being sent while subchannel was not initialized.

Fixes: 8195b1396ec8 ("hv_netvsc: fix deadlock on hotplug")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   |   1 -
 drivers/net/hyperv/netvsc.c       |  19 +--
 drivers/net/hyperv/netvsc_drv.c   | 278 +++++++++++++++++++++-----------------
 drivers/net/hyperv/rndis_filter.c |  15 +-
 4 files changed, 173 insertions(+), 140 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index fd51a329e36e..01017dd88802 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -208,7 +208,6 @@ void netvsc_channel_cb(void *context);
 int netvsc_poll(struct napi_struct *napi, int budget);
 
 void rndis_set_subchannel(struct work_struct *w);
-bool rndis_filter_opened(const struct netvsc_device *nvdev);
 int rndis_filter_open(struct netvsc_device *nvdev);
 int rndis_filter_close(struct netvsc_device *nvdev);
 struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 3f3c6489ade2..0c548748fba8 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -554,8 +554,6 @@ void netvsc_device_remove(struct hv_device *device)
 		= rtnl_dereference(net_device_ctx->nvdev);
 	int i;
 
-	cancel_work_sync(&net_device->subchan_work);
-
 	netvsc_revoke_buf(device, net_device);
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
@@ -644,13 +642,18 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 	queue_sends =
 		atomic_dec_return(&net_device->chan_table[q_idx].queue_sends);
 
-	if (net_device->destroy && queue_sends == 0)
-		wake_up(&net_device->wait_drain);
+	if (unlikely(net_device->destroy)) {
+		if (queue_sends == 0)
+			wake_up(&net_device->wait_drain);
+	} else {
+		struct netdev_queue *txq = netdev_get_tx_queue(ndev, q_idx);
 
-	if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
-	    (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
-	     queue_sends < 1))
-		netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
+		if (netif_tx_queue_stopped(txq) &&
+		    (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
+		     queue_sends < 1)) {
+			netif_tx_wake_queue(txq);
+		}
+	}
 }
 
 static void netvsc_send_completion(struct netvsc_device *net_device,
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 67f37744eebe..11b46c8d2d67 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -45,7 +45,10 @@
 
 #include "hyperv_net.h"
 
-#define RING_SIZE_MIN		64
+#define RING_SIZE_MIN	64
+#define RETRY_US_LO	5000
+#define RETRY_US_HI	10000
+#define RETRY_MAX	2000	/* >10 sec */
 
 #define LINKCHANGE_INT (2 * HZ)
 #define VF_TAKEOVER_INT (HZ / 10)
@@ -89,10 +92,8 @@ static int netvsc_open(struct net_device *net)
 	}
 
 	rdev = nvdev->extension;
-	if (!rdev->link_state) {
+	if (!rdev->link_state)
 		netif_carrier_on(net);
-		netif_tx_wake_all_queues(net);
-	}
 
 	if (vf_netdev) {
 		/* Setting synthetic device up transparently sets
@@ -108,36 +109,25 @@ static int netvsc_open(struct net_device *net)
 	return 0;
 }
 
-static int netvsc_close(struct net_device *net)
+static int netvsc_wait_until_empty(struct netvsc_device *nvdev)
 {
-	struct net_device_context *net_device_ctx = netdev_priv(net);
-	struct net_device *vf_netdev
-		= rtnl_dereference(net_device_ctx->vf_netdev);
-	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
-	int ret = 0;
-	u32 aread, i, msec = 10, retry = 0, retry_max = 20;
-	struct vmbus_channel *chn;
-
-	netif_tx_disable(net);
-
-	/* No need to close rndis filter if it is removed already */
-	if (!nvdev)
-		goto out;
-
-	ret = rndis_filter_close(nvdev);
-	if (ret != 0) {
-		netdev_err(net, "unable to close device (ret %d).\n", ret);
-		return ret;
-	}
+	unsigned int retry = 0;
+	int i;
 
 	/* Ensure pending bytes in ring are read */
-	while (true) {
-		aread = 0;
+	for (;;) {
+		u32 aread = 0;
+
 		for (i = 0; i < nvdev->num_chn; i++) {
-			chn = nvdev->chan_table[i].channel;
+			struct vmbus_channel *chn
+				= nvdev->chan_table[i].channel;
+
 			if (!chn)
 				continue;
 
+			/* make sure receive not running now */
+			napi_synchronize(&nvdev->chan_table[i].napi);
+
 			aread = hv_get_bytes_to_read(&chn->inbound);
 			if (aread)
 				break;
@@ -147,22 +137,40 @@ static int netvsc_close(struct net_device *net)
 				break;
 		}
 
-		retry++;
-		if (retry > retry_max || aread == 0)
-			break;
+		if (aread == 0)
+			return 0;
 
-		msleep(msec);
+		if (++retry > RETRY_MAX)
+			return -ETIMEDOUT;
 
-		if (msec < 1000)
-			msec *= 2;
+		usleep_range(RETRY_US_LO, RETRY_US_HI);
 	}
+}
 
-	if (aread) {
-		netdev_err(net, "Ring buffer not empty after closing rndis\n");
-		ret = -ETIMEDOUT;
+static int netvsc_close(struct net_device *net)
+{
+	struct net_device_context *net_device_ctx = netdev_priv(net);
+	struct net_device *vf_netdev
+		= rtnl_dereference(net_device_ctx->vf_netdev);
+	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
+	int ret;
+
+	netif_tx_disable(net);
+
+	/* No need to close rndis filter if it is removed already */
+	if (!nvdev)
+		return 0;
+
+	ret = rndis_filter_close(nvdev);
+	if (ret != 0) {
+		netdev_err(net, "unable to close device (ret %d).\n", ret);
+		return ret;
 	}
 
-out:
+	ret = netvsc_wait_until_empty(nvdev);
+	if (ret)
+		netdev_err(net, "Ring buffer not empty after closing rndis\n");
+
 	if (vf_netdev)
 		dev_close(vf_netdev);
 
@@ -820,16 +828,81 @@ static void netvsc_get_channels(struct net_device *net,
 	}
 }
 
+static int netvsc_detach(struct net_device *ndev,
+			 struct netvsc_device *nvdev)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+	struct hv_device *hdev = ndev_ctx->device_ctx;
+	int ret;
+
+	/* Don't try continuing to try and setup sub channels */
+	if (cancel_work_sync(&nvdev->subchan_work))
+		nvdev->num_chn = 1;
+
+	/* If device was up (receiving) then shutdown */
+	if (netif_running(ndev)) {
+		netif_tx_disable(ndev);
+
+		ret = rndis_filter_close(nvdev);
+		if (ret) {
+			netdev_err(ndev,
+				   "unable to close device (ret %d).\n", ret);
+			return ret;
+		}
+
+		ret = netvsc_wait_until_empty(nvdev);
+		if (ret) {
+			netdev_err(ndev,
+				   "Ring buffer not empty after closing rndis\n");
+			return ret;
+		}
+	}
+
+	netif_device_detach(ndev);
+
+	rndis_filter_device_remove(hdev, nvdev);
+
+	return 0;
+}
+
+static int netvsc_attach(struct net_device *ndev,
+			 struct netvsc_device_info *dev_info)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+	struct hv_device *hdev = ndev_ctx->device_ctx;
+	struct netvsc_device *nvdev;
+	struct rndis_device *rdev;
+	int ret;
+
+	nvdev = rndis_filter_device_add(hdev, dev_info);
+	if (IS_ERR(nvdev))
+		return PTR_ERR(nvdev);
+
+	/* Note: enable and attach happen when sub-channels setup */
+
+	netif_carrier_off(ndev);
+
+	if (netif_running(ndev)) {
+		ret = rndis_filter_open(nvdev);
+		if (ret)
+			return ret;
+
+		rdev = nvdev->extension;
+		if (!rdev->link_state)
+			netif_carrier_on(ndev);
+	}
+
+	return 0;
+}
+
 static int netvsc_set_channels(struct net_device *net,
 			       struct ethtool_channels *channels)
 {
 	struct net_device_context *net_device_ctx = netdev_priv(net);
-	struct hv_device *dev = net_device_ctx->device_ctx;
 	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
 	unsigned int orig, count = channels->combined_count;
 	struct netvsc_device_info device_info;
-	bool was_opened;
-	int ret = 0;
+	int ret;
 
 	/* We do not support separate count for rx, tx, or other */
 	if (count == 0 ||
@@ -846,9 +919,6 @@ static int netvsc_set_channels(struct net_device *net,
 		return -EINVAL;
 
 	orig = nvdev->num_chn;
-	was_opened = rndis_filter_opened(nvdev);
-	if (was_opened)
-		rndis_filter_close(nvdev);
 
 	memset(&device_info, 0, sizeof(device_info));
 	device_info.num_chn = count;
@@ -858,28 +928,17 @@ static int netvsc_set_channels(struct net_device *net,
 	device_info.recv_sections = nvdev->recv_section_cnt;
 	device_info.recv_section_size = nvdev->recv_section_size;
 
-	rndis_filter_device_remove(dev, nvdev);
+	ret = netvsc_detach(net, nvdev);
+	if (ret)
+		return ret;
 
-	nvdev = rndis_filter_device_add(dev, &device_info);
-	if (IS_ERR(nvdev)) {
-		ret = PTR_ERR(nvdev);
+	ret = netvsc_attach(net, &device_info);
+	if (ret) {
 		device_info.num_chn = orig;
-		nvdev = rndis_filter_device_add(dev, &device_info);
-
-		if (IS_ERR(nvdev)) {
-			netdev_err(net, "restoring channel setting failed: %ld\n",
-				   PTR_ERR(nvdev));
-			return ret;
-		}
+		if (netvsc_attach(net, &device_info))
+			netdev_err(net, "restoring channel setting failed\n");
 	}
 
-	if (was_opened)
-		rndis_filter_open(nvdev);
-
-	/* We may have missed link change notifications */
-	net_device_ctx->last_reconfig = 0;
-	schedule_delayed_work(&net_device_ctx->dwork, 0);
-
 	return ret;
 }
 
@@ -946,10 +1005,8 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu)
 	struct net_device_context *ndevctx = netdev_priv(ndev);
 	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
 	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
-	struct hv_device *hdev = ndevctx->device_ctx;
 	int orig_mtu = ndev->mtu;
 	struct netvsc_device_info device_info;
-	bool was_opened;
 	int ret = 0;
 
 	if (!nvdev || nvdev->destroy)
@@ -962,11 +1019,6 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu)
 			return ret;
 	}
 
-	netif_device_detach(ndev);
-	was_opened = rndis_filter_opened(nvdev);
-	if (was_opened)
-		rndis_filter_close(nvdev);
-
 	memset(&device_info, 0, sizeof(device_info));
 	device_info.ring_size = ring_size;
 	device_info.num_chn = nvdev->num_chn;
@@ -975,35 +1027,27 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu)
 	device_info.recv_sections = nvdev->recv_section_cnt;
 	device_info.recv_section_size = nvdev->recv_section_size;
 
-	rndis_filter_device_remove(hdev, nvdev);
+	ret = netvsc_detach(ndev, nvdev);
+	if (ret)
+		goto rollback_vf;
 
 	ndev->mtu = mtu;
 
-	nvdev = rndis_filter_device_add(hdev, &device_info);
-	if (IS_ERR(nvdev)) {
-		ret = PTR_ERR(nvdev);
-
-		/* Attempt rollback to original MTU */
-		ndev->mtu = orig_mtu;
-		nvdev = rndis_filter_device_add(hdev, &device_info);
-
-		if (vf_netdev)
-			dev_set_mtu(vf_netdev, orig_mtu);
-
-		if (IS_ERR(nvdev)) {
-			netdev_err(ndev, "restoring mtu failed: %ld\n",
-				   PTR_ERR(nvdev));
-			return ret;
-		}
-	}
+	ret = netvsc_attach(ndev, &device_info);
+	if (ret)
+		goto rollback;
 
-	if (was_opened)
-		rndis_filter_open(nvdev);
+	return 0;
 
-	netif_device_attach(ndev);
+rollback:
+	/* Attempt rollback to original MTU */
+	ndev->mtu = orig_mtu;
 
-	/* We may have missed link change notifications */
-	schedule_delayed_work(&ndevctx->dwork, 0);
+	if (netvsc_attach(ndev, &device_info))
+		netdev_err(ndev, "restoring mtu failed\n");
+rollback_vf:
+	if (vf_netdev)
+		dev_set_mtu(vf_netdev, orig_mtu);
 
 	return ret;
 }
@@ -1469,11 +1513,9 @@ static int netvsc_set_ringparam(struct net_device *ndev,
 {
 	struct net_device_context *ndevctx = netdev_priv(ndev);
 	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
-	struct hv_device *hdev = ndevctx->device_ctx;
 	struct netvsc_device_info device_info;
 	struct ethtool_ringparam orig;
 	u32 new_tx, new_rx;
-	bool was_opened;
 	int ret = 0;
 
 	if (!nvdev || nvdev->destroy)
@@ -1499,34 +1541,18 @@ static int netvsc_set_ringparam(struct net_device *ndev,
 	device_info.recv_sections = new_rx;
 	device_info.recv_section_size = nvdev->recv_section_size;
 
-	netif_device_detach(ndev);
-	was_opened = rndis_filter_opened(nvdev);
-	if (was_opened)
-		rndis_filter_close(nvdev);
-
-	rndis_filter_device_remove(hdev, nvdev);
-
-	nvdev = rndis_filter_device_add(hdev, &device_info);
-	if (IS_ERR(nvdev)) {
-		ret = PTR_ERR(nvdev);
+	ret = netvsc_detach(ndev, nvdev);
+	if (ret)
+		return ret;
 
+	ret = netvsc_attach(ndev, &device_info);
+	if (ret) {
 		device_info.send_sections = orig.tx_pending;
 		device_info.recv_sections = orig.rx_pending;
-		nvdev = rndis_filter_device_add(hdev, &device_info);
-		if (IS_ERR(nvdev)) {
-			netdev_err(ndev, "restoring ringparam failed: %ld\n",
-				   PTR_ERR(nvdev));
-			return ret;
-		}
-	}
-
-	if (was_opened)
-		rndis_filter_open(nvdev);
-	netif_device_attach(ndev);
 
-	/* We may have missed link change notifications */
-	ndevctx->last_reconfig = 0;
-	schedule_delayed_work(&ndevctx->dwork, 0);
+		if (netvsc_attach(ndev, &device_info))
+			netdev_err(ndev, "restoring ringparam failed");
+	}
 
 	return ret;
 }
@@ -2003,8 +2029,8 @@ static int netvsc_probe(struct hv_device *dev,
 static int netvsc_remove(struct hv_device *dev)
 {
 	struct net_device_context *ndev_ctx;
-	struct net_device *vf_netdev;
-	struct net_device *net;
+	struct net_device *vf_netdev, *net;
+	struct netvsc_device *nvdev;
 
 	net = hv_get_drvdata(dev);
 	if (net == NULL) {
@@ -2014,10 +2040,14 @@ static int netvsc_remove(struct hv_device *dev)
 
 	ndev_ctx = netdev_priv(net);
 
-	netif_device_detach(net);
-
 	cancel_delayed_work_sync(&ndev_ctx->dwork);
 
+	rcu_read_lock();
+	nvdev = rcu_dereference(ndev_ctx->nvdev);
+
+	if  (nvdev)
+		cancel_work_sync(&nvdev->subchan_work);
+
 	/*
 	 * Call to the vsc driver to let it know that the device is being
 	 * removed. Also blocks mtu and channel changes.
@@ -2027,11 +2057,13 @@ static int netvsc_remove(struct hv_device *dev)
 	if (vf_netdev)
 		netvsc_unregister_vf(vf_netdev);
 
+	if (nvdev)
+		rndis_filter_device_remove(dev, nvdev);
+
 	unregister_netdevice(net);
 
-	rndis_filter_device_remove(dev,
-				   rtnl_dereference(ndev_ctx->nvdev));
 	rtnl_unlock();
+	rcu_read_unlock();
 
 	hv_set_drvdata(dev, NULL);
 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 056499014a1f..3bfa56560286 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1112,6 +1112,7 @@ void rndis_set_subchannel(struct work_struct *w)
 	for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
 		ndev_ctx->tx_table[i] = i % nvdev->num_chn;
 
+	netif_device_attach(ndev);
 	rtnl_unlock();
 	return;
 
@@ -1122,6 +1123,8 @@ void rndis_set_subchannel(struct work_struct *w)
 
 	nvdev->max_chn = 1;
 	nvdev->num_chn = 1;
+
+	netif_device_attach(ndev);
 unlock:
 	rtnl_unlock();
 }
@@ -1324,6 +1327,10 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 		net_device->num_chn = 1;
 	}
 
+	/* No sub channels, device is ready */
+	if (net_device->num_chn == 1)
+		netif_device_attach(net);
+
 	return net_device;
 
 err_dev_remv:
@@ -1336,9 +1343,6 @@ void rndis_filter_device_remove(struct hv_device *dev,
 {
 	struct rndis_device *rndis_dev = net_dev->extension;
 
-	/* Don't try and setup sub channels if about to halt */
-	cancel_work_sync(&net_dev->subchan_work);
-
 	/* Halt and release the rndis device */
 	rndis_filter_halt_device(rndis_dev);
 
@@ -1368,8 +1372,3 @@ int rndis_filter_close(struct netvsc_device *nvdev)
 
 	return rndis_filter_close_device(nvdev->extension);
 }
-
-bool rndis_filter_opened(const struct netvsc_device *nvdev)
-{
-	return atomic_read(&nvdev->open_cnt) > 0;
-}
-- 
2.14.3


From 1fb47631ecd3d8e8811a8519d383f11eea721766 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Mon, 14 May 2018 15:32:19 -0700
Subject: [PATCH 31/34] hv_netvsc: Use Windows version instead of NVSP version
 on GPAD teardown

commit 2afc5d61a7197de25a61f54ea4ecfb4cb62b1d42A upstram

When changing network interface settings, Windows guests
older than WS2016 can no longer shutdown. This was addressed
by commit 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order
on older versions"), however the issue also occurs on WS2012
guests that share NVSP protocol versions with WS2016 guests.
Hence we use Windows version directly to differentiate them.

Fixes: 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order on older versions")
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 0c548748fba8..aa9b7a912c31 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -569,13 +569,13 @@ void netvsc_device_remove(struct hv_device *device)
 	netdev_dbg(ndev, "net device safe to remove\n");
 
 	/* older versions require that buffer be revoked before close */
-	if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_4)
+	if (vmbus_proto_version < VERSION_WIN10)
 		netvsc_teardown_gpadl(device, net_device);
 
 	/* Now, we can close the channel safely */
 	vmbus_close(device->channel);
 
-	if (net_device->nvsp_version >= NVSP_PROTOCOL_VERSION_4)
+	if (vmbus_proto_version >= VERSION_WIN10)
 		netvsc_teardown_gpadl(device, net_device);
 
 	/* Release all resources */
-- 
2.14.3


From 2d058ded4b9b0271040d0fe5be0089a65cb72af0 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Mon, 14 May 2018 15:32:20 -0700
Subject: [PATCH 32/34] hv_netvsc: Split netvsc_revoke_buf() and
 netvsc_teardown_gpadl()

[ Commit 7992894c305eaf504d005529637ff8283d0a849d upstream. ]

Split each of the functions into two for each of send/recv buffers.
This will be needed in order to implement a fine-grained messaging
sequence to the host so that we accommodate the requirements of
different Windows versions

Fixes: 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order on older versions")
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 46 +++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index aa9b7a912c31..25fcba506ac5 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -105,11 +105,11 @@ static void free_netvsc_device_rcu(struct netvsc_device *nvdev)
 	call_rcu(&nvdev->rcu, free_netvsc_device);
 }
 
-static void netvsc_revoke_buf(struct hv_device *device,
-			      struct netvsc_device *net_device)
+static void netvsc_revoke_recv_buf(struct hv_device *device,
+				   struct netvsc_device *net_device)
 {
-	struct nvsp_message *revoke_packet;
 	struct net_device *ndev = hv_get_drvdata(device);
+	struct nvsp_message *revoke_packet;
 	int ret;
 
 	/*
@@ -151,6 +151,14 @@ static void netvsc_revoke_buf(struct hv_device *device,
 		}
 		net_device->recv_section_cnt = 0;
 	}
+}
+
+static void netvsc_revoke_send_buf(struct hv_device *device,
+				   struct netvsc_device *net_device)
+{
+	struct net_device *ndev = hv_get_drvdata(device);
+	struct nvsp_message *revoke_packet;
+	int ret;
 
 	/* Deal with the send buffer we may have setup.
 	 * If we got a  send section size, it means we received a
@@ -194,8 +202,8 @@ static void netvsc_revoke_buf(struct hv_device *device,
 	}
 }
 
-static void netvsc_teardown_gpadl(struct hv_device *device,
-				  struct netvsc_device *net_device)
+static void netvsc_teardown_recv_gpadl(struct hv_device *device,
+				       struct netvsc_device *net_device)
 {
 	struct net_device *ndev = hv_get_drvdata(device);
 	int ret;
@@ -214,6 +222,13 @@ static void netvsc_teardown_gpadl(struct hv_device *device,
 		}
 		net_device->recv_buf_gpadl_handle = 0;
 	}
+}
+
+static void netvsc_teardown_send_gpadl(struct hv_device *device,
+				       struct netvsc_device *net_device)
+{
+	struct net_device *ndev = hv_get_drvdata(device);
+	int ret;
 
 	if (net_device->send_buf_gpadl_handle) {
 		ret = vmbus_teardown_gpadl(device->channel,
@@ -423,8 +438,10 @@ static int netvsc_init_buf(struct hv_device *device,
 	goto exit;
 
 cleanup:
-	netvsc_revoke_buf(device, net_device);
-	netvsc_teardown_gpadl(device, net_device);
+	netvsc_revoke_recv_buf(device, net_device);
+	netvsc_revoke_send_buf(device, net_device);
+	netvsc_teardown_recv_gpadl(device, net_device);
+	netvsc_teardown_send_gpadl(device, net_device);
 
 exit:
 	return ret;
@@ -554,7 +571,8 @@ void netvsc_device_remove(struct hv_device *device)
 		= rtnl_dereference(net_device_ctx->nvdev);
 	int i;
 
-	netvsc_revoke_buf(device, net_device);
+	netvsc_revoke_recv_buf(device, net_device);
+	netvsc_revoke_send_buf(device, net_device);
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
 
@@ -569,14 +587,18 @@ void netvsc_device_remove(struct hv_device *device)
 	netdev_dbg(ndev, "net device safe to remove\n");
 
 	/* older versions require that buffer be revoked before close */
-	if (vmbus_proto_version < VERSION_WIN10)
-		netvsc_teardown_gpadl(device, net_device);
+	if (vmbus_proto_version < VERSION_WIN10) {
+		netvsc_teardown_recv_gpadl(device, net_device);
+		netvsc_teardown_send_gpadl(device, net_device);
+	}
 
 	/* Now, we can close the channel safely */
 	vmbus_close(device->channel);
 
-	if (vmbus_proto_version >= VERSION_WIN10)
-		netvsc_teardown_gpadl(device, net_device);
+	if (vmbus_proto_version >= VERSION_WIN10) {
+		netvsc_teardown_recv_gpadl(device, net_device);
+		netvsc_teardown_send_gpadl(device, net_device);
+	}
 
 	/* Release all resources */
 	free_netvsc_device_rcu(net_device);
-- 
2.14.3


From 3e3772a6765535629ab3901be6309f9f002b602c Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Mon, 14 May 2018 15:32:21 -0700
Subject: [PATCH 33/34] hv_netvsc: Ensure correct teardown message sequence
 order

[ Commit a56d99d714665591fed8527b90eef21530ea61e0 upstream. ]

Prior to commit 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split")
the call sequence in netvsc_device_remove() was as follows (as
implemented in netvsc_destroy_buf()):
1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message
2- Teardown receive buffer GPADL
3- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message
4- Teardown send buffer GPADL
5- Close vmbus

This didn't work for WS2016 hosts. Commit 0cf737808ae7
("hv_netvsc: netvsc_teardown_gpadl() split") rearranged the
teardown sequence as follows:
1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message
2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message
3- Close vmbus
4- Teardown receive buffer GPADL
5- Teardown send buffer GPADL

That worked well for WS2016 hosts, but it prevented guests on older hosts from
shutting down after changing network settings. Commit 0ef58b0a05c1
("hv_netvsc: change GPAD teardown order on older versions") ensured the
following message sequence for older hosts
1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message
2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message
3- Teardown receive buffer GPADL
4- Teardown send buffer GPADL
5- Close vmbus

However, with this sequence calling `ip link set eth0 mtu 1000` hangs and the
process becomes uninterruptible. On futher analysis it turns out that on tearing
down the receive buffer GPADL the kernel is waiting indefinitely
in vmbus_teardown_gpadl() for a completion to be signaled.

Here is a snippet of where this occurs:
int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
{
        struct vmbus_channel_gpadl_teardown *msg;
        struct vmbus_channel_msginfo *info;
        unsigned long flags;
        int ret;

        info = kmalloc(sizeof(*info) +
                       sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL);
        if (!info)
                return -ENOMEM;

        init_completion(&info->waitevent);
        info->waiting_channel = channel;
[....]
        ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_gpadl_teardown),
                             true);

        if (ret)
                goto post_msg_err;

        wait_for_completion(&info->waitevent);
[....]
}

The completion is signaled from vmbus_ongpadl_torndown(), which gets called when
the corresponding message is received from the host, which apparently never happens
in that case.
This patch works around the issue by restoring the first mentioned message sequence
for older hosts

Fixes: 0ef58b0a05c1 ("hv_netvsc: change GPAD teardown order on older versions")
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 25fcba506ac5..99be63eacaeb 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -571,8 +571,17 @@ void netvsc_device_remove(struct hv_device *device)
 		= rtnl_dereference(net_device_ctx->nvdev);
 	int i;
 
+	/*
+	 * Revoke receive buffer. If host is pre-Win2016 then tear down
+	 * receive buffer GPADL. Do the same for send buffer.
+	 */
 	netvsc_revoke_recv_buf(device, net_device);
+	if (vmbus_proto_version < VERSION_WIN10)
+		netvsc_teardown_recv_gpadl(device, net_device);
+
 	netvsc_revoke_send_buf(device, net_device);
+	if (vmbus_proto_version < VERSION_WIN10)
+		netvsc_teardown_send_gpadl(device, net_device);
 
 	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
 
@@ -586,15 +595,13 @@ void netvsc_device_remove(struct hv_device *device)
 	 */
 	netdev_dbg(ndev, "net device safe to remove\n");
 
-	/* older versions require that buffer be revoked before close */
-	if (vmbus_proto_version < VERSION_WIN10) {
-		netvsc_teardown_recv_gpadl(device, net_device);
-		netvsc_teardown_send_gpadl(device, net_device);
-	}
-
 	/* Now, we can close the channel safely */
 	vmbus_close(device->channel);
 
+	/*
+	 * If host is Win2016 or higher then we do the GPADL tear down
+	 * here after VMBus is closed.
+	*/
 	if (vmbus_proto_version >= VERSION_WIN10) {
 		netvsc_teardown_recv_gpadl(device, net_device);
 		netvsc_teardown_send_gpadl(device, net_device);
-- 
2.14.3


From b104abdfafab9e75c67d2dbeb1867d3d28d9ac75 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Mon, 14 May 2018 15:32:22 -0700
Subject: [PATCH 34/34] hv_netvsc: Fix net device attach on older Windows hosts

[ Commit 55be9f25be1ca5bda75c39808fc77e42691bc07f upstream. ]

On older windows hosts the net_device instance is returned to
the caller of rndis_filter_device_add() without having the presence
bit set first. This would cause any subsequent calls to network device
operations (e.g. MTU change, channel change) to fail after the device
is detached once, returning -ENODEV.

Instead of returning the device instabce, we take the exit path where
we call netif_device_attach()

Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic")
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Reviewed-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 3bfa56560286..6dde92c1c113 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1276,7 +1276,7 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 		   rndis_device->link_state ? "down" : "up");
 
 	if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_5)
-		return net_device;
+		goto out;
 
 	rndis_filter_query_link_speed(rndis_device, net_device);
 
-- 
2.14.3


[-- Attachment #3: net_416.mbox --]
[-- Type: Application/Octet-Stream, Size: 118700 bytes --]

From caa2be7d6d697e2fec45c88a2ce6c5c06bac2a09 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Thu, 17 May 2018 14:50:44 -0700
Subject: [PATCH 01/32] net: Fix a bug in removing queues from XPS map

[ Upstream commit 6358d49ac23995fdfe157cc8747ab0f274d3954b ]

While removing queues from the XPS map, the individual CPU ID
alone was used to index the CPUs map, this should be changed to also
factor in the traffic class mapping for the CPU-to-queue lookup.

Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3e550507e9f0..ace13bea3e50 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2097,7 +2097,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 		int i, j;
 
 		for (i = count, j = offset; i--; j++) {
-			if (!remove_xps_queue(dev_maps, cpu, j))
+			if (!remove_xps_queue(dev_maps, tci, j))
 				break;
 		}
 
-- 
2.14.3


From a90fc67a32d17c7afb9703406f72e8db3fc64c40 Mon Sep 17 00:00:00 2001
From: Tarick Bedeir <tarick@google.com>
Date: Sun, 13 May 2018 16:38:45 -0700
Subject: [PATCH 02/32] net/mlx4_core: Fix error handling in
 mlx4_init_port_info.

[ Upstream commit 57f6f99fdad9984801cde05c1db68fe39b474a10 ]

Avoid exiting the function with a lingering sysfs file (if the first
call to device_create_file() fails while the second succeeds), and avoid
calling devlink_port_unregister() twice.

In other words, either mlx4_init_port_info() succeeds and returns zero, or
it fails, returns non-zero, and requires no cleanup.

Fixes: 096335b3f983 ("mlx4_core: Allow dynamic MTU configuration for IB ports")
Signed-off-by: Tarick Bedeir <tarick@google.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 4d84cab77105..e8a3a45d0b53 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3007,6 +3007,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 		mlx4_err(dev, "Failed to create file for port %d\n", port);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
 	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
@@ -3028,9 +3029,10 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
 				   &info->port_attr);
 		devlink_port_unregister(&info->devlink_port);
 		info->port = -1;
+		return err;
 	}
 
-	return err;
+	return 0;
 }
 
 static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
-- 
2.14.3


From 8eba4160e89531cc2fcdd38ce77f2139988e3a74 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 14 May 2018 15:38:10 -0700
Subject: [PATCH 03/32] net/mlx5: Fix build break when CONFIG_SMP=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit e3ca34880652250f524022ad89e516f8ba9a805b ]

Avoid using the kernel's irq_descriptor and return IRQ vector affinity
directly from the driver.

This fixes the following build break when CONFIG_SMP=n

include/linux/mlx5/driver.h: In function ‘mlx5_get_vector_affinity_hint’:
include/linux/mlx5/driver.h:1299:13: error:
        ‘struct irq_desc’ has no member named ‘affinity_hint’

Fixes: 6082d9c9c94a ("net/mlx5: Fix mlx5_get_vector_affinity function")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
CC: Randy Dunlap <rdunlap@infradead.org>
CC: Guenter Roeck <linux@roeck-us.net>
CC: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Israel Rukshin <israelr@mellanox.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1352b1b990a7..3ebb2f6ace79 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1271,17 +1271,7 @@ enum {
 static inline const struct cpumask *
 mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector)
 {
-	struct irq_desc *desc;
-	unsigned int irq;
-	int eqn;
-	int err;
-
-	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
-	if (err)
-		return NULL;
-
-	desc = irq_to_desc(irq);
-	return desc->affinity_hint;
+	return dev->priv.irq_info[vector].mask;
 }
 
 #endif /* MLX5_DRIVER_H */
-- 
2.14.3


From 1bb5084cd9b422798c87473fae50131c632a8bfb Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 16 May 2018 12:54:29 +0200
Subject: [PATCH 04/32] net/sched: fix refcnt leak in the error path of
 tcf_vlan_init()

[ Upstream commit 5a4931ae0193f8a4a97e8260fd0df1d705d83299 ]

Similarly to what was done with commit a52956dfc503 ("net sched actions:
fix refcnt leak in skbmod"), fix the error path of tcf_vlan_init() to avoid
refcnt leaks when wrong value of TCA_VLAN_PUSH_VLAN_PROTOCOL is given.

Fixes: 5026c9b1bafc ("net sched: vlan action fix late binding")
CC: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_vlan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index c49cb61adedf..64ca017f2e00 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -161,6 +161,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			case htons(ETH_P_8021AD):
 				break;
 			default:
+				if (exists)
+					tcf_idr_release(*a, bind);
 				return -EPROTONOSUPPORT;
 			}
 		} else {
-- 
2.14.3


From bbd0eb285363437bd4f600a1e5c52ba9860dec44 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 18 May 2018 14:51:44 +0200
Subject: [PATCH 05/32] net: sched: red: avoid hashing NULL child

[ Upstream commit 44a63b137f7b6e4c7bd6c9cc21615941cb36509d ]

Hangbin reported an Oops triggered by the syzkaller qdisc rules:

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault: 0000 [#1] SMP KASAN PTI
 Modules linked in: sch_red
 CPU: 0 PID: 28699 Comm: syz-executor5 Not tainted 4.17.0-rc4.kcov #1
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:qdisc_hash_add+0x26/0xa0
 RSP: 0018:ffff8800589cf470 EFLAGS: 00010203
 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff824ad971
 RDX: 0000000000000007 RSI: ffffc9000ce9f000 RDI: 000000000000003c
 RBP: 0000000000000001 R08: ffffed000b139ea2 R09: ffff8800589cf4f0
 R10: ffff8800589cf50f R11: ffffed000b139ea2 R12: ffff880054019fc0
 R13: ffff880054019fb4 R14: ffff88005c0af600 R15: ffff880054019fb0
 FS:  00007fa6edcb1700(0000) GS:ffff88005ce00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000020000740 CR3: 000000000fc16000 CR4: 00000000000006f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  red_change+0x2d2/0xed0 [sch_red]
  qdisc_create+0x57e/0xef0
  tc_modify_qdisc+0x47f/0x14e0
  rtnetlink_rcv_msg+0x6a8/0x920
  netlink_rcv_skb+0x2a2/0x3c0
  netlink_unicast+0x511/0x740
  netlink_sendmsg+0x825/0xc30
  sock_sendmsg+0xc5/0x100
  ___sys_sendmsg+0x778/0x8e0
  __sys_sendmsg+0xf5/0x1b0
  do_syscall_64+0xbd/0x3b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x450869
 RSP: 002b:00007fa6edcb0c48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007fa6edcb16b4 RCX: 0000000000450869
 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000013
 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
 R13: 0000000000008778 R14: 0000000000702838 R15: 00007fa6edcb1700
 Code: e9 0b fe ff ff 0f 1f 44 00 00 55 53 48 89 fb 89 f5 e8 3f 07 f3 fe 48 8d 7b 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 51
 RIP: qdisc_hash_add+0x26/0xa0 RSP: ffff8800589cf470

When a red qdisc is updated with a 0 limit, the child qdisc is left
unmodified, no additional scheduler is created in red_change(),
the 'child' local variable is rightfully NULL and must not add it
to the hash table.

This change addresses the above issue moving qdisc_hash_add() right
after the child qdisc creation. It additionally removes unneeded checks
for noop_qdisc.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the dump")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 5 +++--
 net/sched/sch_tbf.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 16644b3d2362..56c181c3feeb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -222,10 +222,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 					 extack);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
-	}
 
-	if (child != &noop_qdisc)
+		/* child is fifo, no need to check for noop_qdisc */
 		qdisc_hash_add(child, true);
+	}
+
 	sch_tree_lock(sch);
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 03225a8df973..6f74a426f159 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -383,6 +383,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 			err = PTR_ERR(child);
 			goto done;
 		}
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
 	sch_tree_lock(sch);
@@ -391,8 +394,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
-		if (child != &noop_qdisc)
-			qdisc_hash_add(child, true);
 	}
 	q->limit = qopt->limit;
 	if (tb[TCA_TBF_PBURST])
-- 
2.14.3


From 7d09018e15e453cbf6138622ef9ae94579858e41 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 May 2018 17:01:30 -0700
Subject: [PATCH 06/32] net/smc: check for missing nlattrs in SMC_PNETID
 messages

[ Upstream commit d49baa7e12ee70c0a7b821d088a770c94c02e494 ]

It's possible to crash the kernel in several different ways by sending
messages to the SMC_PNETID generic netlink family that are missing the
expected attributes:

- Missing SMC_PNETID_NAME => null pointer dereference when comparing
  names.
- Missing SMC_PNETID_ETHNAME => null pointer dereference accessing
  smc_pnetentry::ndev.
- Missing SMC_PNETID_IBNAME => null pointer dereference accessing
  smc_pnetentry::smcibdev.
- Missing SMC_PNETID_IBPORT => out of bounds array access to
  smc_ib_device::pattr[-1].

Fix it by validating that all expected attributes are present and that
SMC_PNETID_IBPORT is nonzero.

Reported-by: syzbot+5cd61039dc9b8bfa6e47@syzkaller.appspotmail.com
Fixes: 6812baabf24d ("smc: establish pnet table management")
Cc: <stable@vger.kernel.org> # v4.11+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_pnet.c | 71 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 74568cdbca70..d7b88b2d1b22 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -245,40 +245,45 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
 static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
 			       struct nlattr *tb[])
 {
-	char *string, *ibname = NULL;
-	int rc = 0;
+	char *string, *ibname;
+	int rc;
 
 	memset(pnetelem, 0, sizeof(*pnetelem));
 	INIT_LIST_HEAD(&pnetelem->list);
-	if (tb[SMC_PNETID_NAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_NAME]);
-		if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_ETHNAME]) {
-		string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
-		pnetelem->ndev = dev_get_by_name(net, string);
-		if (!pnetelem->ndev)
-			return -ENOENT;
-	}
-	if (tb[SMC_PNETID_IBNAME]) {
-		ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
-		ibname = strim(ibname);
-		pnetelem->smcibdev = smc_pnet_find_ib(ibname);
-		if (!pnetelem->smcibdev) {
-			rc = -ENOENT;
-			goto error;
-		}
-	}
-	if (tb[SMC_PNETID_IBPORT]) {
-		pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
-		if (pnetelem->ib_port > SMC_MAX_PORTS) {
-			rc = -EINVAL;
-			goto error;
-		}
-	}
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_NAME])
+		goto error;
+	string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+	if (!smc_pnetid_valid(string, pnetelem->pnet_name))
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_ETHNAME])
+		goto error;
+	rc = -ENOENT;
+	string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+	pnetelem->ndev = dev_get_by_name(net, string);
+	if (!pnetelem->ndev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBNAME])
+		goto error;
+	rc = -ENOENT;
+	ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+	ibname = strim(ibname);
+	pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+	if (!pnetelem->smcibdev)
+		goto error;
+
+	rc = -EINVAL;
+	if (!tb[SMC_PNETID_IBPORT])
+		goto error;
+	pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+	if (pnetelem->ib_port < 1 || pnetelem->ib_port > SMC_MAX_PORTS)
+		goto error;
+
 	return 0;
 
 error:
@@ -307,6 +312,8 @@ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
 	void *hdr;
 	int rc;
 
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	pnetelem = smc_pnet_find_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 	if (!pnetelem)
@@ -359,6 +366,8 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
 
 static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
 {
+	if (!info->attrs[SMC_PNETID_NAME])
+		return -EINVAL;
 	return smc_pnet_remove_by_pnetid(
 				(char *)nla_data(info->attrs[SMC_PNETID_NAME]));
 }
-- 
2.14.3


From 43f787a750e9d842afbcf558653659876b38fadb Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 17 May 2018 13:13:29 -0400
Subject: [PATCH 07/32] net: test tailroom before appending to linear skb

[ Upstream commit 113f99c3358564a0647d444c2ae34e8b1abfd5b9 ]

Device features may change during transmission. In particular with
corking, a device may toggle scatter-gather in between allocating
and writing to an skb.

Do not unconditionally assume that !NETIF_F_SG at write time implies
that the same held at alloc time and thus the skb has sufficient
tailroom.

This issue predates git history.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 3 ++-
 net/ipv6/ip6_output.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 66340ab750e6..e7daec7c7421 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1040,7 +1040,8 @@ static int __ip_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4065ae0c32a0..072333760a52 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1489,7 +1489,8 @@ static int __ip6_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
-- 
2.14.3


From a8ad7d0ff1cc94d8bd43c5303ea56f7e6ebe377a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 11 May 2018 13:24:25 -0400
Subject: [PATCH 08/32] packet: in packet_snd start writing at link layer
 allocation

[ Upstream commit b84bbaf7a6c8cca24f8acf25a2c8e46913a947ba ]

Packet sockets allow construction of packets shorter than
dev->hard_header_len to accommodate protocols with variable length
link layer headers. These packets are padded to dev->hard_header_len,
because some device drivers interpret that as a minimum packet size.

packet_snd reserves dev->hard_header_len bytes on allocation.
SOCK_DGRAM sockets call skb_push in dev_hard_header() to ensure that
link layer headers are stored in the reserved range. SOCK_RAW sockets
do the same in tpacket_snd, but not in packet_snd.

Syzbot was able to send a zero byte packet to a device with massive
116B link layer header, causing padding to cross over into skb_shinfo.
Fix this by writing from the start of the llheader reserved range also
in the case of packet_snd/SOCK_RAW.

Update skb_set_network_header to the new offset. This also corrects
it for SOCK_DGRAM, where it incorrectly double counted reserve due to
the skb_push in dev_hard_header.

Fixes: 9ed988cd5915 ("packet: validate variable length ll headers")
Reported-by: syzbot+71d74a5406d02057d559@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3b43b1fcd618..c6a2dd890de3 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2903,13 +2903,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	if (skb == NULL)
 		goto out_unlock;
 
-	skb_set_network_header(skb, reserve);
+	skb_reset_network_header(skb);
 
 	err = -EINVAL;
 	if (sock->type == SOCK_DGRAM) {
 		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
 		if (unlikely(offset < 0))
 			goto out_free;
+	} else if (reserve) {
+		skb_push(skb, reserve);
 	}
 
 	/* Returns -EFAULT on error */
-- 
2.14.3


From 399ac6ef531b1b6095f8bfa33dd0d7631c631370 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 May 2018 04:47:55 -0700
Subject: [PATCH 09/32] sock_diag: fix use-after-free read in __sk_free

[ Upstream commit 9709020c86f6bf8439ca3effc58cfca49a5de192 ]

We must not call sock_diag_has_destroy_listeners(sk) on a socket
that has no reference on net structure.

BUG: KASAN: use-after-free in sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
BUG: KASAN: use-after-free in __sk_free+0x329/0x340 net/core/sock.c:1609
Read of size 8 at addr ffff88018a02e3a0 by task swapper/1/0

CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.17.0-rc5+ #54
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
 __sk_free+0x329/0x340 net/core/sock.c:1609
 sk_free+0x42/0x50 net/core/sock.c:1623
 sock_put include/net/sock.h:1664 [inline]
 reqsk_free include/net/request_sock.h:116 [inline]
 reqsk_put include/net/request_sock.h:124 [inline]
 inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:672 [inline]
 reqsk_timer_handler+0xe27/0x10e0 net/ipv4/inet_connection_sock.c:739
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
 </IRQ>
RIP: 0010:native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:54
RSP: 0018:ffff8801d9ae7c38 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
RAX: dffffc0000000000 RBX: 1ffff1003b35cf8a RCX: 0000000000000000
RDX: 1ffffffff11a30d0 RSI: 0000000000000001 RDI: ffffffff88d18680
RBP: ffff8801d9ae7c38 R08: ffffed003b5e46c3 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffff8801d9ae7cf0 R14: ffffffff897bef20 R15: 0000000000000000
 arch_safe_halt arch/x86/include/asm/paravirt.h:94 [inline]
 default_idle+0xc2/0x440 arch/x86/kernel/process.c:354
 arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:345
 default_idle_call+0x6d/0x90 kernel/sched/idle.c:93
 cpuidle_idle_call kernel/sched/idle.c:153 [inline]
 do_idle+0x395/0x560 kernel/sched/idle.c:262
 cpu_startup_entry+0x104/0x120 kernel/sched/idle.c:368
 start_secondary+0x426/0x5b0 arch/x86/kernel/smpboot.c:269
 secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242

Allocated by task 4557:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 kmem_cache_zalloc include/linux/slab.h:691 [inline]
 net_alloc net/core/net_namespace.c:383 [inline]
 copy_net_ns+0x159/0x4c0 net/core/net_namespace.c:423
 create_new_namespaces+0x69d/0x8f0 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xc3/0x1f0 kernel/nsproxy.c:206
 ksys_unshare+0x708/0xf90 kernel/fork.c:2408
 __do_sys_unshare kernel/fork.c:2476 [inline]
 __se_sys_unshare kernel/fork.c:2474 [inline]
 __x64_sys_unshare+0x31/0x40 kernel/fork.c:2474
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 69:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 net_free net/core/net_namespace.c:399 [inline]
 net_drop_ns.part.14+0x11a/0x130 net/core/net_namespace.c:406
 net_drop_ns net/core/net_namespace.c:405 [inline]
 cleanup_net+0x6a1/0xb20 net/core/net_namespace.c:541
 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
 kthread+0x345/0x410 kernel/kthread.c:240
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412

The buggy address belongs to the object at ffff88018a02c140
 which belongs to the cache net_namespace of size 8832
The buggy address is located 8800 bytes inside of
 8832-byte region [ffff88018a02c140, ffff88018a02e3c0)
The buggy address belongs to the page:
page:ffffea0006280b00 count:1 mapcount:0 mapping:ffff88018a02c140 index:0x0 compound_mapcount: 0
flags: 0x2fffc0000008100(slab|head)
raw: 02fffc0000008100 ffff88018a02c140 0000000000000000 0000000100000001
raw: ffffea00062a1320 ffffea0006268020 ffff8801d9bdde40 0000000000000000
page dumped because: kasan: bad access detected

Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Craig Gallek <kraig@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 85b0b64e7f9d..81c2df84f953 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1603,7 +1603,7 @@ static void __sk_free(struct sock *sk)
 	if (likely(sk->sk_net_refcnt))
 		sock_inuse_add(sock_net(sk), -1);
 
-	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
 		sock_diag_broadcast_destroy(sk);
 	else
 		sk_destruct(sk);
-- 
2.14.3


From 0c8b0b08f9111b3456706122944d054d8a9cd823 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 May 2018 21:14:26 -0700
Subject: [PATCH 10/32] tcp: purge write queue in tcp_connect_init()

[ Upstream commit 7f582b248d0a86bae5788c548d7bb5bca6f7691a ]

syzkaller found a reliable way to crash the host, hitting a BUG()
in __tcp_retransmit_skb()

Malicous MSG_FASTOPEN is the root cause. We need to purge write queue
in tcp_connect_init() at the point we init snd_una/write_seq.

This patch also replaces the BUG() by a less intrusive WARN_ON_ONCE()

kernel BUG at net/ipv4/tcp_output.c:2837!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 5276 Comm: syz-executor0 Not tainted 4.17.0-rc3+ #51
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__tcp_retransmit_skb+0x2992/0x2eb0 net/ipv4/tcp_output.c:2837
RSP: 0000:ffff8801dae06ff8 EFLAGS: 00010206
RAX: ffff8801b9fe61c0 RBX: 00000000ffc18a16 RCX: ffffffff864e1a49
RDX: 0000000000000100 RSI: ffffffff864e2e12 RDI: 0000000000000005
RBP: ffff8801dae073a0 R08: ffff8801b9fe61c0 R09: ffffed0039c40dd2
R10: ffffed0039c40dd2 R11: ffff8801ce206e93 R12: 00000000421eeaad
R13: ffff8801ce206d4e R14: ffff8801ce206cc0 R15: ffff8801cd4f4a80
FS:  0000000000000000(0000) GS:ffff8801dae00000(0063) knlGS:00000000096bc900
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 0000000020000000 CR3: 00000001c47b6000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 tcp_retransmit_skb+0x2e/0x250 net/ipv4/tcp_output.c:2923
 tcp_retransmit_timer+0xc50/0x3060 net/ipv4/tcp_timer.c:488
 tcp_write_timer_handler+0x339/0x960 net/ipv4/tcp_timer.c:573
 tcp_write_timer+0x111/0x1d0 net/ipv4/tcp_timer.c:593
 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
 expire_timers kernel/time/timer.c:1363 [inline]
 __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
 invoke_softirq kernel/softirq.c:365 [inline]
 irq_exit+0x1d1/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:525 [inline]
 smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863

Fixes: cf60af03ca4e ("net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6818042cd8a9..3a0211692c28 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2860,8 +2860,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		return -EBUSY;
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
-		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			BUG();
+		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 			return -ENOMEM;
 	}
@@ -3369,6 +3371,7 @@ static void tcp_connect_init(struct sock *sk)
 	sock_reset_flag(sk, SOCK_DONE);
 	tp->snd_wnd = 0;
 	tcp_init_wl(tp, 0);
+	tcp_write_queue_purge(sk);
 	tp->snd_una = tp->write_seq;
 	tp->snd_sml = tp->write_seq;
 	tp->snd_up = tp->write_seq;
-- 
2.14.3


From f6da152de3354e52ba5b186cbe548b1093d506bf Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 11 May 2018 10:49:25 +0800
Subject: [PATCH 11/32] tun: fix use after free for ptr_ring

[ Upstream commit b196d88aba8ac72b775137854121097f4c4c6862 ]

We used to initialize ptr_ring during TUNSETIFF, this is because its
size depends on the tx_queue_len of netdevice. And we try to clean it
up when socket were detached from netdevice. A race were spotted when
trying to do uninit during a read which will lead a use after free for
pointer ring. Solving this by always initialize a zero size ptr_ring
in open() and do resizing during TUNSETIFF, and then we can safely do
cleanup during close(). With this, there's no need for the workaround
that was introduced by commit 4df0bfc79904 ("tun: fix a memory leak
for tfile->tx_array").

Reported-by: syzbot+e8b902c3c3fadf0a9dba@syzkaller.appspotmail.com
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Fixes: 1576d9860599 ("tun: switch to use skb array for tx")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6c7bdd0c361a..f2de59d6d586 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -680,15 +680,6 @@ static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
-static void tun_cleanup_tx_ring(struct tun_file *tfile)
-{
-	if (tfile->tx_ring.queue) {
-		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
-		xdp_rxq_info_unreg(&tfile->xdp_rxq);
-		memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-	}
-}
-
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -735,7 +726,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 			    tun->dev->reg_state == NETREG_REGISTERED)
 				unregister_netdevice(tun->dev);
 		}
-		tun_cleanup_tx_ring(tfile);
+		if (tun)
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
 	}
 }
@@ -775,14 +767,14 @@ static void tun_detach_all(struct net_device *dev)
 		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 		tun_enable_queue(tfile);
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	BUG_ON(tun->numdisabled != 0);
 
@@ -826,7 +818,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	}
 
 	if (!tfile->detached &&
-	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
+	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
+			    GFP_KERNEL, tun_ptr_free)) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -3131,6 +3124,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 					    &tun_proto, 0);
 	if (!tfile)
 		return -ENOMEM;
+	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
+		sk_free(&tfile->sk);
+		return -ENOMEM;
+	}
+
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
@@ -3151,8 +3149,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
-	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-
 	return 0;
 }
 
@@ -3161,6 +3157,7 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
+	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }
-- 
2.14.3


From b33c114dd536f531a917fb0b4239bd3f4c2808d9 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 May 2018 20:39:33 +0800
Subject: [PATCH 12/32] tuntap: fix use after free during release

[ Upstream commit 7063efd33bb15abc0160347f89eb5aba6b7d000e ]

After commit b196d88aba8a ("tun: fix use after free for ptr_ring") we
need clean up tx ring during release(). But unfortunately, it tries to
do the cleanup blindly after socket were destroyed which will lead
another use-after-free. Fix this by doing the cleanup before dropping
the last reference of the socket in __tun_detach().

Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Fixes: b196d88aba8a ("tun: fix use after free for ptr_ring")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f2de59d6d586..ffae19714ffd 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -728,6 +728,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 		sock_put(&tfile->sk);
 	}
 }
@@ -3157,7 +3158,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
-	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }
-- 
2.14.3


From b8ecce86486a28682de39eed06f52babf31c3539 Mon Sep 17 00:00:00 2001
From: Kumar Sanghvi <kumaras@chelsio.com>
Date: Mon, 14 May 2018 16:27:34 +0530
Subject: [PATCH 13/32] cxgb4: Correct ntuple mask validation for hash filters

[ Upstream commit 849a742c59a3d597473c0232f9c2506c69eeef14 ]

Earlier code of doing bitwise AND with field width bits was wrong.
Instead, simplify code to calculate ntuple_mask based on supplied
fields and then compare with mask configured in hw - which is the
correct and simpler way to validate ntuple mask.

Fixes: 3eb8b62d5a26 ("cxgb4: add support to create hash-filters via tc-flower offload")
Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 88 ++++++++---------------
 1 file changed, 30 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 3177b0c9bd2d..829dc8c5ddff 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -836,7 +836,7 @@ bool is_filter_exact_match(struct adapter *adap,
 {
 	struct tp_params *tp = &adap->params.tp;
 	u64 hash_filter_mask = tp->hash_filter_mask;
-	u32 mask;
+	u64 ntuple_mask = 0;
 
 	if (!is_hashfilter(adap))
 		return false;
@@ -865,73 +865,45 @@ bool is_filter_exact_match(struct adapter *adap,
 	if (!fs->val.fport || fs->mask.fport != 0xffff)
 		return false;
 
-	if (tp->fcoe_shift >= 0) {
-		mask = (hash_filter_mask >> tp->fcoe_shift) & FT_FCOE_W;
-		if (mask && !fs->mask.fcoe)
-			return false;
-	}
+	/* calculate tuple mask and compare with mask configured in hw */
+	if (tp->fcoe_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.fcoe << tp->fcoe_shift;
 
-	if (tp->port_shift >= 0) {
-		mask = (hash_filter_mask >> tp->port_shift) & FT_PORT_W;
-		if (mask && !fs->mask.iport)
-			return false;
-	}
+	if (tp->port_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.iport << tp->port_shift;
 
 	if (tp->vnic_shift >= 0) {
-		mask = (hash_filter_mask >> tp->vnic_shift) & FT_VNIC_ID_W;
-
-		if ((adap->params.tp.ingress_config & VNIC_F)) {
-			if (mask && !fs->mask.pfvf_vld)
-				return false;
-		} else {
-			if (mask && !fs->mask.ovlan_vld)
-				return false;
-		}
+		if ((adap->params.tp.ingress_config & VNIC_F))
+			ntuple_mask |= (u64)fs->mask.pfvf_vld << tp->vnic_shift;
+		else
+			ntuple_mask |= (u64)fs->mask.ovlan_vld <<
+				tp->vnic_shift;
 	}
 
-	if (tp->vlan_shift >= 0) {
-		mask = (hash_filter_mask >> tp->vlan_shift) & FT_VLAN_W;
-		if (mask && !fs->mask.ivlan)
-			return false;
-	}
+	if (tp->vlan_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.ivlan << tp->vlan_shift;
 
-	if (tp->tos_shift >= 0) {
-		mask = (hash_filter_mask >> tp->tos_shift) & FT_TOS_W;
-		if (mask && !fs->mask.tos)
-			return false;
-	}
+	if (tp->tos_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.tos << tp->tos_shift;
 
-	if (tp->protocol_shift >= 0) {
-		mask = (hash_filter_mask >> tp->protocol_shift) & FT_PROTOCOL_W;
-		if (mask && !fs->mask.proto)
-			return false;
-	}
+	if (tp->protocol_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.proto << tp->protocol_shift;
 
-	if (tp->ethertype_shift >= 0) {
-		mask = (hash_filter_mask >> tp->ethertype_shift) &
-			FT_ETHERTYPE_W;
-		if (mask && !fs->mask.ethtype)
-			return false;
-	}
+	if (tp->ethertype_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.ethtype << tp->ethertype_shift;
 
-	if (tp->macmatch_shift >= 0) {
-		mask = (hash_filter_mask >> tp->macmatch_shift) & FT_MACMATCH_W;
-		if (mask && !fs->mask.macidx)
-			return false;
-	}
+	if (tp->macmatch_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.macidx << tp->macmatch_shift;
+
+	if (tp->matchtype_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.matchtype << tp->matchtype_shift;
+
+	if (tp->frag_shift >= 0)
+		ntuple_mask |= (u64)fs->mask.frag << tp->frag_shift;
+
+	if (ntuple_mask != hash_filter_mask)
+		return false;
 
-	if (tp->matchtype_shift >= 0) {
-		mask = (hash_filter_mask >> tp->matchtype_shift) &
-			FT_MPSHITTYPE_W;
-		if (mask && !fs->mask.matchtype)
-			return false;
-	}
-	if (tp->frag_shift >= 0) {
-		mask = (hash_filter_mask >> tp->frag_shift) &
-			FT_FRAGMENTATION_W;
-		if (mask && !fs->mask.frag)
-			return false;
-	}
 	return true;
 }
 
-- 
2.14.3


From 9892c38247f56ab826aea9c79f9481f4971994a5 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:23 -0700
Subject: [PATCH 14/32] net: dsa: bcm_sf2: Fix RX_CLS_LOC_ANY overwrite for
 last rule

[ Upstream commit 43a5e00f38fe8933a1c716bfe5b30e97f749d94b ]

When we let the kernel pick up a rule location with RX_CLS_LOC_ANY, we
would be able to overwrite the last rules because of a number of issues.

The IPv4 code path would not be checking that rule_index is within
bounds, and it would also only be allowed to pick up rules from range
0..126 instead of the full 0..127 range. This would lead us to allow
overwriting the last rule when we let the kernel pick-up the location.

Fixes: 3306145866b6 ("net: dsa: bcm_sf2: Move IPv4 CFP processing to specific functions")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 23b45da784cb..9e04786e3139 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -354,10 +354,13 @@ static int bcm_sf2_cfp_ipv4_rule_set(struct bcm_sf2_priv *priv, int port,
 	/* Locate the first rule available */
 	if (fs->location == RX_CLS_LOC_ANY)
 		rule_index = find_first_zero_bit(priv->cfp.used,
-						 bcm_sf2_cfp_rule_size(priv));
+						 priv->num_cfp_rules);
 	else
 		rule_index = fs->location;
 
+	if (rule_index > bcm_sf2_cfp_rule_size(priv))
+		return -ENOSPC;
+
 	layout = &udf_tcpip4_layout;
 	/* We only use one UDF slice for now */
 	slice_num = bcm_sf2_get_slice_number(layout, 0);
-- 
2.14.3


From 58a79e33ad15dca6ffd8582cdffdf99d6e7a8a51 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 May 2018 16:55:39 -0700
Subject: [PATCH 15/32] net: dsa: Do not register devlink for unused ports

[ Upstream commit 5447d78623da2eded06d4cd9469d1a71eba43bc4 ]

Even if commit 1d27732f411d ("net: dsa: setup and teardown ports") indicated
that registering a devlink instance for unused ports is not a problem, and this
is true, this can be confusing nonetheless, so let's not do it.

Fixes: 1d27732f411d ("net: dsa: setup and teardown ports")
Reported-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index adf50fbc4c13..47725250b4ca 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -258,11 +258,13 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
 static int dsa_port_setup(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
-	int err;
+	int err = 0;
 
 	memset(&dp->devlink_port, 0, sizeof(dp->devlink_port));
 
-	err = devlink_port_register(ds->devlink, &dp->devlink_port, dp->index);
+	if (dp->type != DSA_PORT_TYPE_UNUSED)
+		err = devlink_port_register(ds->devlink, &dp->devlink_port,
+					    dp->index);
 	if (err)
 		return err;
 
@@ -293,7 +295,8 @@ static int dsa_port_setup(struct dsa_port *dp)
 
 static void dsa_port_teardown(struct dsa_port *dp)
 {
-	devlink_port_unregister(&dp->devlink_port);
+	if (dp->type != DSA_PORT_TYPE_UNUSED)
+		devlink_port_unregister(&dp->devlink_port);
 
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
-- 
2.14.3


From e7a866e399fd1701d0347e125a38f62269a41b3e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:24 -0700
Subject: [PATCH 16/32] net: dsa: bcm_sf2: Fix IPv6 rules and chain ID

[ Upstream commit 6c05561c541843b2bec2189f680bed6d20afc25b ]

We had several issues that would make the programming of IPv6 rules both
inconsistent and error prone:

- the chain ID that we would be asking the hardware to put in the
  packet's Broadcom tag would be off by one, it would return one of the
  two indexes, but not the one user-space specified

- when an user specified a particular location to insert a CFP rule at,
  we would not be returning the same index, which would be confusing if
  nothing else

- finally, like IPv4, it would be possible to overflow the last entry by
  re-programming it

Fix this by swapping the usage of rule_index[0] and rule_index[1] where
relevant in order to return a consistent and correct user-space
experience.

Fixes: ba0696c22e7c ("net: dsa: bcm_sf2: Add support for IPv6 CFP rules")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 9e04786e3139..6fd0f8a12cc2 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -565,19 +565,21 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	 * first half because the HW search is by incrementing addresses.
 	 */
 	if (fs->location == RX_CLS_LOC_ANY)
-		rule_index[0] = find_first_zero_bit(priv->cfp.used,
-						    bcm_sf2_cfp_rule_size(priv));
+		rule_index[1] = find_first_zero_bit(priv->cfp.used,
+						    priv->num_cfp_rules);
 	else
-		rule_index[0] = fs->location;
+		rule_index[1] = fs->location;
+	if (rule_index[1] > bcm_sf2_cfp_rule_size(priv))
+		return -ENOSPC;
 
 	/* Flag it as used (cleared on error path) such that we can immediately
 	 * obtain a second one to chain from.
 	 */
-	set_bit(rule_index[0], priv->cfp.used);
+	set_bit(rule_index[1], priv->cfp.used);
 
-	rule_index[1] = find_first_zero_bit(priv->cfp.used,
-					    bcm_sf2_cfp_rule_size(priv));
-	if (rule_index[1] > bcm_sf2_cfp_rule_size(priv)) {
+	rule_index[0] = find_first_zero_bit(priv->cfp.used,
+					    priv->num_cfp_rules);
+	if (rule_index[0] > bcm_sf2_cfp_rule_size(priv)) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
@@ -715,14 +717,14 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	/* Flag the second half rule as being used now, return it as the
 	 * location, and flag it as unique while dumping rules
 	 */
-	set_bit(rule_index[1], priv->cfp.used);
+	set_bit(rule_index[0], priv->cfp.used);
 	set_bit(rule_index[1], priv->cfp.unique);
 	fs->location = rule_index[1];
 
 	return ret;
 
 out_err:
-	clear_bit(rule_index[0], priv->cfp.used);
+	clear_bit(rule_index[1], priv->cfp.used);
 	return ret;
 }
 
-- 
2.14.3


From 829bd6f2e1e485e714130ddbc4ca7a7c9c52e75d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:01:25 -0700
Subject: [PATCH 17/32] net: dsa: bcm_sf2: Fix IPv6 rule half deletion

[ Upstream commit 1942adf64214df370350aa46954ba27654456f68 ]

It was possible to delete only one half of an IPv6, which would leave
the second half still programmed and possibly in use. Instead of
checking for the unused bitmap, we need to check the unique bitmap, and
refuse any deletion that does not match that criteria. We also need to
move that check from bcm_sf2_cfp_rule_del_one() into its caller:
bcm_sf2_cfp_rule_del() otherwise we would not be able to delete second
halves anymore that would not pass the first test.

Fixes: ba0696c22e7c ("net: dsa: bcm_sf2: Add support for IPv6 CFP rules")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 6fd0f8a12cc2..b89acaee12d4 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -790,10 +790,6 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 	int ret;
 	u32 reg;
 
-	/* Refuse deletion of unused rules, and the default reserved rule */
-	if (!test_bit(loc, priv->cfp.used) || loc == 0)
-		return -EINVAL;
-
 	/* Indicate which rule we want to read */
 	bcm_sf2_cfp_rule_addr_set(priv, loc);
 
@@ -831,6 +827,13 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	u32 next_loc = 0;
 	int ret;
 
+	/* Refuse deleting unused rules, and those that are not unique since
+	 * that could leave IPv6 rules with one of the chained rule in the
+	 * table.
+	 */
+	if (!test_bit(loc, priv->cfp.unique) || loc == 0)
+		return -EINVAL;
+
 	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
 	if (ret)
 		return ret;
-- 
2.14.3


From 68283fe870fae34347209b566f06626c02ce5480 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 12 May 2018 12:16:50 +0200
Subject: [PATCH 18/32] 3c59x: convert to generic DMA API

[ Upstream commit 55c82617c3e82210b7471e9334e8fc5df6a9961f ]

This driver supports EISA devices in addition to PCI devices, and relied
on the legacy behavior of the pci_dma* shims to pass on a NULL pointer
to the DMA API, and the DMA API being able to handle that.  When the
NULL forwarding broke the EISA support got broken.  Fix this by converting
to the DMA API instead of the legacy PCI shims.

Fixes: 4167b2ad ("PCI: Remove NULL device handling from PCI DMA API")
Reported-by: tedheadster <tedheadster@gmail.com>
Tested-by: tedheadster <tedheadster@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 104 +++++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 36c8950dbd2d..176861bd2252 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -1212,9 +1212,9 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
 	vp->mii.reg_num_mask = 0x1f;
 
 	/* Makes sure rings are at least 16 byte aligned. */
-	vp->rx_ring = pci_alloc_consistent(pdev, sizeof(struct boom_rx_desc) * RX_RING_SIZE
+	vp->rx_ring = dma_alloc_coherent(gendev, sizeof(struct boom_rx_desc) * RX_RING_SIZE
 					   + sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-					   &vp->rx_ring_dma);
+					   &vp->rx_ring_dma, GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!vp->rx_ring)
 		goto free_device;
@@ -1476,11 +1476,10 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
 		return 0;
 
 free_ring:
-	pci_free_consistent(pdev,
-						sizeof(struct boom_rx_desc) * RX_RING_SIZE
-							+ sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-						vp->rx_ring,
-						vp->rx_ring_dma);
+	dma_free_coherent(&pdev->dev,
+		sizeof(struct boom_rx_desc) * RX_RING_SIZE +
+		sizeof(struct boom_tx_desc) * TX_RING_SIZE,
+		vp->rx_ring, vp->rx_ring_dma);
 free_device:
 	free_netdev(dev);
 	pr_err(PFX "vortex_probe1 fails.  Returns %d\n", retval);
@@ -1751,9 +1750,9 @@ vortex_open(struct net_device *dev)
 				break;			/* Bad news!  */
 
 			skb_reserve(skb, NET_IP_ALIGN);	/* Align IP on 16 byte boundaries */
-			dma = pci_map_single(VORTEX_PCI(vp), skb->data,
-					     PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
-			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma))
+			dma = dma_map_single(vp->gendev, skb->data,
+					     PKT_BUF_SZ, DMA_FROM_DEVICE);
+			if (dma_mapping_error(vp->gendev, dma))
 				break;
 			vp->rx_ring[i].addr = cpu_to_le32(dma);
 		}
@@ -2067,9 +2066,9 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (vp->bus_master) {
 		/* Set the bus-master controller to transfer the packet. */
 		int len = (skb->len + 3) & ~3;
-		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
-						PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, vp->tx_skb_dma)) {
+		vp->tx_skb_dma = dma_map_single(vp->gendev, skb->data, len,
+						DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, vp->tx_skb_dma)) {
 			dev_kfree_skb_any(skb);
 			dev->stats.tx_dropped++;
 			return NETDEV_TX_OK;
@@ -2168,9 +2167,9 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded | AddTCPChksum | AddUDPChksum);
 
 	if (!skb_shinfo(skb)->nr_frags) {
-		dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data, skb->len,
-					  PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+		dma_addr = dma_map_single(vp->gendev, skb->data, skb->len,
+					  DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, dma_addr))
 			goto out_dma_err;
 
 		vp->tx_ring[entry].frag[0].addr = cpu_to_le32(dma_addr);
@@ -2178,9 +2177,9 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	} else {
 		int i;
 
-		dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data,
-					  skb_headlen(skb), PCI_DMA_TODEVICE);
-		if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+		dma_addr = dma_map_single(vp->gendev, skb->data,
+					  skb_headlen(skb), DMA_TO_DEVICE);
+		if (dma_mapping_error(vp->gendev, dma_addr))
 			goto out_dma_err;
 
 		vp->tx_ring[entry].frag[0].addr = cpu_to_le32(dma_addr);
@@ -2189,21 +2188,21 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
-			dma_addr = skb_frag_dma_map(&VORTEX_PCI(vp)->dev, frag,
+			dma_addr = skb_frag_dma_map(vp->gendev, frag,
 						    0,
 						    frag->size,
 						    DMA_TO_DEVICE);
-			if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr)) {
+			if (dma_mapping_error(vp->gendev, dma_addr)) {
 				for(i = i-1; i >= 0; i--)
-					dma_unmap_page(&VORTEX_PCI(vp)->dev,
+					dma_unmap_page(vp->gendev,
 						       le32_to_cpu(vp->tx_ring[entry].frag[i+1].addr),
 						       le32_to_cpu(vp->tx_ring[entry].frag[i+1].length),
 						       DMA_TO_DEVICE);
 
-				pci_unmap_single(VORTEX_PCI(vp),
+				dma_unmap_single(vp->gendev,
 						 le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
 						 le32_to_cpu(vp->tx_ring[entry].frag[0].length),
-						 PCI_DMA_TODEVICE);
+						 DMA_TO_DEVICE);
 
 				goto out_dma_err;
 			}
@@ -2218,8 +2217,8 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 #else
-	dma_addr = pci_map_single(VORTEX_PCI(vp), skb->data, skb->len, PCI_DMA_TODEVICE);
-	if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma_addr))
+	dma_addr = dma_map_single(vp->gendev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(vp->gendev, dma_addr))
 		goto out_dma_err;
 	vp->tx_ring[entry].addr = cpu_to_le32(dma_addr);
 	vp->tx_ring[entry].length = cpu_to_le32(skb->len | LAST_FRAG);
@@ -2254,7 +2253,7 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 out:
 	return NETDEV_TX_OK;
 out_dma_err:
-	dev_err(&VORTEX_PCI(vp)->dev, "Error mapping dma buffer\n");
+	dev_err(vp->gendev, "Error mapping dma buffer\n");
 	goto out;
 }
 
@@ -2322,7 +2321,7 @@ vortex_interrupt(int irq, void *dev_id)
 		if (status & DMADone) {
 			if (ioread16(ioaddr + Wn7_MasterStatus) & 0x1000) {
 				iowrite16(0x1000, ioaddr + Wn7_MasterStatus); /* Ack the event. */
-				pci_unmap_single(VORTEX_PCI(vp), vp->tx_skb_dma, (vp->tx_skb->len + 3) & ~3, PCI_DMA_TODEVICE);
+				dma_unmap_single(vp->gendev, vp->tx_skb_dma, (vp->tx_skb->len + 3) & ~3, DMA_TO_DEVICE);
 				pkts_compl++;
 				bytes_compl += vp->tx_skb->len;
 				dev_kfree_skb_irq(vp->tx_skb); /* Release the transferred buffer */
@@ -2459,19 +2458,19 @@ boomerang_interrupt(int irq, void *dev_id)
 					struct sk_buff *skb = vp->tx_skbuff[entry];
 #if DO_ZEROCOPY
 					int i;
-					pci_unmap_single(VORTEX_PCI(vp),
+					dma_unmap_single(vp->gendev,
 							le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
 							le32_to_cpu(vp->tx_ring[entry].frag[0].length)&0xFFF,
-							PCI_DMA_TODEVICE);
+							DMA_TO_DEVICE);
 
 					for (i=1; i<=skb_shinfo(skb)->nr_frags; i++)
-							pci_unmap_page(VORTEX_PCI(vp),
+							dma_unmap_page(vp->gendev,
 											 le32_to_cpu(vp->tx_ring[entry].frag[i].addr),
 											 le32_to_cpu(vp->tx_ring[entry].frag[i].length)&0xFFF,
-											 PCI_DMA_TODEVICE);
+											 DMA_TO_DEVICE);
 #else
-					pci_unmap_single(VORTEX_PCI(vp),
-						le32_to_cpu(vp->tx_ring[entry].addr), skb->len, PCI_DMA_TODEVICE);
+					dma_unmap_single(vp->gendev,
+						le32_to_cpu(vp->tx_ring[entry].addr), skb->len, DMA_TO_DEVICE);
 #endif
 					pkts_compl++;
 					bytes_compl += skb->len;
@@ -2561,14 +2560,14 @@ static int vortex_rx(struct net_device *dev)
 				/* 'skb_put()' points to the start of sk_buff data area. */
 				if (vp->bus_master &&
 					! (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)) {
-					dma_addr_t dma = pci_map_single(VORTEX_PCI(vp), skb_put(skb, pkt_len),
-									   pkt_len, PCI_DMA_FROMDEVICE);
+					dma_addr_t dma = dma_map_single(vp->gendev, skb_put(skb, pkt_len),
+									   pkt_len, DMA_FROM_DEVICE);
 					iowrite32(dma, ioaddr + Wn7_MasterAddr);
 					iowrite16((skb->len + 3) & ~3, ioaddr + Wn7_MasterLen);
 					iowrite16(StartDMAUp, ioaddr + EL3_CMD);
 					while (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)
 						;
-					pci_unmap_single(VORTEX_PCI(vp), dma, pkt_len, PCI_DMA_FROMDEVICE);
+					dma_unmap_single(vp->gendev, dma, pkt_len, DMA_FROM_DEVICE);
 				} else {
 					ioread32_rep(ioaddr + RX_FIFO,
 					             skb_put(skb, pkt_len),
@@ -2635,11 +2634,11 @@ boomerang_rx(struct net_device *dev)
 			if (pkt_len < rx_copybreak &&
 			    (skb = netdev_alloc_skb(dev, pkt_len + 2)) != NULL) {
 				skb_reserve(skb, 2);	/* Align IP on 16 byte boundaries */
-				pci_dma_sync_single_for_cpu(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_sync_single_for_cpu(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				/* 'skb_put()' points to the start of sk_buff data area. */
 				skb_put_data(skb, vp->rx_skbuff[entry]->data,
 					     pkt_len);
-				pci_dma_sync_single_for_device(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_sync_single_for_device(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				vp->rx_copy++;
 			} else {
 				/* Pre-allocate the replacement skb.  If it or its
@@ -2651,9 +2650,9 @@ boomerang_rx(struct net_device *dev)
 					dev->stats.rx_dropped++;
 					goto clear_complete;
 				}
-				newdma = pci_map_single(VORTEX_PCI(vp), newskb->data,
-							PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
-				if (dma_mapping_error(&VORTEX_PCI(vp)->dev, newdma)) {
+				newdma = dma_map_single(vp->gendev, newskb->data,
+							PKT_BUF_SZ, DMA_FROM_DEVICE);
+				if (dma_mapping_error(vp->gendev, newdma)) {
 					dev->stats.rx_dropped++;
 					consume_skb(newskb);
 					goto clear_complete;
@@ -2664,7 +2663,7 @@ boomerang_rx(struct net_device *dev)
 				vp->rx_skbuff[entry] = newskb;
 				vp->rx_ring[entry].addr = cpu_to_le32(newdma);
 				skb_put(skb, pkt_len);
-				pci_unmap_single(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_unmap_single(vp->gendev, dma, PKT_BUF_SZ, DMA_FROM_DEVICE);
 				vp->rx_nocopy++;
 			}
 			skb->protocol = eth_type_trans(skb, dev);
@@ -2761,8 +2760,8 @@ vortex_close(struct net_device *dev)
 	if (vp->full_bus_master_rx) { /* Free Boomerang bus master Rx buffers. */
 		for (i = 0; i < RX_RING_SIZE; i++)
 			if (vp->rx_skbuff[i]) {
-				pci_unmap_single(	VORTEX_PCI(vp), le32_to_cpu(vp->rx_ring[i].addr),
-									PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+				dma_unmap_single(vp->gendev, le32_to_cpu(vp->rx_ring[i].addr),
+									PKT_BUF_SZ, DMA_FROM_DEVICE);
 				dev_kfree_skb(vp->rx_skbuff[i]);
 				vp->rx_skbuff[i] = NULL;
 			}
@@ -2775,12 +2774,12 @@ vortex_close(struct net_device *dev)
 				int k;
 
 				for (k=0; k<=skb_shinfo(skb)->nr_frags; k++)
-						pci_unmap_single(VORTEX_PCI(vp),
+						dma_unmap_single(vp->gendev,
 										 le32_to_cpu(vp->tx_ring[i].frag[k].addr),
 										 le32_to_cpu(vp->tx_ring[i].frag[k].length)&0xFFF,
-										 PCI_DMA_TODEVICE);
+										 DMA_TO_DEVICE);
 #else
-				pci_unmap_single(VORTEX_PCI(vp), le32_to_cpu(vp->tx_ring[i].addr), skb->len, PCI_DMA_TODEVICE);
+				dma_unmap_single(vp->gendev, le32_to_cpu(vp->tx_ring[i].addr), skb->len, DMA_TO_DEVICE);
 #endif
 				dev_kfree_skb(skb);
 				vp->tx_skbuff[i] = NULL;
@@ -3288,11 +3287,10 @@ static void vortex_remove_one(struct pci_dev *pdev)
 
 	pci_iounmap(pdev, vp->ioaddr);
 
-	pci_free_consistent(pdev,
-						sizeof(struct boom_rx_desc) * RX_RING_SIZE
-							+ sizeof(struct boom_tx_desc) * TX_RING_SIZE,
-						vp->rx_ring,
-						vp->rx_ring_dma);
+	dma_free_coherent(&pdev->dev,
+			sizeof(struct boom_rx_desc) * RX_RING_SIZE +
+			sizeof(struct boom_tx_desc) * TX_RING_SIZE,
+			vp->rx_ring, vp->rx_ring_dma);
 
 	pci_release_regions(pdev);
 
-- 
2.14.3


From a6368009e2780d144cd99ed694f1d0e44a3f1924 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Fri, 18 May 2018 19:13:37 +0530
Subject: [PATCH 19/32] cxgb4: fix offset in collecting TX rate limit info

[ Upstream commit d775f26b295a0a303f7a73d7da46e04296484fe7 ]

Correct the indirect register offsets in collecting TX rate limit info
in UP CIM logs.

Also, T5 doesn't support these indirect register offsets, so remove
them from collection logic.

Fixes: be6e36d916b1 ("cxgb4: collect TX rate limit info in UP CIM logs")
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h | 28 ++++++++---------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index b57acb8dc35b..dc25066c59a1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -419,15 +419,15 @@ static const u32 t6_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2920, 0x10, 0x10}, /* up_cim_2920_to_2a10 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2a14 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
+	{0x7b50, 0x7b54, 0x4900, 0x4, 0x4}, /* up_cim_4900_to_4c60 */
+	{0x7b50, 0x7b54, 0x4904, 0x4, 0x4}, /* up_cim_4904_to_4c64 */
+	{0x7b50, 0x7b54, 0x4908, 0x4, 0x4}, /* up_cim_4908_to_4c68 */
+	{0x7b50, 0x7b54, 0x4910, 0x4, 0x4}, /* up_cim_4910_to_4c70 */
+	{0x7b50, 0x7b54, 0x4914, 0x4, 0x4}, /* up_cim_4914_to_4c74 */
+	{0x7b50, 0x7b54, 0x4920, 0x10, 0x10}, /* up_cim_4920_to_4a10 */
+	{0x7b50, 0x7b54, 0x4924, 0x10, 0x10}, /* up_cim_4924_to_4a14 */
+	{0x7b50, 0x7b54, 0x4928, 0x10, 0x10}, /* up_cim_4928_to_4a18 */
+	{0x7b50, 0x7b54, 0x492c, 0x10, 0x10}, /* up_cim_492c_to_4a1c */
 };
 
 static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
@@ -444,16 +444,6 @@ static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2918, 0x4, 0x4}, /* up_cim_2918_to_3d54 */
-	{0x7b50, 0x7b54, 0x291c, 0x4, 0x4}, /* up_cim_291c_to_3d58 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2914 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
 };
 
 static const u32 t6_hma_ireg_array[][IREG_NUM_ELEM] = {
-- 
2.14.3


From 30babf8ce147c869aa25477e5ec7071a426a0088 Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:34 -0400
Subject: [PATCH 20/32] vmxnet3: set the DMA mask before the first DMA map
 operation

[ Upstream commit 61aeecea40afb2b89933e27cd4adb10fc2e75cfd ]

The DMA mask must be set before, not after, the first DMA map operation, or
the first DMA map operation could in theory fail on some systems.

Fixes: b0eb57cb97e78 ("VMXNET3: Add support for virtual IOMMU")
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++++++--------------------
 drivers/net/vmxnet3/vmxnet3_int.h |  8 ++++---
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 9ebe2a689966..ed58685c353f 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2688,7 +2688,7 @@ vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
 /* ==================== initialization and cleanup routines ============ */
 
 static int
-vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter)
 {
 	int err;
 	unsigned long mmio_start, mmio_len;
@@ -2700,30 +2700,12 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 		return err;
 	}
 
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
-		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_consistent_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = true;
-	} else {
-		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
-			dev_err(&pdev->dev,
-				"pci_set_dma_mask failed\n");
-			err = -EIO;
-			goto err_set_mask;
-		}
-		*dma64 = false;
-	}
-
 	err = pci_request_selected_regions(pdev, (1 << 2) - 1,
 					   vmxnet3_driver_name);
 	if (err) {
 		dev_err(&pdev->dev,
 			"Failed to request region for adapter: error %d\n", err);
-		goto err_set_mask;
+		goto err_enable_device;
 	}
 
 	pci_set_master(pdev);
@@ -2751,7 +2733,7 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
 	iounmap(adapter->hw_addr0);
 err_ioremap:
 	pci_release_selected_regions(pdev, (1 << 2) - 1);
-err_set_mask:
+err_enable_device:
 	pci_disable_device(pdev);
 	return err;
 }
@@ -3254,7 +3236,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 #endif
 	};
 	int err;
-	bool dma64 = false; /* stupid gcc */
+	bool dma64;
 	u32 ver;
 	struct net_device *netdev;
 	struct vmxnet3_adapter *adapter;
@@ -3300,6 +3282,24 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	adapter->rx_ring_size = VMXNET3_DEF_RX_RING_SIZE;
 	adapter->rx_ring2_size = VMXNET3_DEF_RX_RING2_SIZE;
 
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_consistent_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = true;
+	} else {
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+			dev_err(&pdev->dev,
+				"pci_set_dma_mask failed\n");
+			err = -EIO;
+			goto err_set_mask;
+		}
+		dma64 = false;
+	}
+
 	spin_lock_init(&adapter->cmd_lock);
 	adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter,
 					     sizeof(struct vmxnet3_adapter),
@@ -3307,7 +3307,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	if (dma_mapping_error(&adapter->pdev->dev, adapter->adapter_pa)) {
 		dev_err(&pdev->dev, "Failed to map dma\n");
 		err = -EFAULT;
-		goto err_dma_map;
+		goto err_set_mask;
 	}
 	adapter->shared = dma_alloc_coherent(
 				&adapter->pdev->dev,
@@ -3358,7 +3358,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	}
 #endif /* VMXNET3_RSS */
 
-	err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+	err = vmxnet3_alloc_pci_resources(adapter);
 	if (err < 0)
 		goto err_alloc_pci;
 
@@ -3504,7 +3504,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 err_alloc_shared:
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
-err_dma_map:
+err_set_mask:
 	free_netdev(netdev);
 	return err;
 }
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index a3326463b71f..b379bed5f51d 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,12 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.14.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.15.0-k"
 
-/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040e00
+/* Each byte of this 32-bit integer encodes a version number in
+ * VMXNET3_DRIVER_VERSION_STRING.
+ */
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040f00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
-- 
2.14.3


From 3725fac335c042941d7f0caf982ee990199992bb Mon Sep 17 00:00:00 2001
From: "hpreg@vmware.com" <hpreg@vmware.com>
Date: Mon, 14 May 2018 08:14:49 -0400
Subject: [PATCH 21/32] vmxnet3: use DMA memory barriers where required

[ Upstream commit f3002c1374fb2367c9d8dbb28852791ef90d2bac ]

The gen bits must be read first from (resp. written last to) DMA memory.
The proper way to enforce this on Linux is to call dma_rmb() (resp.
dma_wmb()).

Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 22 ++++++++++++++++++++++
 drivers/net/vmxnet3/vmxnet3_int.h |  4 ++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index ed58685c353f..27a9bb8c9611 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -369,6 +369,11 @@ vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq,
 
 	gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
 	while (VMXNET3_TCD_GET_GEN(&gdesc->tcd) == tq->comp_ring.gen) {
+		/* Prevent any &gdesc->tcd field from being (speculatively)
+		 * read before (&gdesc->tcd)->gen is read.
+		 */
+		dma_rmb();
+
 		completed += vmxnet3_unmap_pkt(VMXNET3_TCD_GET_TXIDX(
 					       &gdesc->tcd), tq, adapter->pdev,
 					       adapter);
@@ -1103,6 +1108,11 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 		gdesc->txd.tci = skb_vlan_tag_get(skb);
 	}
 
+	/* Ensure that the write to (&gdesc->txd)->gen will be observed after
+	 * all other writes to &gdesc->txd.
+	 */
+	dma_wmb();
+
 	/* finally flips the GEN bit of the SOP desc. */
 	gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^
 						  VMXNET3_TXD_GEN);
@@ -1298,6 +1308,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			 */
 			break;
 		}
+
+		/* Prevent any rcd field from being (speculatively) read before
+		 * rcd->gen is read.
+		 */
+		dma_rmb();
+
 		BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2 &&
 		       rcd->rqID != rq->dataRingQid);
 		idx = rcd->rxdIdx;
@@ -1528,6 +1544,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		ring->next2comp = idx;
 		num_to_alloc = vmxnet3_cmd_ring_desc_avail(ring);
 		ring = rq->rx_ring + ring_idx;
+
+		/* Ensure that the writes to rxd->gen bits will be observed
+		 * after all other writes to rxd objects.
+		 */
+		dma_wmb();
+
 		while (num_to_alloc) {
 			vmxnet3_getRxDesc(rxd, &ring->base[ring->next2fill].rxd,
 					  &rxCmdDesc);
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index b379bed5f51d..a2c554f8a61b 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,12 +69,12 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.15.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.16.0-k"
 
 /* Each byte of this 32-bit integer encodes a version number in
  * VMXNET3_DRIVER_VERSION_STRING.
  */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040f00
+#define VMXNET3_DRIVER_VERSION_NUM      0x01041000
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
-- 
2.14.3


From d182ce246579e777af6f3185baaf1d6c15d55f1f Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:10 +0200
Subject: [PATCH 22/32] net: ip6_gre: Request headroom in __gre6_xmit()

[ Upstream commit 01b8d064d58b4c1f0eff47f8fe8a8508cb3b3840 ]

__gre6_xmit() pushes GRE headers before handing over to ip6_tnl_xmit()
for generic IP-in-IP processing. However it doesn't make sure that there
is enough headroom to push the header to. That can lead to the panic
cited below. (Reproducer below that).

Fix by requesting either needed_headroom if already primed, or just the
bare minimum needed for the header otherwise.

[  158.576725] kernel BUG at net/core/skbuff.c:104!
[  158.581510] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
[  158.587174] Modules linked in: act_mirred cls_matchall ip6_gre ip6_tunnel tunnel6 gre sch_ingress vrf veth x86_pkg_temp_thermal mlx_platform nfsd e1000e leds_mlxcpld
[  158.602268] CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 4.17.0-rc4-net_master-custom-139 #10
[  158.610938] Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2F"/"SA000874", BIOS 4.6.5 03/08/2016
[  158.620426] RIP: 0010:skb_panic+0xc3/0x100
[  158.624586] RSP: 0018:ffff8801d3f27110 EFLAGS: 00010286
[  158.629882] RAX: 0000000000000082 RBX: ffff8801c02cc040 RCX: 0000000000000000
[  158.637127] RDX: 0000000000000082 RSI: dffffc0000000000 RDI: ffffed003a7e4e18
[  158.644366] RBP: ffff8801bfec8020 R08: ffffed003aabce19 R09: ffffed003aabce19
[  158.651574] R10: 000000000000000b R11: ffffed003aabce18 R12: ffff8801c364de66
[  158.658786] R13: 000000000000002c R14: 00000000000000c0 R15: ffff8801c364de68
[  158.666007] FS:  0000000000000000(0000) GS:ffff8801d5400000(0000) knlGS:0000000000000000
[  158.674212] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  158.680036] CR2: 00007f4b3702dcd0 CR3: 0000000003228002 CR4: 00000000001606e0
[  158.687228] Call Trace:
[  158.689752]  ? __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.694475]  ? __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.699141]  skb_push+0x78/0x90
[  158.702344]  __gre6_xmit+0x246/0xd80 [ip6_gre]
[  158.706872]  ip6gre_tunnel_xmit+0x3bc/0x610 [ip6_gre]
[  158.711992]  ? __gre6_xmit+0xd80/0xd80 [ip6_gre]
[  158.716668]  ? debug_check_no_locks_freed+0x210/0x210
[  158.721761]  ? print_irqtrace_events+0x120/0x120
[  158.726461]  ? sched_clock_cpu+0x18/0x210
[  158.730572]  ? sched_clock_cpu+0x18/0x210
[  158.734692]  ? cyc2ns_read_end+0x10/0x10
[  158.738705]  ? skb_network_protocol+0x76/0x200
[  158.743216]  ? netif_skb_features+0x1b2/0x550
[  158.747648]  dev_hard_start_xmit+0x137/0x770
[  158.752010]  sch_direct_xmit+0x2ef/0x5d0
[  158.755992]  ? pfifo_fast_dequeue+0x3fa/0x670
[  158.760460]  ? pfifo_fast_change_tx_queue_len+0x810/0x810
[  158.765975]  ? __lock_is_held+0xa0/0x160
[  158.770002]  __qdisc_run+0x39e/0xfc0
[  158.773673]  ? _raw_spin_unlock+0x29/0x40
[  158.777781]  ? pfifo_fast_enqueue+0x24b/0x3e0
[  158.782191]  ? sch_direct_xmit+0x5d0/0x5d0
[  158.786372]  ? pfifo_fast_dequeue+0x670/0x670
[  158.790818]  ? __dev_queue_xmit+0x172/0x1770
[  158.795195]  ? preempt_count_sub+0xf/0xd0
[  158.799313]  __dev_queue_xmit+0x410/0x1770
[  158.803512]  ? ___slab_alloc+0x605/0x930
[  158.807525]  ? ___slab_alloc+0x605/0x930
[  158.811540]  ? memcpy+0x34/0x50
[  158.814768]  ? netdev_pick_tx+0x1c0/0x1c0
[  158.818895]  ? __skb_clone+0x2fd/0x3d0
[  158.822712]  ? __copy_skb_header+0x270/0x270
[  158.827079]  ? rcu_read_lock_sched_held+0x93/0xa0
[  158.831903]  ? kmem_cache_alloc+0x344/0x4d0
[  158.836199]  ? skb_clone+0x123/0x230
[  158.839869]  ? skb_split+0x820/0x820
[  158.843521]  ? tcf_mirred+0x554/0x930 [act_mirred]
[  158.848407]  tcf_mirred+0x554/0x930 [act_mirred]
[  158.853104]  ? tcf_mirred_act_wants_ingress.part.2+0x10/0x10 [act_mirred]
[  158.860005]  ? __lock_acquire+0x706/0x26e0
[  158.864162]  ? mark_lock+0x13d/0xb40
[  158.867832]  tcf_action_exec+0xcf/0x2a0
[  158.871736]  tcf_classify+0xfa/0x340
[  158.875402]  __netif_receive_skb_core+0x8e1/0x1c60
[  158.880334]  ? nf_ingress+0x500/0x500
[  158.884059]  ? process_backlog+0x347/0x4b0
[  158.888241]  ? lock_acquire+0xd8/0x320
[  158.892050]  ? process_backlog+0x1b6/0x4b0
[  158.896228]  ? process_backlog+0xc2/0x4b0
[  158.900291]  process_backlog+0xc2/0x4b0
[  158.904210]  net_rx_action+0x5cc/0x980
[  158.908047]  ? napi_complete_done+0x2c0/0x2c0
[  158.912525]  ? rcu_read_unlock+0x80/0x80
[  158.916534]  ? __lock_is_held+0x34/0x160
[  158.920541]  __do_softirq+0x1d4/0x9d2
[  158.924308]  ? trace_event_raw_event_irq_handler_exit+0x140/0x140
[  158.930515]  run_ksoftirqd+0x1d/0x40
[  158.934152]  smpboot_thread_fn+0x32b/0x690
[  158.938299]  ? sort_range+0x20/0x20
[  158.941842]  ? preempt_count_sub+0xf/0xd0
[  158.945940]  ? schedule+0x5b/0x140
[  158.949412]  kthread+0x206/0x300
[  158.952689]  ? sort_range+0x20/0x20
[  158.956249]  ? kthread_stop+0x570/0x570
[  158.960164]  ret_from_fork+0x3a/0x50
[  158.963823] Code: 14 3e ff 8b 4b 78 55 4d 89 f9 41 56 41 55 48 c7 c7 a0 cf db 82 41 54 44 8b 44 24 2c 48 8b 54 24 30 48 8b 74 24 20 e8 16 94 13 ff <0f> 0b 48 c7 c7 60 8e 1f 85 48 83 c4 20 e8 55 ef a6 ff 89 74 24
[  158.983235] RIP: skb_panic+0xc3/0x100 RSP: ffff8801d3f27110
[  158.988935] ---[ end trace 5af56ee845aa6cc8 ]---
[  158.993641] Kernel panic - not syncing: Fatal exception in interrupt
[  159.000176] Kernel Offset: disabled
[  159.003767] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Reproducer:

	ip link add h1 type veth peer name swp1
	ip link add h3 type veth peer name swp3

	ip link set dev h1 up
	ip address add 192.0.2.1/28 dev h1

	ip link add dev vh3 type vrf table 20
	ip link set dev h3 master vh3
	ip link set dev vh3 up
	ip link set dev h3 up

	ip link set dev swp3 up
	ip address add dev swp3 2001:db8:2::1/64

	ip link set dev swp1 up
	tc qdisc add dev swp1 clsact

	ip link add name gt6 type ip6gretap \
		local 2001:db8:2::1 remote 2001:db8:2::2
	ip link set dev gt6 up

	sleep 1

	tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
		action mirred egress mirror dev gt6
	ping -I h1 192.0.2.2

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 197fcae855ca..7a5ed46e316d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -701,6 +701,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
 		tunnel->o_seqno++;
 
+	if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+		return -ENOMEM;
+
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
 
-- 
2.14.3


From 594a6ae9945017901ec3244ad3372f1f8ed59217 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:15 +0200
Subject: [PATCH 23/32] net: ip6_gre: Fix headroom request in
 ip6erspan_tunnel_xmit()

[ Upstream commit 5691484df961aff897d824bcc26cd1a2aa036b5b ]

dev->needed_headroom is not primed until ip6_tnl_xmit(), so it starts
out zero. Thus the call to skb_cow_head() fails to actually make sure
there's enough headroom to push the ERSPAN headers to. That can lead to
the panic cited below. (Reproducer below that).

Fix by requesting either needed_headroom if already primed, or just the
bare minimum needed for the header otherwise.

[  190.703567] kernel BUG at net/core/skbuff.c:104!
[  190.708384] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
[  190.714007] Modules linked in: act_mirred cls_matchall ip6_gre ip6_tunnel tunnel6 gre sch_ingress vrf veth x86_pkg_temp_thermal mlx_platform nfsd e1000e leds_mlxcpld
[  190.728975] CPU: 1 PID: 959 Comm: kworker/1:2 Not tainted 4.17.0-rc4-net_master-custom-139 #10
[  190.737647] Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2F"/"SA000874", BIOS 4.6.5 03/08/2016
[  190.747006] Workqueue: ipv6_addrconf addrconf_dad_work
[  190.752222] RIP: 0010:skb_panic+0xc3/0x100
[  190.756358] RSP: 0018:ffff8801d54072f0 EFLAGS: 00010282
[  190.761629] RAX: 0000000000000085 RBX: ffff8801c1a8ecc0 RCX: 0000000000000000
[  190.768830] RDX: 0000000000000085 RSI: dffffc0000000000 RDI: ffffed003aa80e54
[  190.776025] RBP: ffff8801bd1ec5a0 R08: ffffed003aabce19 R09: ffffed003aabce19
[  190.783226] R10: 0000000000000001 R11: ffffed003aabce18 R12: ffff8801bf695dbe
[  190.790418] R13: 0000000000000084 R14: 00000000000006c0 R15: ffff8801bf695dc8
[  190.797621] FS:  0000000000000000(0000) GS:ffff8801d5400000(0000) knlGS:0000000000000000
[  190.805786] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  190.811582] CR2: 000055fa929aced0 CR3: 0000000003228004 CR4: 00000000001606e0
[  190.818790] Call Trace:
[  190.821264]  <IRQ>
[  190.823314]  ? ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.828940]  ? ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.834562]  skb_push+0x78/0x90
[  190.837749]  ip6erspan_tunnel_xmit+0x5e4/0x1982 [ip6_gre]
[  190.843219]  ? ip6gre_tunnel_ioctl+0xd90/0xd90 [ip6_gre]
[  190.848577]  ? debug_check_no_locks_freed+0x210/0x210
[  190.853679]  ? debug_check_no_locks_freed+0x210/0x210
[  190.858783]  ? print_irqtrace_events+0x120/0x120
[  190.863451]  ? sched_clock_cpu+0x18/0x210
[  190.867496]  ? cyc2ns_read_end+0x10/0x10
[  190.871474]  ? skb_network_protocol+0x76/0x200
[  190.875977]  dev_hard_start_xmit+0x137/0x770
[  190.880317]  ? do_raw_spin_trylock+0x6d/0xa0
[  190.884624]  sch_direct_xmit+0x2ef/0x5d0
[  190.888589]  ? pfifo_fast_dequeue+0x3fa/0x670
[  190.892994]  ? pfifo_fast_change_tx_queue_len+0x810/0x810
[  190.898455]  ? __lock_is_held+0xa0/0x160
[  190.902422]  __qdisc_run+0x39e/0xfc0
[  190.906041]  ? _raw_spin_unlock+0x29/0x40
[  190.910090]  ? pfifo_fast_enqueue+0x24b/0x3e0
[  190.914501]  ? sch_direct_xmit+0x5d0/0x5d0
[  190.918658]  ? pfifo_fast_dequeue+0x670/0x670
[  190.923047]  ? __dev_queue_xmit+0x172/0x1770
[  190.927365]  ? preempt_count_sub+0xf/0xd0
[  190.931421]  __dev_queue_xmit+0x410/0x1770
[  190.935553]  ? ___slab_alloc+0x605/0x930
[  190.939524]  ? print_irqtrace_events+0x120/0x120
[  190.944186]  ? memcpy+0x34/0x50
[  190.947364]  ? netdev_pick_tx+0x1c0/0x1c0
[  190.951428]  ? __skb_clone+0x2fd/0x3d0
[  190.955218]  ? __copy_skb_header+0x270/0x270
[  190.959537]  ? rcu_read_lock_sched_held+0x93/0xa0
[  190.964282]  ? kmem_cache_alloc+0x344/0x4d0
[  190.968520]  ? cyc2ns_read_end+0x10/0x10
[  190.972495]  ? skb_clone+0x123/0x230
[  190.976112]  ? skb_split+0x820/0x820
[  190.979747]  ? tcf_mirred+0x554/0x930 [act_mirred]
[  190.984582]  tcf_mirred+0x554/0x930 [act_mirred]
[  190.989252]  ? tcf_mirred_act_wants_ingress.part.2+0x10/0x10 [act_mirred]
[  190.996109]  ? __lock_acquire+0x706/0x26e0
[  191.000239]  ? sched_clock_cpu+0x18/0x210
[  191.004294]  tcf_action_exec+0xcf/0x2a0
[  191.008179]  tcf_classify+0xfa/0x340
[  191.011794]  __netif_receive_skb_core+0x8e1/0x1c60
[  191.016630]  ? debug_check_no_locks_freed+0x210/0x210
[  191.021732]  ? nf_ingress+0x500/0x500
[  191.025458]  ? process_backlog+0x347/0x4b0
[  191.029619]  ? print_irqtrace_events+0x120/0x120
[  191.034302]  ? lock_acquire+0xd8/0x320
[  191.038089]  ? process_backlog+0x1b6/0x4b0
[  191.042246]  ? process_backlog+0xc2/0x4b0
[  191.046303]  process_backlog+0xc2/0x4b0
[  191.050189]  net_rx_action+0x5cc/0x980
[  191.053991]  ? napi_complete_done+0x2c0/0x2c0
[  191.058386]  ? mark_lock+0x13d/0xb40
[  191.062001]  ? clockevents_program_event+0x6b/0x1d0
[  191.066922]  ? print_irqtrace_events+0x120/0x120
[  191.071593]  ? __lock_is_held+0xa0/0x160
[  191.075566]  __do_softirq+0x1d4/0x9d2
[  191.079282]  ? ip6_finish_output2+0x524/0x1460
[  191.083771]  do_softirq_own_stack+0x2a/0x40
[  191.087994]  </IRQ>
[  191.090130]  do_softirq.part.13+0x38/0x40
[  191.094178]  __local_bh_enable_ip+0x135/0x190
[  191.098591]  ip6_finish_output2+0x54d/0x1460
[  191.102916]  ? ip6_forward_finish+0x2f0/0x2f0
[  191.107314]  ? ip6_mtu+0x3c/0x2c0
[  191.110674]  ? ip6_finish_output+0x2f8/0x650
[  191.114992]  ? ip6_output+0x12a/0x500
[  191.118696]  ip6_output+0x12a/0x500
[  191.122223]  ? ip6_route_dev_notify+0x5b0/0x5b0
[  191.126807]  ? ip6_finish_output+0x650/0x650
[  191.131120]  ? ip6_fragment+0x1a60/0x1a60
[  191.135182]  ? icmp6_dst_alloc+0x26e/0x470
[  191.139317]  mld_sendpack+0x672/0x830
[  191.143021]  ? igmp6_mcf_seq_next+0x2f0/0x2f0
[  191.147429]  ? __local_bh_enable_ip+0x77/0x190
[  191.151913]  ipv6_mc_dad_complete+0x47/0x90
[  191.156144]  addrconf_dad_completed+0x561/0x720
[  191.160731]  ? addrconf_rs_timer+0x3a0/0x3a0
[  191.165036]  ? mark_held_locks+0xc9/0x140
[  191.169095]  ? __local_bh_enable_ip+0x77/0x190
[  191.173570]  ? addrconf_dad_work+0x50d/0xa20
[  191.177886]  ? addrconf_dad_work+0x529/0xa20
[  191.182194]  addrconf_dad_work+0x529/0xa20
[  191.186342]  ? addrconf_dad_completed+0x720/0x720
[  191.191088]  ? __lock_is_held+0xa0/0x160
[  191.195059]  ? process_one_work+0x45d/0xe20
[  191.199302]  ? process_one_work+0x51e/0xe20
[  191.203531]  ? rcu_read_lock_sched_held+0x93/0xa0
[  191.208279]  process_one_work+0x51e/0xe20
[  191.212340]  ? pwq_dec_nr_in_flight+0x200/0x200
[  191.216912]  ? get_lock_stats+0x4b/0xf0
[  191.220788]  ? preempt_count_sub+0xf/0xd0
[  191.224844]  ? worker_thread+0x219/0x860
[  191.228823]  ? do_raw_spin_trylock+0x6d/0xa0
[  191.233142]  worker_thread+0xeb/0x860
[  191.236848]  ? process_one_work+0xe20/0xe20
[  191.241095]  kthread+0x206/0x300
[  191.244352]  ? process_one_work+0xe20/0xe20
[  191.248587]  ? kthread_stop+0x570/0x570
[  191.252459]  ret_from_fork+0x3a/0x50
[  191.256082] Code: 14 3e ff 8b 4b 78 55 4d 89 f9 41 56 41 55 48 c7 c7 a0 cf db 82 41 54 44 8b 44 24 2c 48 8b 54 24 30 48 8b 74 24 20 e8 16 94 13 ff <0f> 0b 48 c7 c7 60 8e 1f 85 48 83 c4 20 e8 55 ef a6 ff 89 74 24
[  191.275327] RIP: skb_panic+0xc3/0x100 RSP: ffff8801d54072f0
[  191.281024] ---[ end trace 7ea51094e099e006 ]---
[  191.285724] Kernel panic - not syncing: Fatal exception in interrupt
[  191.292168] Kernel Offset: disabled
[  191.295697] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Reproducer:

	ip link add h1 type veth peer name swp1
	ip link add h3 type veth peer name swp3

	ip link set dev h1 up
	ip address add 192.0.2.1/28 dev h1

	ip link add dev vh3 type vrf table 20
	ip link set dev h3 master vh3
	ip link set dev vh3 up
	ip link set dev h3 up

	ip link set dev swp3 up
	ip address add dev swp3 2001:db8:2::1/64

	ip link set dev swp1 up
	tc qdisc add dev swp1 clsact

	ip link add name gt6 type ip6erspan \
		local 2001:db8:2::1 remote 2001:db8:2::2 oseq okey 123
	ip link set dev gt6 up

	sleep 1

	tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
		action mirred egress mirror dev gt6
	ping -I h1 192.0.2.2

Fixes: e41c7c68ea77 ("ip6erspan: make sure enough headroom at xmit.")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 7a5ed46e316d..f88d614e32fe 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -908,7 +908,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		truncate = true;
 	}
 
-	if (skb_cow_head(skb, dev->needed_headroom))
+	if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
 		goto tx_err;
 
 	t->parms.o_flags &= ~TUNNEL_KEY;
-- 
2.14.3


From f792bac38196a6a64d483bc8dd5ea82fc76f2936 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:27 +0200
Subject: [PATCH 24/32] net: ip6_gre: Split up ip6gre_tnl_link_config()

[ Upstream commit a483373ead61e6079bc8ebe27e2dfdb2e3c1559f ]

The function ip6gre_tnl_link_config() is used for setting up
configuration of both ip6gretap and ip6erspan tunnels. Split the
function into the common part and the route-lookup part. The latter then
takes the calculated header length as an argument. This split will allow
the patches down the line to sneak in a custom header length computation
for the ERSPAN tunnel.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f88d614e32fe..9a439e5743ce 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1019,12 +1019,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
 {
 	struct net_device *dev = t->dev;
 	struct __ip6_tnl_parm *p = &t->parms;
 	struct flowi6 *fl6 = &t->fl.u.ip6;
-	int t_hlen;
 
 	if (dev->type != ARPHRD_ETHER) {
 		memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
@@ -1051,12 +1050,13 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 		dev->flags |= IFF_POINTOPOINT;
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
+}
 
-	t->tun_hlen = gre_calc_hlen(t->parms.o_flags);
-
-	t->hlen = t->encap_hlen + t->tun_hlen;
-
-	t_hlen = t->hlen + sizeof(struct ipv6hdr);
+static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
+					 int t_hlen)
+{
+	const struct __ip6_tnl_parm *p = &t->parms;
+	struct net_device *dev = t->dev;
 
 	if (p->flags & IP6_TNL_F_CAP_XMIT) {
 		int strict = (ipv6_addr_type(&p->raddr) &
@@ -1088,6 +1088,24 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	}
 }
 
+static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
+{
+	int t_hlen;
+
+	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+	tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	return t_hlen;
+}
+
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+	ip6gre_tnl_link_config_common(t);
+	ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t));
+}
+
 static int ip6gre_tnl_change(struct ip6_tnl *t,
 	const struct __ip6_tnl_parm *p, int set_mtu)
 {
@@ -1381,11 +1399,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 		return ret;
 	}
 
-	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
-	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
-	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
-
-	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	t_hlen = ip6gre_calc_hlen(tunnel);
 	dev->mtu = ETH_DATA_LEN - t_hlen;
 	if (dev->type == ARPHRD_ETHER)
 		dev->mtu -= ETH_HLEN;
-- 
2.14.3


From 15d248766116a3e5c12ecafd774f69f7f623f59b Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:33 +0200
Subject: [PATCH 25/32] net: ip6_gre: Split up ip6gre_tnl_change()

[ Upstream commit a6465350ef495f5cbd76a3e505d25a01d648477e ]

Split a reusable function ip6gre_tnl_copy_tnl_parm() from
ip6gre_tnl_change(). This will allow ERSPAN-specific code to
reuse the common parts while customizing the behavior for ERSPAN.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 9a439e5743ce..892858af139a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1106,8 +1106,8 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t));
 }
 
-static int ip6gre_tnl_change(struct ip6_tnl *t,
-	const struct __ip6_tnl_parm *p, int set_mtu)
+static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t,
+				     const struct __ip6_tnl_parm *p)
 {
 	t->parms.laddr = p->laddr;
 	t->parms.raddr = p->raddr;
@@ -1123,6 +1123,12 @@ static int ip6gre_tnl_change(struct ip6_tnl *t,
 	t->parms.o_flags = p->o_flags;
 	t->parms.fwmark = p->fwmark;
 	dst_cache_reset(&t->dst_cache);
+}
+
+static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
+			     int set_mtu)
+{
+	ip6gre_tnl_copy_tnl_parm(t, p);
 	ip6gre_tnl_link_config(t, set_mtu);
 	return 0;
 }
-- 
2.14.3


From f832ed4855be7387cdf7374dc8fd762bbe5f9f1d Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:39 +0200
Subject: [PATCH 26/32] net: ip6_gre: Split up ip6gre_newlink()

[ Upstream commit 7fa38a7c852ec99e3a7fc375eb2c21c50c2e46b8 ]

Extract from ip6gre_newlink() a reusable function
ip6gre_newlink_common(). The ip6gre_tnl_link_config() call needs to be
made customizable for ERSPAN, thus reorder it with calls to
ip6_tnl_change_mtu() and dev_hold(), and extract the whole tail to the
caller, ip6gre_newlink(). Thus enable an ERSPAN-specific _newlink()
function without a lot of duplicity.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 892858af139a..9a0b662b0087 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1848,9 +1848,9 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
 	return ret;
 }
 
-static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
-			  struct nlattr *tb[], struct nlattr *data[],
-			  struct netlink_ext_ack *extack)
+static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
+				 struct nlattr *tb[], struct nlattr *data[],
+				 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *nt;
 	struct net *net = dev_net(dev);
@@ -1887,18 +1887,30 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	if (err)
 		goto out;
 
-	ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
-
 	if (tb[IFLA_MTU])
 		ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
 
 	dev_hold(dev);
-	ip6gre_tunnel_link(ign, nt);
 
 out:
 	return err;
 }
 
+static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[],
+			  struct netlink_ext_ack *extack)
+{
+	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+	struct ip6_tnl *nt = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+
+	if (!err) {
+		ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+	}
+	return err;
+}
+
 static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
 			     struct nlattr *data[],
 			     struct netlink_ext_ack *extack)
-- 
2.14.3


From 121cc91853fbe06e713c6db5edb06bc784507870 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:45 +0200
Subject: [PATCH 27/32] net: ip6_gre: Split up ip6gre_changelink()

[ Upstream commit c8632fc30bb03aa0c3bd7bcce85355a10feb8149 ]

Extract from ip6gre_changelink() a reusable function
ip6gre_changelink_common(). This will allow introduction of
ERSPAN-specific _changelink() function with not a lot of code
duplication.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 9a0b662b0087..68edb439631f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1911,37 +1911,52 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 	return err;
 }
 
-static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
-			     struct nlattr *data[],
-			     struct netlink_ext_ack *extack)
+static struct ip6_tnl *
+ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[],
+			 struct nlattr *data[], struct __ip6_tnl_parm *p_p,
+			 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *t, *nt = netdev_priv(dev);
 	struct net *net = nt->net;
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
-	struct __ip6_tnl_parm p;
 	struct ip_tunnel_encap ipencap;
 
 	if (dev == ign->fb_tunnel_dev)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	if (ip6gre_netlink_encap_parms(data, &ipencap)) {
 		int err = ip6_tnl_encap_setup(nt, &ipencap);
 
 		if (err < 0)
-			return err;
+			return ERR_PTR(err);
 	}
 
-	ip6gre_netlink_parms(data, &p);
+	ip6gre_netlink_parms(data, p_p);
 
-	t = ip6gre_tunnel_locate(net, &p, 0);
+	t = ip6gre_tunnel_locate(net, p_p, 0);
 
 	if (t) {
 		if (t->dev != dev)
-			return -EEXIST;
+			return ERR_PTR(-EEXIST);
 	} else {
 		t = nt;
 	}
 
+	return t;
+}
+
+static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
+			     struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{
+	struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+	struct __ip6_tnl_parm p;
+	struct ip6_tnl *t;
+
+	t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
 	ip6gre_tunnel_unlink(ign, t);
 	ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
 	ip6gre_tunnel_link(ign, t);
-- 
2.14.3


From b82abc871ea0d3d33ce74c506ed55307c3ca21bf Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 17 May 2018 16:36:51 +0200
Subject: [PATCH 28/32] net: ip6_gre: Fix ip6erspan hlen calculation

[ Upstream commit 2d665034f239412927b1e71329f20f001c92da09 ]

Even though ip6erspan_tap_init() sets up hlen and tun_hlen according to
what ERSPAN needs, it goes ahead to call ip6gre_tnl_link_config() which
overwrites these settings with GRE-specific ones.

Similarly for changelink callbacks, which are handled by
ip6gre_changelink() calls ip6gre_tnl_change() calls
ip6gre_tnl_link_config() as well.

The difference ends up being 12 vs. 20 bytes, and this is generally not
a problem, because a 12-byte request likely ends up allocating more and
the extra 8 bytes are thus available. However correct it is not.

So replace the newlink and changelink callbacks with an ERSPAN-specific
ones, reusing the newly-introduced _common() functions.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 68edb439631f..dc6730a79b96 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -81,6 +81,7 @@ static int ip6gre_tunnel_init(struct net_device *dev);
 static void ip6gre_tunnel_setup(struct net_device *dev);
 static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
 static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu);
 
 /* Tunnel hash table */
 
@@ -1746,6 +1747,19 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
 	.ndo_get_iflink = ip6_tnl_get_iflink,
 };
 
+static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel)
+{
+	int t_hlen;
+
+	tunnel->tun_hlen = 8;
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+		       erspan_hdr_len(tunnel->parms.erspan_ver);
+
+	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
+	tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	return t_hlen;
+}
+
 static int ip6erspan_tap_init(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
@@ -1769,12 +1783,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 		return ret;
 	}
 
-	tunnel->tun_hlen = 8;
-	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
-		       erspan_hdr_len(tunnel->parms.erspan_ver);
-	t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
-
-	dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+	t_hlen = ip6erspan_calc_hlen(tunnel);
 	dev->mtu = ETH_DATA_LEN - t_hlen;
 	if (dev->type == ARPHRD_ETHER)
 		dev->mtu -= ETH_HLEN;
@@ -1783,7 +1792,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 	tunnel = netdev_priv(dev);
-	ip6gre_tnl_link_config(tunnel, 1);
+	ip6erspan_tnl_link_config(tunnel, 1);
 
 	return 0;
 }
@@ -2108,6 +2117,53 @@ static void ip6erspan_tap_setup(struct net_device *dev)
 	netif_keep_dst(dev);
 }
 
+static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
+			     struct nlattr *tb[], struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{
+	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+	struct ip6_tnl *nt = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+
+	if (!err) {
+		ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
+	}
+	return err;
+}
+
+static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+	ip6gre_tnl_link_config_common(t);
+	ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t));
+}
+
+static int ip6erspan_tnl_change(struct ip6_tnl *t,
+				const struct __ip6_tnl_parm *p, int set_mtu)
+{
+	ip6gre_tnl_copy_tnl_parm(t, p);
+	ip6erspan_tnl_link_config(t, set_mtu);
+	return 0;
+}
+
+static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+				struct nlattr *data[],
+				struct netlink_ext_ack *extack)
+{
+	struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+	struct __ip6_tnl_parm p;
+	struct ip6_tnl *t;
+
+	t = ip6gre_changelink_common(dev, tb, data, &p, extack);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	ip6gre_tunnel_unlink(ign, t);
+	ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6gre_tunnel_link(ign, t);
+	return 0;
+}
+
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.kind		= "ip6gre",
 	.maxtype	= IFLA_GRE_MAX,
@@ -2144,8 +2200,8 @@ static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
 	.priv_size	= sizeof(struct ip6_tnl),
 	.setup		= ip6erspan_tap_setup,
 	.validate	= ip6erspan_tap_validate,
-	.newlink	= ip6gre_newlink,
-	.changelink	= ip6gre_changelink,
+	.newlink	= ip6erspan_newlink,
+	.changelink	= ip6erspan_changelink,
 	.get_size	= ip6gre_get_size,
 	.fill_info	= ip6gre_fill_info,
 	.get_link_net	= ip6_tnl_get_link_net,
-- 
2.14.3


From 5c6c8ab038b07c720cc3d6c128ca4ad1221248f2 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 18 May 2018 19:22:28 -0700
Subject: [PATCH 29/32] net: ip6_gre: fix tunnel metadata device sharing.

[ Upstream commit b80d0b93b991e551a32157e0d9d38fc5bc9348a7 ]

Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'.  Thus, when doing:
  ip link add dev ip6gre11 type ip6gretap external
  ip link add dev ip6erspan12 type ip6erspan external
  RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.

The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'.  As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.

First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan.  Tested it
using the samples/bpf/test_tunnel_bpf.sh.

Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 101 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 79 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index dc6730a79b96..9539bdb15edb 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -71,6 +71,7 @@ struct ip6gre_net {
 	struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
 
 	struct ip6_tnl __rcu *collect_md_tun;
+	struct ip6_tnl __rcu *collect_md_tun_erspan;
 	struct net_device *fb_tunnel_dev;
 };
 
@@ -233,7 +234,12 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 	if (cand)
 		return cand;
 
-	t = rcu_dereference(ign->collect_md_tun);
+	if (gre_proto == htons(ETH_P_ERSPAN) ||
+	    gre_proto == htons(ETH_P_ERSPAN2))
+		t = rcu_dereference(ign->collect_md_tun_erspan);
+	else
+		t = rcu_dereference(ign->collect_md_tun);
+
 	if (t && t->dev->flags & IFF_UP)
 		return t;
 
@@ -262,6 +268,31 @@ static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign,
 	return &ign->tunnels[prio][h];
 }
 
+static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, t);
+}
+
+static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun_erspan, t);
+}
+
+static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun, NULL);
+}
+
+static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign,
+				       struct ip6_tnl *t)
+{
+	if (t->parms.collect_md)
+		rcu_assign_pointer(ign->collect_md_tun_erspan, NULL);
+}
+
 static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
 		const struct ip6_tnl *t)
 {
@@ -272,9 +303,6 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
 {
 	struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
 
-	if (t->parms.collect_md)
-		rcu_assign_pointer(ign->collect_md_tun, t);
-
 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
 }
@@ -284,9 +312,6 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
 	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *iter;
 
-	if (t->parms.collect_md)
-		rcu_assign_pointer(ign->collect_md_tun, NULL);
-
 	for (tp = ip6gre_bucket(ign, t);
 	     (iter = rtnl_dereference(*tp)) != NULL;
 	     tp = &iter->next) {
@@ -375,11 +400,23 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
 	return NULL;
 }
 
+static void ip6erspan_tunnel_uninit(struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+	ip6erspan_tunnel_unlink_md(ign, t);
+	ip6gre_tunnel_unlink(ign, t);
+	dst_cache_reset(&t->dst_cache);
+	dev_put(dev);
+}
+
 static void ip6gre_tunnel_uninit(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	dst_cache_reset(&t->dst_cache);
 	dev_put(dev);
@@ -1799,7 +1836,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
 
 static const struct net_device_ops ip6erspan_netdev_ops = {
 	.ndo_init =		ip6erspan_tap_init,
-	.ndo_uninit =		ip6gre_tunnel_uninit,
+	.ndo_uninit =		ip6erspan_tunnel_uninit,
 	.ndo_start_xmit =	ip6erspan_tunnel_xmit,
 	.ndo_set_mac_address =	eth_mac_addr,
 	.ndo_validate_addr =	eth_validate_addr,
@@ -1862,8 +1899,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
 				 struct netlink_ext_ack *extack)
 {
 	struct ip6_tnl *nt;
-	struct net *net = dev_net(dev);
-	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
 	struct ip_tunnel_encap ipencap;
 	int err;
 
@@ -1876,16 +1911,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
 			return err;
 	}
 
-	ip6gre_netlink_parms(data, &nt->parms);
-
-	if (nt->parms.collect_md) {
-		if (rtnl_dereference(ign->collect_md_tun))
-			return -EEXIST;
-	} else {
-		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
-			return -EEXIST;
-	}
-
 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
 
@@ -1909,12 +1934,26 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
-	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	struct ip6_tnl *nt = netdev_priv(dev);
 	struct net *net = dev_net(dev);
+	struct ip6gre_net *ign;
+	int err;
+
+	ip6gre_netlink_parms(data, &nt->parms);
+	ign = net_generic(net, ip6gre_net_id);
+
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
 
+	err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	if (!err) {
 		ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6gre_tunnel_link_md(ign, nt);
 		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
 	}
 	return err;
@@ -1966,8 +2005,10 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6gre_tunnel_link_md(ign, t);
 	ip6gre_tunnel_link(ign, t);
 	return 0;
 }
@@ -2121,12 +2162,26 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
 			     struct nlattr *tb[], struct nlattr *data[],
 			     struct netlink_ext_ack *extack)
 {
-	int err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	struct ip6_tnl *nt = netdev_priv(dev);
 	struct net *net = dev_net(dev);
+	struct ip6gre_net *ign;
+	int err;
+
+	ip6gre_netlink_parms(data, &nt->parms);
+	ign = net_generic(net, ip6gre_net_id);
+
+	if (nt->parms.collect_md) {
+		if (rtnl_dereference(ign->collect_md_tun_erspan))
+			return -EEXIST;
+	} else {
+		if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+			return -EEXIST;
+	}
 
+	err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
 	if (!err) {
 		ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
+		ip6erspan_tunnel_link_md(ign, nt);
 		ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt);
 	}
 	return err;
@@ -2158,8 +2213,10 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
+	ip6gre_tunnel_unlink_md(ign, t);
 	ip6gre_tunnel_unlink(ign, t);
 	ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
+	ip6erspan_tunnel_link_md(ign, t);
 	ip6gre_tunnel_link(ign, t);
 	return 0;
 }
-- 
2.14.3


From 1e287f87000699e9399d22d2521d48064c9e02d9 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:38 +0300
Subject: [PATCH 30/32] qed: LL2 flush isles when connection is closed

[ Upstream commit f9bcd60274a565751abef622f9018badd01a17c8 ]

Driver should free all pending isles once it gets a FLUSH cqe from FW.
Part of iSCSI out of order flow.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index c4f14fdc4e77..39aa1eddbfe6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -601,6 +601,27 @@ static u8 qed_ll2_convert_rx_parse_to_tx_flags(u16 parse_flags)
 	return bd_flags;
 }
 
+static bool
+qed_ll2_lb_rxq_handler_slowpath(struct qed_hwfn *p_hwfn,
+				struct core_rx_slow_path_cqe *p_cqe)
+{
+	struct ooo_opaque *iscsi_ooo;
+	u32 cid;
+
+	if (p_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH)
+		return false;
+
+	iscsi_ooo = (struct ooo_opaque *)&p_cqe->opaque_data;
+	if (iscsi_ooo->ooo_opcode != TCP_EVENT_DELETE_ISLES)
+		return false;
+
+	/* Need to make a flush */
+	cid = le32_to_cpu(iscsi_ooo->cid);
+	qed_ooo_release_connection_isles(p_hwfn, p_hwfn->p_ooo_info, cid);
+
+	return true;
+}
+
 static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 				  struct qed_ll2_info *p_ll2_conn)
 {
@@ -627,6 +648,11 @@ static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 		cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain);
 		cqe_type = cqe->rx_cqe_sp.type;
 
+		if (cqe_type == CORE_RX_CQE_TYPE_SLOW_PATH)
+			if (qed_ll2_lb_rxq_handler_slowpath(p_hwfn,
+							    &cqe->rx_cqe_sp))
+				continue;
+
 		if (cqe_type != CORE_RX_CQE_TYPE_REGULAR) {
 			DP_NOTICE(p_hwfn,
 				  "Got a non-regular LB LL2 completion [type 0x%02x]\n",
-- 
2.14.3


From 2006d1d574c5f9c77b50189c0897e6764f1b6011 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:39 +0300
Subject: [PATCH 31/32] qed: Fix possibility of list corruption during rmmod
 flows

[ Upstream commit ffd2c0d12752a69e480366031ec7a7d723dd2510 ]

The ll2 flows of flushing the txq/rxq need to be synchronized with the
regular fp processing. Caused list corruption during load/unload stress
tests.

Fixes: 0a7fb11c23c0f ("qed: Add Light L2 support")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 39aa1eddbfe6..7764e2ca1dd9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -292,6 +292,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_tx_packet *p_pkt = NULL;
 	struct qed_ll2_info *p_ll2_conn;
 	struct qed_ll2_tx_queue *p_tx;
+	unsigned long flags = 0;
 	dma_addr_t tx_frag;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
@@ -300,6 +301,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_tx = &p_ll2_conn->tx_queue;
 
+	spin_lock_irqsave(&p_tx->lock, flags);
 	while (!list_empty(&p_tx->active_descq)) {
 		p_pkt = list_first_entry(&p_tx->active_descq,
 					 struct qed_ll2_tx_packet, list_entry);
@@ -309,6 +311,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 		list_del(&p_pkt->list_entry);
 		b_last_packet = list_empty(&p_tx->active_descq);
 		list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+		spin_unlock_irqrestore(&p_tx->lock, flags);
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
 
@@ -328,7 +331,9 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      b_last_frag,
 						      b_last_packet);
 		}
+		spin_lock_irqsave(&p_tx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_tx->lock, flags);
 }
 
 static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
@@ -556,6 +561,7 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_info *p_ll2_conn = NULL;
 	struct qed_ll2_rx_packet *p_pkt = NULL;
 	struct qed_ll2_rx_queue *p_rx;
+	unsigned long flags = 0;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
 	if (!p_ll2_conn)
@@ -563,13 +569,14 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_rx = &p_ll2_conn->rx_queue;
 
+	spin_lock_irqsave(&p_rx->lock, flags);
 	while (!list_empty(&p_rx->active_descq)) {
 		p_pkt = list_first_entry(&p_rx->active_descq,
 					 struct qed_ll2_rx_packet, list_entry);
 		if (!p_pkt)
 			break;
-
 		list_move_tail(&p_pkt->list_entry, &p_rx->free_descq);
+		spin_unlock_irqrestore(&p_rx->lock, flags);
 
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
@@ -588,7 +595,9 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      cookie,
 						      rx_buf_addr, b_last);
 		}
+		spin_lock_irqsave(&p_rx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_rx->lock, flags);
 }
 
 static u8 qed_ll2_convert_rx_parse_to_tx_flags(u16 parse_flags)
-- 
2.14.3


From 5b15d0e5b75654348ee0789860a0f129b8c68c77 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:40 +0300
Subject: [PATCH 32/32] qed: Fix LL2 race during connection terminate

[ Upstream commit 490068deaef0c76e47bf89c457de899b7d3995c7 ]

Stress on qedi/qedr load unload lead to list_del corruption.
This is due to ll2 connection terminate freeing resources without
verifying that no more ll2 processing will occur.

This patch unregisters the ll2 status block before terminating
the connection to assure this race does not occur.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 7764e2ca1dd9..0161e01778f2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -842,6 +842,9 @@ static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie;
 	int rc;
 
+	if (!QED_LL2_RX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	rc = qed_ll2_lb_rxq_handler(p_hwfn, p_ll2_conn);
 	if (rc)
 		return rc;
@@ -862,6 +865,9 @@ static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	u16 new_idx = 0, num_bds = 0;
 	int rc;
 
+	if (!QED_LL2_TX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	new_idx = le16_to_cpu(*p_tx->p_fw_cons);
 	num_bds = ((s16)new_idx - (s16)p_tx->bds_idx);
 
@@ -1915,17 +1921,25 @@ int qed_ll2_terminate_connection(void *cxt, u8 connection_handle)
 
 	/* Stop Tx & Rx of connection, if needed */
 	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->tx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_tx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_txq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
 	}
 
 	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->rx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_rx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_rxq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
 	}
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO)
@@ -1973,16 +1987,6 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle)
 	if (!p_ll2_conn)
 		return;
 
-	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->rx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
-	}
-
-	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->tx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
-	}
-
 	kfree(p_ll2_conn->tx_queue.descq_mem);
 	qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain);
 
-- 
2.14.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-11-06 15:43 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-05  1:51 [GIT] Networking David Miller
2019-11-06 15:43 ` Greg KH
  -- strict thread matches above, loose matches on Subject: below --
2019-05-31 21:16 David Miller
2019-05-31 22:18 ` Greg KH
2019-05-31 23:49   ` David Miller
2018-08-21  0:59 David Miller
2018-08-21  5:36 ` Greg KH
2018-08-23  7:04   ` Greg KH
2018-05-22 17:49 David Miller
2018-05-22 18:13 ` Greg KH

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).