[PATCHES] Networking

From: David Miller <davem@davemloft.net>
To: stable@vger.kernel.org
Subject: [PATCHES] Networking
Date: Wed, 23 Aug 2017 20:24:49 -0700 (PDT)	[thread overview]
Message-ID: <20170823.202449.1909590154173330401.davem@davemloft.net> (raw)

[-- Attachment #1: Type: Text/Plain, Size: 102 bytes --]


Please queue up the following networking fixes for v4.9 and v4.12
-stable, respectively.

Thank you!

[-- Attachment #2: net_49.mbox --]
[-- Type: Application/Octet-Stream, Size: 106165 bytes --]

From d14811d21d471919e24c4520bb0254b0a42ea4f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Aug 2017 10:16:45 -0700
Subject: [PATCH 01/25] af_key: do not use GFP_KERNEL in atomic contexts

[ Upstream commit 36f41f8fc6d8aa9f8c9072d66ff7cf9055f5e69b ]

pfkey_broadcast() might be called from non process contexts,
we can not use GFP_KERNEL in these cases [1].

This patch partially reverts commit ba51b6be38c1 ("net: Fix RCU splat in
af_key"), only keeping the GFP_ATOMIC forcing under rcu_read_lock()
section.

[1] : syzkaller reported :

in_atomic(): 1, irqs_disabled(): 0, pid: 2932, name: syzkaller183439
3 locks held by syzkaller183439/2932:
 #0:  (&net->xfrm.xfrm_cfg_mutex){+.+.+.}, at: [<ffffffff83b43888>] pfkey_sendmsg+0x4c8/0x9f0 net/key/af_key.c:3649
 #1:  (&pfk->dump_lock){+.+.+.}, at: [<ffffffff83b467f6>] pfkey_do_dump+0x76/0x3f0 net/key/af_key.c:293
 #2:  (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [<ffffffff83957632>] spin_lock_bh include/linux/spinlock.h:304 [inline]
 #2:  (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [<ffffffff83957632>] xfrm_policy_walk+0x192/0xa30 net/xfrm/xfrm_policy.c:1028
CPU: 0 PID: 2932 Comm: syzkaller183439 Not tainted 4.13.0-rc4+ #24
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:5994
 __might_sleep+0x95/0x190 kernel/sched/core.c:5947
 slab_pre_alloc_hook mm/slab.h:416 [inline]
 slab_alloc mm/slab.c:3383 [inline]
 kmem_cache_alloc+0x24b/0x6e0 mm/slab.c:3559
 skb_clone+0x1a0/0x400 net/core/skbuff.c:1037
 pfkey_broadcast_one+0x4b2/0x6f0 net/key/af_key.c:207
 pfkey_broadcast+0x4ba/0x770 net/key/af_key.c:281
 dump_sp+0x3d6/0x500 net/key/af_key.c:2685
 xfrm_policy_walk+0x2f1/0xa30 net/xfrm/xfrm_policy.c:1042
 pfkey_dump_sp+0x42/0x50 net/key/af_key.c:2695
 pfkey_do_dump+0xaa/0x3f0 net/key/af_key.c:299
 pfkey_spddump+0x1a0/0x210 net/key/af_key.c:2722
 pfkey_process+0x606/0x710 net/key/af_key.c:2814
 pfkey_sendmsg+0x4d6/0x9f0 net/key/af_key.c:3650
sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 ___sys_sendmsg+0x755/0x890 net/socket.c:2035
 __sys_sendmsg+0xe5/0x210 net/socket.c:2069
 SYSC_sendmsg net/socket.c:2080 [inline]
 SyS_sendmsg+0x2d/0x50 net/socket.c:2076
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x445d79
RSP: 002b:00007f32447c1dc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000445d79
RDX: 0000000000000000 RSI: 000000002023dfc8 RDI: 0000000000000008
RBP: 0000000000000086 R08: 00007f32447c2700 R09: 00007f32447c2700
R10: 00007f32447c2700 R11: 0000000000000202 R12: 0000000000000000
R13: 00007ffe33edec4f R14: 00007f32447c29c0 R15: 0000000000000000

Fixes: ba51b6be38c1 ("net: Fix RCU splat in af_key")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: David Ahern <dsa@cumulusnetworks.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c | 48 ++++++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 2e1050ec2cf0..94bf810ad242 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
 #define BROADCAST_ONE		1
 #define BROADCAST_REGISTERED	2
 #define BROADCAST_PROMISC_ONLY	4
-static int pfkey_broadcast(struct sk_buff *skb,
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
 			   int broadcast_flags, struct sock *one_sk,
 			   struct net *net)
 {
@@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb,
 	rcu_read_unlock();
 
 	if (one_sk != NULL)
-		err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk);
+		err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
 
 	kfree_skb(skb2);
 	kfree_skb(skb);
@@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
 		hdr = (struct sadb_msg *) pfk->dump.skb->data;
 		hdr->sadb_msg_seq = 0;
 		hdr->sadb_msg_errno = rc;
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 		pfk->dump.skb = NULL;
 	}
@@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
 			     sizeof(uint64_t));
 
-	pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk));
 
 	return 0;
 }
@@ -1396,7 +1396,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
 
 	xfrm_state_put(x);
 
-	pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net);
+	pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net);
 
 	return 0;
 }
@@ -1483,7 +1483,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
 	hdr->sadb_msg_seq = c->seq;
 	hdr->sadb_msg_pid = c->portid;
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x));
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
 
 	return 0;
 }
@@ -1596,7 +1596,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg
 	out_hdr->sadb_msg_reserved = 0;
 	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
 	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
 
 	return 0;
 }
@@ -1701,8 +1701,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad
 		return -ENOBUFS;
 	}
 
-	pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk));
-
+	pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk,
+			sock_net(sk));
 	return 0;
 }
 
@@ -1720,7 +1720,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
 	hdr->sadb_msg_errno = (uint8_t) 0;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 
-	return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk,
+			       sock_net(sk));
 }
 
 static int key_notify_sa_flush(const struct km_event *c)
@@ -1741,7 +1742,7 @@ static int key_notify_sa_flush(const struct km_event *c)
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 	hdr->sadb_msg_reserved = 0;
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net);
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 
 	return 0;
 }
@@ -1798,7 +1799,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr)
 	out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
 
 	if (pfk->dump.skb)
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 	pfk->dump.skb = out_skb;
 
@@ -1886,7 +1887,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb
 		new_hdr->sadb_msg_errno = 0;
 	}
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk));
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
 	return 0;
 }
 
@@ -2219,7 +2220,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_seq = c->seq;
 	out_hdr->sadb_msg_pid = c->portid;
-	pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
 	return 0;
 
 }
@@ -2439,7 +2440,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
 	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp));
 	err = 0;
 
 out:
@@ -2695,7 +2696,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
 	out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
 
 	if (pfk->dump.skb)
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 	pfk->dump.skb = out_skb;
 
@@ -2752,7 +2753,7 @@ static int key_notify_policy_flush(const struct km_event *c)
 	hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 	hdr->sadb_msg_reserved = 0;
-	pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net);
+	pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 	return 0;
 
 }
@@ -2814,7 +2815,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb
 	void *ext_hdrs[SADB_EXT_MAX];
 	int err;
 
-	pfkey_broadcast(skb_clone(skb, GFP_KERNEL),
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
 			BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
 
 	memset(ext_hdrs, 0, sizeof(ext_hdrs));
@@ -3036,7 +3037,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
 	out_hdr->sadb_msg_seq = 0;
 	out_hdr->sadb_msg_pid = 0;
 
-	pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			xs_net(x));
 	return 0;
 }
 
@@ -3226,7 +3228,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
 		       xfrm_ctx->ctx_len);
 	}
 
-	return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			       xs_net(x));
 }
 
 static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
@@ -3424,7 +3427,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
 	n_port->sadb_x_nat_t_port_port = sport;
 	n_port->sadb_x_nat_t_port_reserved = 0;
 
-	return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			       xs_net(x));
 }
 
 #ifdef CONFIG_NET_KEY_MIGRATE
@@ -3616,7 +3620,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	}
 
 	/* broadcast migrate message to sockets */
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net);
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
 
 	return 0;
 
-- 
2.13.5


From e3370ac8dcdc599191b7af0cf90229488b6b449c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Aug 2017 14:10:25 -0700
Subject: [PATCH 02/25] dccp: purge write queue in dccp_destroy_sock()

[ Upstream commit 7749d4ff88d31b0be17c8683143135adaaadc6a7 ]

syzkaller reported that DCCP could have a non empty
write queue at dismantle time.

WARNING: CPU: 1 PID: 2953 at net/core/stream.c:199 sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 2953 Comm: syz-executor0 Not tainted 4.13.0-rc4+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 panic+0x1e4/0x417 kernel/panic.c:180
 __warn+0x1c4/0x1d9 kernel/panic.c:541
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190
 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:273
 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323
 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846
RIP: 0010:sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199
RSP: 0018:ffff8801d182f108 EFLAGS: 00010297
RAX: ffff8801d1144140 RBX: ffff8801d13cb280 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff85137b00 RDI: ffff8801d13cb280
RBP: ffff8801d182f148 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d13cb4d0
R13: ffff8801d13cb3b8 R14: ffff8801d13cb300 R15: ffff8801d13cb3b8
 inet_csk_destroy_sock+0x175/0x3f0 net/ipv4/inet_connection_sock.c:835
 dccp_close+0x84d/0xc10 net/dccp/proto.c:1067
 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425
 sock_release+0x8d/0x1e0 net/socket.c:597
 sock_close+0x16/0x20 net/socket.c:1126
 __fput+0x327/0x7e0 fs/file_table.c:210
 ____fput+0x15/0x20 fs/file_table.c:246
 task_work_run+0x18a/0x260 kernel/task_work.c:116
 exit_task_work include/linux/task_work.h:21 [inline]
 do_exit+0xa32/0x1b10 kernel/exit.c:865
 do_group_exit+0x149/0x400 kernel/exit.c:969
 get_signal+0x7e8/0x17e0 kernel/signal.c:2330
 do_signal+0x94/0x1ee0 arch/x86/kernel/signal.c:808
 exit_to_usermode_loop+0x21c/0x2d0 arch/x86/entry/common.c:157
 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
 syscall_return_slowpath+0x3a7/0x450 arch/x86/entry/common.c:263

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9fe25bf63296..86bc40ba6ba5 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -201,10 +201,7 @@ void dccp_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	/*
-	 * DCCP doesn't use sk_write_queue, just sk_send_head
-	 * for retransmissions
-	 */
+	__skb_queue_purge(&sk->sk_write_queue);
 	if (sk->sk_send_head != NULL) {
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
-- 
2.13.5


From 5db71c4f638b0853d86c4c44dadf749e62dd91af Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 07:03:15 -0700
Subject: [PATCH 03/25] dccp: defer ccid_hc_tx_delete() at dismantle time

[ Upstream commit 120e9dabaf551c6dc03d3a10a1f026376cb1811c ]

syszkaller team reported another problem in DCCP [1]

Problem here is that the structure holding RTO timer
(ccid2_hc_tx_rto_expire() handler) is freed too soon.

We can not use del_timer_sync() to cancel the timer
since this timer wants to grab socket lock (that would risk a dead lock)

Solution is to defer the freeing of memory when all references to
the socket were released. Socket timers do own a reference, so this
should fix the issue.

[1]

==================================================================
BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365

CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429
 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268
 expire_timers kernel/time/timer.c:1307 [inline]
 __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601
 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:638 [inline]
 smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044
 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702
RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline]
RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline]
RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343
RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10
RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006
RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98
RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60
R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0
 </IRQ>
 release_task+0xe9e/0x1a40 kernel/exit.c:220
 wait_task_zombie kernel/exit.c:1162 [inline]
 wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389
 do_wait_thread kernel/exit.c:1452 [inline]
 do_wait+0x441/0xa90 kernel/exit.c:1523
 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665
 SYSC_wait4+0x134/0x140 kernel/exit.c:1677
 SyS_wait4+0x2c/0x40 kernel/exit.c:1673
 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline]
 call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323
 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097
 worker_thread+0x223/0x1860 kernel/workqueue.c:2231
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425

Allocated by task 21267:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561
 ccid_new+0x20e/0x390 net/dccp/ccid.c:151
 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44
 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344
 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538
 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline]
 dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2269
 release_sock+0xa4/0x2a0 net/core/sock.c:2784
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1642
 SyS_connect+0x24/0x30 net/socket.c:1623
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3049:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190
 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225
 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833
 dccp_done+0xb7/0xd0 net/dccp/proto.c:145
 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72
 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160
 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521
 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:477 [inline]
 ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455
 process_backlog+0x203/0x740 net/core/dev.c:5130
 napi_poll net/core/dev.c:5527 [inline]
 net_rx_action+0x792/0x1910 net/core/dev.c:5593
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284

The buggy address belongs to the object at ffff8801d2660100
 which belongs to the cache ccid2_hc_tx_sock of size 1240
The buggy address is located 1088 bytes inside of
 1240-byte region [ffff8801d2660100, ffff8801d26605d8)
The buggy address belongs to the page:
page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0
flags: 0x200000000008100(slab|head)
raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005
raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
 ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 86bc40ba6ba5..b68168fcc06a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,6 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_sock.h>
+#include <net/inet_common.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 
@@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type)
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
+static void dccp_sk_destruct(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	inet_sock_destruct(sk);
+}
+
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
+	sk->sk_destruct		= dccp_sk_destruct;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
 	dp->dccps_mss_cache	= 536;
 	dp->dccps_rate_last	= jiffies;
@@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk)
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
 
 	/* clean up feature negotiation state */
 	dccp_feat_list_purge(&dp->dccps_featneg);
-- 
2.13.5


From 261a7e0f954a3a2c3d4cde558e1c9b64a0059812 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Aug 2017 05:26:17 -0700
Subject: [PATCH 04/25] ipv4: fix NULL dereference in free_fib_info_rcu()

[ Upstream commit 187e5b3ac84d3421d2de3aca949b2791fbcad554 ]

If fi->fib_metrics could not be allocated in fib_create_info()
we attempt to dereference a NULL pointer in free_fib_info_rcu() :

    m = fi->fib_metrics;
    if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
            kfree(m);

Before my recent patch, we used to call kfree(NULL) and nothing wrong
happened.

Instead of using RCU to defer freeing while we are under memory stress,
it seems better to take immediate action.

This was reported by syzkaller team.

Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 7563831fa432..38c1c979ecb1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1044,15 +1044,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 	if (!fi)
 		goto failure;
-	fib_info_cnt++;
 	if (cfg->fc_mx) {
 		fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
-		if (!fi->fib_metrics)
-			goto failure;
+		if (unlikely(!fi->fib_metrics)) {
+			kfree(fi);
+			return ERR_PTR(err);
+		}
 		atomic_set(&fi->fib_metrics->refcnt, 1);
-	} else
+	} else {
 		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
-
+	}
+	fib_info_cnt++;
 	fi->fib_net = net;
 	fi->fib_protocol = cfg->fc_protocol;
 	fi->fib_scope = cfg->fc_scope;
-- 
2.13.5


From 473e538fe6aa691b702f9d454748a6751f4d86e8 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Tue, 15 Aug 2017 16:37:04 +0300
Subject: [PATCH 05/25] net_sched/sfq: update hierarchical backlog when drop
 packet

[ Upstream commit 325d5dc3f7e7c2840b65e4a2988c082c2c0025c5 ]

When sfq_enqueue() drops head packet or packet from another queue it
have to update backlog at upper qdiscs too.

Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_sfq.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index bc5e99584e41..ea8a56f76b32 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -434,6 +434,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 		qdisc_drop(head, sch, to_free);
 
 		slot_queue_add(slot, skb);
+		qdisc_tree_reduce_backlog(sch, 0, delta);
 		return NET_XMIT_CN;
 	}
 
@@ -465,8 +466,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
 	 */
-	if (qlen != slot->qlen)
+	if (qlen != slot->qlen) {
+		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
 		return NET_XMIT_CN;
+	}
 
 	/* As we dropped a packet, better let upper stack know this */
 	qdisc_tree_reduce_backlog(sch, 1, dropped);
-- 
2.13.5


From 6e38092120c8a1adda0e680becae64b8351bebcd Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Tue, 15 Aug 2017 16:39:05 +0300
Subject: [PATCH 06/25] net_sched: remove warning from qdisc_hash_add

[ Upstream commit c90e95147c27b1780e76c6e8fea1b5c78d7d387f ]

It was added in commit e57a784d8cae ("pkt_sched: set root qdisc
before change() in attach_default_qdiscs()") to hide duplicates
from "tc qdisc show" for incative deivices.

After 59cc1f61f ("net: sched: convert qdisc linked list to hashtable")
it triggered when classful qdisc is added to inactive device because
default qdiscs are added before switching root qdisc.

Anyway after commit ea3274695353 ("net: sched: avoid duplicates in
qdisc dump") duplicates are filtered right in dumper.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ff27a85a71a9..195a3b2d9afc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -277,9 +277,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 void qdisc_hash_add(struct Qdisc *q)
 {
 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
-		struct Qdisc *root = qdisc_dev(q)->qdisc;
-
-		WARN_ON_ONCE(root == &noop_qdisc);
 		ASSERT_RTNL();
 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 	}
-- 
2.13.5


From 2536fc56659c822c8a21793315a1fd430c8a0258 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Aug 2017 01:45:33 +0200
Subject: [PATCH 07/25] bpf: fix bpf_trace_printk on 32 bit archs

[ Upstream commit 88a5c690b66110ad255380d8f629c629cf6ca559 ]

James reported that on MIPS32 bpf_trace_printk() is currently
broken while MIPS64 works fine:

  bpf_trace_printk() uses conditional operators to attempt to
  pass different types to __trace_printk() depending on the
  format operators. This doesn't work as intended on 32-bit
  architectures where u32 and long are passed differently to
  u64, since the result of C conditional operators follows the
  "usual arithmetic conversions" rules, such that the values
  passed to __trace_printk() will always be u64 [causing issues
  later in the va_list handling for vscnprintf()].

  For example the samples/bpf/tracex5 test printed lines like
  below on MIPS32, where the fd and buf have come from the u64
  fd argument, and the size from the buf argument:

    [...] 1180.941542: 0x00000001: write(fd=1, buf=  (null), size=6258688)

  Instead of this:

    [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512)

One way to get it working is to expand various combinations
of argument types into 8 different combinations for 32 bit
and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64
as well that it resolves the issue.

Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()")
Reported-by: James Hogan <james.hogan@imgtec.com>
Tested-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5dcb99281259..41805fb3c661 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -203,10 +203,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 		fmt_cnt++;
 	}
 
-	return __trace_printk(1/* fake ip will not be printed */, fmt,
-			      mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
-			      mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
-			      mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT()	__BPF_ARG3_TP()
+#define __BPF_TP(...)							\
+	__trace_printk(1 /* Fake ip will not be printed. */,		\
+		       fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...)						\
+	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\
+	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\
+	      : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...)						\
+	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\
+	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\
+	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...)						\
+	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\
+	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\
+	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+	return __BPF_TP_EMIT();
 }
 
 static const struct bpf_func_proto bpf_trace_printk_proto = {
-- 
2.13.5


From 283a1ad21a47cd8b557c25258d36ac3cc6206e29 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Wed, 16 Aug 2017 13:30:07 +0800
Subject: [PATCH 08/25] openvswitch: fix skb_panic due to the incorrect actions
 attrlen

[ Upstream commit 494bea39f3201776cdfddc232705f54a0bd210c4 ]

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
convertions, so it's true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
  ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
  ------------[ cut here ]------------
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   <IRQ>
   [<ffffffff8148be82>] skb_put+0x43/0x44
   [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
   [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
   [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
   [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
   [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much little than the orig_len:
  crash> struct sw_flow_actions 0xffff8812f539d000
  struct sw_flow_actions {
    rcu = {
      next = 0xffff8812f5398800,
      func = 0xffffe3b00035db32
    },
    orig_len = 1384,
    actions_len = 592,
    actions = 0xffff8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relative old kernel, but
the same risk still exists on the mainline, since we use the wrong
actions_len from the beginning.

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c  | 1 +
 net/openvswitch/datapath.c | 7 ++++---
 net/openvswitch/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 4e03f64709bc..05d9f42fc309 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1240,6 +1240,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		goto out;
 	}
 
+	OVS_CB(skb)->acts_origlen = acts->orig_len;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d67ea856067..453f806afe6e 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -383,7 +383,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-			      unsigned int hdrlen)
+			      unsigned int hdrlen, int actions_attrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -400,7 +400,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
 	/* OVS_PACKET_ATTR_ACTIONS */
 	if (upcall_info->actions_len)
-		size += nla_total_size(upcall_info->actions_len);
+		size += nla_total_size(actions_attrlen);
 
 	/* OVS_PACKET_ATTR_MRU */
 	if (upcall_info->mru)
@@ -467,7 +467,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	else
 		hlen = skb->len;
 
-	len = upcall_msg_size(upcall_info, hlen - cutlen);
+	len = upcall_msg_size(upcall_info, hlen - cutlen,
+			      OVS_CB(skb)->acts_origlen);
 	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ab85c1cae255..e19ace428e38 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -100,12 +100,14 @@ struct datapath {
  * @input_vport: The original vport packet came in on. This value is cached
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  * fragmented.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
 	u16			mru;
+	u16			acts_origlen;
 	u32			cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
-- 
2.13.5


From b86f23def8a3b1a3447caa9d0ad485fff4990356 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 10:36:47 -0700
Subject: [PATCH 09/25] ptr_ring: use kmalloc_array()

[ Upstream commit 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 ]

As found by syzkaller, malicious users can set whatever tx_queue_len
on a tun device and eventually crash the kernel.

Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small
ring buffer is not fast anyway.

Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h  | 9 +++++----
 include/linux/skb_array.h | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6c70444da3b9..b83507c0640c 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -340,9 +340,9 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
 	__PTR_RING_PEEK_CALL_v; \
 })
 
-static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp);
+	return kcalloc(size, sizeof(void *), gfp);
 }
 
 static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
@@ -417,7 +417,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
+					   unsigned int nrings,
 					   int size,
 					   gfp_t gfp, void (*destroy)(void *))
 {
@@ -425,7 +426,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 	void ***queues;
 	int i;
 
-	queues = kmalloc(nrings * sizeof *queues, gfp);
+	queues = kmalloc_array(nrings, sizeof(*queues), gfp);
 	if (!queues)
 		goto noqueues;
 
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade428f0..be8b902b5845 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -162,7 +162,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 }
 
 static inline int skb_array_resize_multiple(struct skb_array **rings,
-					    int nrings, int size, gfp_t gfp)
+					    int nrings, unsigned int size,
+					    gfp_t gfp)
 {
 	BUILD_BUG_ON(offsetof(struct skb_array, ring));
 	return ptr_ring_resize_multiple((struct ptr_ring **)rings,
-- 
2.13.5


From a99d77acad082de1ca1906d47b6a08938239bd6a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 11:09:12 -0700
Subject: [PATCH 10/25] ipv4: better IP_MAX_MTU enforcement

[ Upstream commit c780a049f9bf442314335372c9abc4548bfe3e44 ]

While working on yet another syzkaller report, I found
that our IP_MAX_MTU enforcements were not properly done.

gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and
final result can be bigger than IP_MAX_MTU :/

This is a problem because device mtu can be changed on other cpus or
threads.

While this patch does not fix the issue I am working on, it is
probably worth addressing it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++--
 net/ipv4/route.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index d3a107850a41..51c6b9786c46 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -339,7 +339,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    !forwarding)
 		return dst_mtu(dst);
 
-	return min(dst->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
@@ -351,7 +351,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
 	}
 
-	return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
 u32 ip_idents_reserve(u32 hash, int segs);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6cd49fd17ac0..6a5b7783932e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1247,7 +1247,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	if (mtu)
 		return mtu;
 
-	mtu = dst->dev->mtu;
+	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
 		if (rt->rt_uses_gateway && mtu > 576)
-- 
2.13.5


From 7afaa3a7065889c1d1241056902e886f4444a567 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 12:11:50 +0100
Subject: [PATCH 11/25] nfp: fix infinite loop on umapping cleanup

[ Upstream commit eac2c68d663effb077210218788952b5a0c1f60e ]

The while loop that performs the dma page unmapping never decrements
index counter f and hence loops forever. Fix this with a pre-decrement
on f.

Detected by CoverityScan, CID#1357309 ("Infinite loop")

Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index aee3fd2b6538..4ca82bd8c4f0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -871,8 +871,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 
 err_unmap:
-	--f;
-	while (f >= 0) {
+	while (--f >= 0) {
 		frag = &skb_shinfo(skb)->frags[f];
 		dma_unmap_page(&nn->pdev->dev,
 			       tx_ring->txbufs[wr_idx].dma_addr,
-- 
2.13.5


From baf3d6d1470d8e8af92e5841fa28728bf6f43479 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 16 Aug 2017 20:16:40 +0200
Subject: [PATCH 12/25] sctp: fully initialize the IPv6 address in
 sctp_v6_to_addr()

[ Upstream commit 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d ]

KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and
sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below).
Make sure all fields of an IPv6 address are initialized, which
guarantees that the IPv4 fields are also initialized.

==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================

Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0c090600f377..ca4a63e3eadd 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -512,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
 {
 	addr->sa.sa_family = AF_INET6;
 	addr->v6.sin6_port = port;
+	addr->v6.sin6_flowinfo = 0;
 	addr->v6.sin6_addr = *saddr;
+	addr->v6.sin6_scope_id = 0;
 }
 
 /* Compare addresses exactly.
-- 
2.13.5


From b5910da3749d1891338a6677ad2b951b7af9372e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 09:41:54 -0700
Subject: [PATCH 13/25] tipc: fix use-after-free

[ Upstream commit 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 ]

syszkaller reported use-after-free in tipc [1]

When msg->rep skb is freed, set the pointer to NULL,
so that caller does not free it again.

[1]

==================================================================
BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466
Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115

CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 skb_push+0xd4/0xe0 net/core/skbuff.c:1466
 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9
RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76
R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651
 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219
 alloc_skb include/linux/skbuff.h:903 [inline]
 tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148
 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb+0x165/0x4c0 net/core/skbuff.c:699
 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801c6e71dc0
 which belongs to the cache skbuff_head_cache of size 224
The buggy address is located 208 bytes inside of
 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0)
The buggy address belongs to the page:
page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c
raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
 ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
                         ^
 ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov  <dvyukov@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1fd464764765..aedc476fac02 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 	arg = nlmsg_new(0, GFP_KERNEL);
 	if (!arg) {
 		kfree_skb(msg->rep);
+		msg->rep = NULL;
 		return -ENOMEM;
 	}
 
 	err = __tipc_nl_compat_dumpit(cmd, msg, arg);
-	if (err)
+	if (err) {
 		kfree_skb(msg->rep);
-
+		msg->rep = NULL;
+	}
 	kfree_skb(arg);
 
 	return err;
-- 
2.13.5


From 911d75453c23fee19aac4bf8e9c21ad74b40bd6f Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 16 Aug 2017 11:18:09 -0700
Subject: [PATCH 14/25] ipv6: reset fn->rr_ptr when replacing route

[ Upstream commit 383143f31d7d3525a1dbff733d52fff917f82f15 ]

syzcaller reported the following use-after-free issue in rt6_select():
BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8
BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8
Read of size 4 by task syz-executor1/439628
CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00
 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0
 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380
Call Trace:
 [<ffffffff81ca384d>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff81ca384d>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option.
Use struct sctp_sack_info instead
 [<ffffffff81735751>] kasan_object_err+0x21/0x70 mm/kasan/report.c:158
 [<ffffffff817359c4>] print_address_description mm/kasan/report.c:196 [inline]
 [<ffffffff817359c4>] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285
 [<ffffffff81735d93>] kasan_report mm/kasan/report.c:305 [inline]
 [<ffffffff81735d93>] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325
 [<ffffffff82a28e39>] rt6_select net/ipv6/route.c:755 [inline]
 [<ffffffff82a28e39>] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084
 [<ffffffff82a28fb1>] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203
 [<ffffffff82ab0a50>] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95
 [<ffffffff8265cbb6>] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223
 [<ffffffff82ab1430>] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41
 [<ffffffff82a22006>] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224
 [<ffffffff829e83d2>] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943
 [<ffffffff829e889a>] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079
 [<ffffffff82a9f7d8>] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91
 [<ffffffff82aa0978>] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline]
 [<ffffffff82aa0978>] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272
 [<ffffffff82aa1313>] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284
 [<ffffffff8292f790>] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564
 [<ffffffff82565547>] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582
 [<ffffffff8256a649>] SyS_connect+0x29/0x30 net/socket.c:1563
 [<ffffffff82c72032>] entry_SYSCALL_64_fastpath+0x12/0x17
Object at ffff8800bc699380, in cache ip6_dst_cache size: 384

The root cause of it is that in fib6_add_rt2node(), when it replaces an
existing route with the new one, it does not update fn->rr_ptr.
This commit resets fn->rr_ptr to NULL when it points to a route which is
replaced in fib6_add_rt2node().

Fixes: 27596472473a ("ipv6: fix ECMP route replacement")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4345ee39f180..1421a846ad36 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -897,6 +897,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		}
 		nsiblings = iter->rt6i_nsiblings;
 		fib6_purge_rt(iter, fn, info->nl_net);
+		if (fn->rr_ptr == iter)
+			fn->rr_ptr = NULL;
 		rt6_release(iter);
 
 		if (nsiblings) {
@@ -909,6 +911,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->dst.rt6_next;
 					fib6_purge_rt(iter, fn, info->nl_net);
+					if (fn->rr_ptr == iter)
+						fn->rr_ptr = NULL;
 					rt6_release(iter);
 					nsiblings--;
 				} else {
-- 
2.13.5


From 24d3a0cd900d8a3057e4bac588c06f230918b6af Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 18 Aug 2017 17:14:49 -0700
Subject: [PATCH 15/25] ipv6: repair fib6 tree in failure case

[ Upstream commit 348a4002729ccab8b888b38cbc099efa2f2a2036 ]

In fib6_add(), it is possible that fib6_add_1() picks an intermediate
node and sets the node's fn->leaf to NULL in order to add this new
route. However, if fib6_add_rt2node() fails to add the new
route for some reason, fn->leaf will be left as NULL and could
potentially cause crash when fn->leaf is accessed in fib6_locate().
This patch makes sure fib6_repair_tree() is called to properly repair
fn->leaf in the above failure case.

Here is the syzkaller reported general protection fault in fib6_locate:
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] ipv6_prefix_equal include/net/ipv6.h:492 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233
RSP: 0018:ffff8801d01a36a8  EFLAGS: 00010202
RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000
RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100
RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000
R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000
FS:  00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack:
 ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0
 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0
 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988
Call Trace:
 [<ffffffff82a223d6>] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109
 [<ffffffff82a23f9d>] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075
 [<ffffffff82621359>] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450
 [<ffffffff8274c1d1>] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281
 [<ffffffff82613ddf>] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456
 [<ffffffff8274ad38>] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline]
 [<ffffffff8274ad38>] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232
 [<ffffffff8274b83e>] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778
 [<ffffffff82564aff>] sock_sendmsg_nosec net/socket.c:609 [inline]
 [<ffffffff82564aff>] sock_sendmsg+0xcf/0x110 net/socket.c:619
 [<ffffffff82564d62>] sock_write_iter+0x222/0x3a0 net/socket.c:834
 [<ffffffff8178523d>] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478
 [<ffffffff817853f4>] __vfs_write+0xe4/0x110 fs/read_write.c:491
 [<ffffffff81786c38>] vfs_write+0x178/0x4b0 fs/read_write.c:538
 [<ffffffff817892a9>] SYSC_write fs/read_write.c:585 [inline]
 [<ffffffff817892a9>] SyS_write+0xd9/0x1b0 fs/read_write.c:577
 [<ffffffff82c71e32>] entry_SYSCALL_64_fastpath+0x12/0x17

Note: there is no "Fixes" tag as this seems to be a bug introduced
very early.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1421a846ad36..ff389591a340 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1001,7 +1001,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			/* Create subtree root node */
 			sfn = node_alloc();
 			if (!sfn)
-				goto st_failure;
+				goto failure;
 
 			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
@@ -1017,12 +1017,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				/* If it is failed, discard just allocated
-				   root, and then (in st_failure) stale node
+				   root, and then (in failure) stale node
 				   in main tree.
 				 */
 				node_free(sfn);
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 
 			/* Now link new subtree to main tree */
@@ -1036,7 +1036,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 		}
 
@@ -1078,22 +1078,22 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			atomic_inc(&pn->leaf->rt6i_ref);
 		}
 #endif
-		if (!(rt->dst.flags & DST_NOCACHE))
-			dst_free(&rt->dst);
+		goto failure;
 	}
 	return err;
 
-#ifdef CONFIG_IPV6_SUBTREES
-	/* Subtree creation failed, probably main tree node
-	   is orphan. If it is, shoot it.
+failure:
+	/* fn->leaf could be NULL if fn is an intermediate node and we
+	 * failed to add the new route to it in both subtree creation
+	 * failure and fib6_add_rt2node() failure case.
+	 * In both cases, fib6_repair_tree() should be called to fix
+	 * fn->leaf.
 	 */
-st_failure:
 	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
 		fib6_repair_tree(info->nl_net, fn);
 	if (!(rt->dst.flags & DST_NOCACHE))
 		dst_free(&rt->dst);
 	return err;
-#endif
 }
 
 /*
-- 
2.13.5


From 9d7d7c3d7dcd12b0b6780a01e4cf5f9ae7ca69be Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 16 Aug 2017 17:53:36 -0400
Subject: [PATCH 16/25] tcp: when rearming RTO, if RTO time is in past then
 fire RTO ASAP

[ Upstream commit cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 ]

In some situations tcp_send_loss_probe() can realize that it's unable
to send a loss probe (TLP), and falls back to calling tcp_rearm_rto()
to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto()
realizes that the RTO was eligible to fire immediately or at some
point in the past (delta_us <= 0). Previously in such cases
tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now +
icsk_rto, which caused needless delays of hundreds of milliseconds
(and non-linear behavior that made reproducible testing
difficult). This commit changes the logic to schedule "overdue" RTOs
ASAP, rather than at now + icsk_rto.

Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)")
Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 32c540145c17..9d9f5ba72493 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3036,8 +3036,7 @@ void tcp_rearm_rto(struct sock *sk)
 			/* delta may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
-			if (delta > 0)
-				rto = delta;
+			delta = max(delta, 1);
 		}
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);
-- 
2.13.5


From e4d55c566e2b9d39215edf2fa58d31a68237e862 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Aug 2017 18:29:52 +0300
Subject: [PATCH 17/25] net/mlx4_core: Enable 4K UAR if SRIOV module parameter
 is not enabled

[ Upstream commit ca3d89a3ebe79367bd41b6b8ba37664478ae2dba ]

enable_4k_uar module parameter was added in patch cited below to
address the backward compatibility issue in SRIOV when the VM has
system's PAGE_SIZE uar implementation and the Hypervisor has 4k uar
implementation.

The above compatibility issue does not exist in the non SRIOV case.
In this patch, we always enable 4k uar implementation if SRIOV
is not enabled on mlx4's supported cards.

Fixes: 76e39ccf9c36 ("net/mlx4_core: Fix backward compatibility on VFs")
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 551786f58e59..ba652d8a2b93 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -430,7 +430,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		/* Virtual PCI function needs to determine UAR page size from
 		 * firmware. Only master PCI function can set the uar page size
 		 */
-		if (enable_4k_uar)
+		if (enable_4k_uar || !dev->persist->num_vfs)
 			dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
 		else
 			dev->uar_page_shift = PAGE_SHIFT;
@@ -2269,7 +2269,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
 		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
 
-		if (enable_4k_uar) {
+		if (enable_4k_uar || !dev->persist->num_vfs) {
 			init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
 						    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
 			init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
-- 
2.13.5


From 2c9d33de4e85d8d2fe7f3ed2263908b241a1f04c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 17 Aug 2017 23:14:58 +0100
Subject: [PATCH 18/25] irda: do not leak initialized list.dev to userspace

[ Upstream commit b024d949a3c24255a7ef1a470420eb478949aa4c ]

list.dev has not been initialized and so the copy_to_user is copying
data from the stack back to user space which is a potential
information leak. Fix this ensuring all of list is initialized to
zero.

Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/af_irda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 391c3cbd2eed..101ed6c42808 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2223,7 +2223,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct irda_sock *self = irda_sk(sk);
-	struct irda_device_list list;
+	struct irda_device_list list = { 0 };
 	struct irda_device_info *discoveries;
 	struct irda_ias_set *	ias_opt;	/* IAS get/query params */
 	struct ias_object *	ias_obj;	/* Object in IAS */
-- 
2.13.5


From 8084e255021919eee67d41b6c7207b138ff373f7 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 18 Aug 2017 11:01:36 +0800
Subject: [PATCH 19/25] net: sched: fix NULL pointer dereference when action
 calls some targets

[ Upstream commit 4f8a881acc9d1adaf1e552349a0b1df28933a04c ]

As we know in some target's checkentry it may dereference par.entryinfo
to check entry stuff inside. But when sched action calls xt_check_target,
par.entryinfo is set with NULL. It would cause kernel panic when calling
some targets.

It can be reproduce with:
  # tc qd add dev eth1 ingress handle ffff:
  # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \
    -j ECN --ecn-tcp-remove

It could also crash kernel when using target CLUSTERIP or TPROXY.

By now there's no proper value for par.entryinfo in ipt_init_target,
but it can not be set with NULL. This patch is to void all these
panics by setting it with an ipt_entry obj with all members = 0.

Note that this issue has been there since the very beginning.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ipt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index a1aec0a6c789..50030519a89b 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -41,6 +41,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table,
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
+	struct ipt_entry e = {};
 	int ret = 0;
 
 	target = xt_request_find_target(AF_INET, t->u.user.name,
@@ -51,6 +52,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table,
 	t->u.kernel.target = target;
 	memset(&par, 0, sizeof(par));
 	par.table     = table;
+	par.entryinfo = &e;
 	par.target    = target;
 	par.targinfo  = t->data;
 	par.hook_mask = hook;
-- 
2.13.5


From 77a2295177e36d37194aac13a80c9d5d663a54db Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sat, 19 Aug 2017 15:37:07 +0300
Subject: [PATCH 20/25] net_sched: fix order of queue length updates in
 qdisc_replace()

[ Upstream commit 68a66d149a8c78ec6720f268597302883e48e9fa ]

This important to call qdisc_tree_reduce_backlog() after changing queue
length. Parent qdisc should deactivate class in ->qlen_notify() called from
qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero.

Missed class deactivations leads to crashes/warnings at picking packets
from empty qdisc and corrupting state at reactivating this class in future.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper")
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672..f18fc1a0321f 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -768,8 +768,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+		unsigned int qlen = old->q.qlen;
+		unsigned int backlog = old->qstats.backlog;
+
 		qdisc_reset(old);
+		qdisc_tree_reduce_backlog(old, qlen, backlog);
 	}
 	sch_tree_unlock(sch);
 
-- 
2.13.5


From 4a32a692cd1acd3991a70110a76218a22155bb05 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 2 Jul 2017 02:13:30 +0200
Subject: [PATCH 21/25] bpf, verifier: add additional patterns to
 evaluate_reg_imm_alu

[ Upstream commit 43188702b3d98d2792969a3377a30957f05695e6 ]

Currently the verifier does not track imm across alu operations when
the source register is of unknown type. This adds additional pattern
matching to catch this and track imm. We've seen LLVM generating this
pattern while working on cilium.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8ce679d36c58..404b6ea7dd92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1467,6 +1467,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+					struct bpf_insn *insn)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+	/* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+	if (src_reg->imm > 0 && dst_reg->imm) {
+		switch (opcode) {
+		case BPF_ADD:
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			break;
+		case BPF_AND:
+			/* dreg &= sreg
+			 * AND can not extend zero bits only shrink
+			 * Ex.  0x00..00ffffff
+			 *    & 0x0f..ffffffff
+			 *     ----------------
+			 *      0x00..00ffffff
+			 */
+			dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_OR:
+			/* dreg |= sreg
+			 * OR can only extend zero bits
+			 * Ex.  0x00..00ffffff
+			 *    | 0x0f..ffffffff
+			 *     ----------------
+			 *      0x0f..00ffffff
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_SUB:
+		case BPF_MUL:
+		case BPF_RSH:
+		case BPF_LSH:
+			/* These may be flushed out later */
+		default:
+			mark_reg_unknown_value(regs, insn->dst_reg);
+		}
+	} else {
+		mark_reg_unknown_value(regs, insn->dst_reg);
+	}
+
+	dst_reg->type = UNKNOWN_VALUE;
+	return 0;
+}
+
 static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 				struct bpf_insn *insn)
 {
@@ -1475,6 +1534,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
 	u8 opcode = BPF_OP(insn->code);
 
+	if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+		return evaluate_reg_imm_alu_unknown(env, insn);
+
 	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
 	 * Don't care about overflow or negative values, just add them
 	 */
-- 
2.13.5


From 6e9c7a507b78cb8b311088c34c9a52dce7eba6a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 18 May 2017 03:00:06 +0200
Subject: [PATCH 22/25] bpf: adjust verifier heuristics

[ Upstream commit 3c2ce60bdd3d57051bf85615deec04a694473840 ]

Current limits with regards to processing program paths do not
really reflect today's needs anymore due to programs becoming
more complex and verifier smarter, keeping track of more data
such as const ALU operations, alignment tracking, spilling of
PTR_TO_MAP_VALUE_ADJ registers, and other features allowing for
smarter matching of what LLVM generates.

This also comes with the side-effect that we result in fewer
opportunities to prune search states and thus often need to do
more work to prove safety than in the past due to different
register states and stack layout where we mismatch. Generally,
it's quite hard to determine what caused a sudden increase in
complexity, it could be caused by something as trivial as a
single branch somewhere at the beginning of the program where
LLVM assigned a stack slot that is marked differently throughout
other branches and thus causing a mismatch, where verifier
then needs to prove safety for the whole rest of the program.
Subsequently, programs with even less than half the insn size
limit can get rejected. We noticed that while some programs
load fine under pre 4.11, they get rejected due to hitting
limits on more recent kernels. We saw that in the vast majority
of cases (90+%) pruning failed due to register mismatches. In
case of stack mismatches, majority of cases failed due to
different stack slot types (invalid, spill, misc) rather than
differences in spilled registers.

This patch makes pruning more aggressive by also adding markers
that sit at conditional jumps as well. Currently, we only mark
jump targets for pruning. For example in direct packet access,
these are usually error paths where we bail out. We found that
adding these markers, it can reduce number of processed insns
by up to 30%. Another option is to ignore reg->id in probing
PTR_TO_MAP_VALUE_OR_NULL registers, which can help pruning
slightly as well by up to 7% observed complexity reduction as
stand-alone. Meaning, if a previous path with register type
PTR_TO_MAP_VALUE_OR_NULL for map X was found to be safe, then
in the current state a PTR_TO_MAP_VALUE_OR_NULL register for
the same map X must be safe as well. Last but not least the
patch also adds a scheduling point and bumps the current limit
for instructions to be processed to a more adequate value.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 404b6ea7dd92..2ee2e7970df6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -139,7 +139,7 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };
 
-#define BPF_COMPLEXITY_LIMIT_INSNS	65536
+#define BPF_COMPLEXITY_LIMIT_INSNS	98304
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 
 struct bpf_call_arg_meta {
@@ -2452,6 +2452,7 @@ static int check_cfg(struct bpf_verifier_env *env)
 				env->explored_states[t + 1] = STATE_LIST_MARK;
 		} else {
 			/* conditional jump with two edges */
+			env->explored_states[t] = STATE_LIST_MARK;
 			ret = push_insn(t, t + 1, FALLTHROUGH, env);
 			if (ret == 1)
 				goto peek_stack;
@@ -2610,6 +2611,12 @@ static bool states_equal(struct bpf_verifier_env *env,
 		     rcur->type != NOT_INIT))
 			continue;
 
+		/* Don't care about the reg->id in this case. */
+		if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+		    rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+		    rold->map_ptr == rcur->map_ptr)
+			continue;
+
 		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
 		    compare_ptrs_to_packet(rold, rcur))
 			continue;
@@ -2744,6 +2751,9 @@ static int do_check(struct bpf_verifier_env *env)
 			goto process_bpf_exit;
 		}
 
+		if (need_resched())
+			cond_resched();
+
 		if (log_level && do_print_state) {
 			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
 			print_verifier_state(&env->cur_state);
-- 
2.13.5


From 787b8354be98d2d73637c673d2d64f45f85ffce8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 31 Mar 2017 02:24:02 +0200
Subject: [PATCH 23/25] bpf, verifier: fix alu ops against map_value{, _adj}
 register types

[ Upstream commit fce366a9dd0ddc47e7ce05611c266e8574a45116 ]

While looking into map_value_adj, I noticed that alu operations
directly on the map_value() resp. map_value_adj() register (any
alu operation on a map_value() register will turn it into a
map_value_adj() typed register) are not sufficiently protected
against some of the operations. Two non-exhaustive examples are
provided that the verifier needs to reject:

 i) BPF_AND on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xbf842a00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (57) r0 &= 8
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=8 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

ii) BPF_ADD in 32 bit mode on r0 (map_value_adj):

  0: (bf) r2 = r10
  1: (07) r2 += -8
  2: (7a) *(u64 *)(r2 +0) = 0
  3: (18) r1 = 0xc24eee00
  5: (85) call bpf_map_lookup_elem#1
  6: (15) if r0 == 0x0 goto pc+2
   R0=map_value(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  7: (04) (u32) r0 += (u32) 0
  8: (7a) *(u64 *)(r0 +0) = 22
   R0=map_value_adj(ks=8,vs=48,id=0),min_value=0,max_value=0 R10=fp
  9: (95) exit

  from 6 to 9: R0=inv,min_value=0,max_value=0 R10=fp
  9: (95) exit
  processed 10 insns

Issue is, while min_value / max_value boundaries for the access
are adjusted appropriately, we change the pointer value in a way
that cannot be sufficiently tracked anymore from its origin.
Operations like BPF_{AND,OR,DIV,MUL,etc} on a destination register
that is PTR_TO_MAP_VALUE{,_ADJ} was probably unintended, in fact,
all the test cases coming with 484611357c19 ("bpf: allow access
into map value arrays") perform BPF_ADD only on the destination
register that is PTR_TO_MAP_VALUE_ADJ.

Only for UNKNOWN_VALUE register types such operations make sense,
f.e. with unknown memory content fetched initially from a constant
offset from the map value memory into a register. That register is
then later tested against lower / upper bounds, so that the verifier
can then do the tracking of min_value / max_value, and properly
check once that UNKNOWN_VALUE register is added to the destination
register with type PTR_TO_MAP_VALUE{,_ADJ}. This is also what the
original use-case is solving. Note, tracking on what is being
added is done through adjust_reg_min_max_vals() and later access
to the map value enforced with these boundaries and the given offset
from the insn through check_map_access_adj().

Tests will fail for non-root environment due to prohibited pointer
arithmetic, in particular in check_alu_op(), we bail out on the
is_pointer_value() check on the dst_reg (which is false in root
case as we allow for pointer arithmetic via env->allow_ptr_leaks).

Similarly to PTR_TO_PACKET, one way to fix it is to restrict the
allowed operations on PTR_TO_MAP_VALUE{,_ADJ} registers to 64 bit
mode BPF_ADD. The test_verifier suite runs fine after the patch
and it also rejects mentioned test cases.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2ee2e7970df6..df0485bdaccd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1870,6 +1870,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		 * register as unknown.
 		 */
 		if (env->allow_ptr_leaks &&
+		    BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
 		    (dst_reg->type == PTR_TO_MAP_VALUE ||
 		     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
 			dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
-- 
2.13.5


From b69dfb0fd8acb624b0dcfc3f5c1905f8b6d143a3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Jul 2017 00:00:21 +0200
Subject: [PATCH 24/25] bpf: fix mixed signed/unsigned derived min/max value
 bounds

[ Upstream commit 4cabc5b186b5427b9ee5a7495172542af105f02b ]

Edward reported that there's an issue in min/max value bounds
tracking when signed and unsigned compares both provide hints
on limits when having unknown variables. E.g. a program such
as the following should have been rejected:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff8a94cda93400
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  11: (65) if r1 s> 0x1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

What happens is that in the first part ...

   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3

... r1 carries an unsigned value, and is compared as unsigned
against a register carrying an immediate. Verifier deduces in
reg_set_min_max() that since the compare is unsigned and operation
is greater than (>), that in the fall-through/false case, r1's
minimum bound must be 0 and maximum bound must be r2. Latter is
larger than the bound and thus max value is reset back to being
'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now
'R1=inv,min_value=0'. The subsequent test ...

  11: (65) if r1 s> 0x1 goto pc+2

... is a signed compare of r1 with immediate value 1. Here,
verifier deduces in reg_set_min_max() that since the compare
is signed this time and operation is greater than (>), that
in the fall-through/false case, we can deduce that r1's maximum
bound must be 1, meaning with prior test, we result in r1 having
the following state: R1=inv,min_value=0,max_value=1. Given that
the actual value this holds is -8, the bounds are wrongly deduced.
When this is being added to r0 which holds the map_value(_adj)
type, then subsequent store access in above case will go through
check_mem_access() which invokes check_map_access_adj(), that
will then probe whether the map memory is in bounds based
on the min_value and max_value as well as access size since
the actual unknown value is min_value <= x <= max_value; commit
fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{,
_adj} register types") provides some more explanation on the
semantics.

It's worth to note in this context that in the current code,
min_value and max_value tracking are used for two things, i)
dynamic map value access via check_map_access_adj() and since
commit 06c1c049721a ("bpf: allow helpers access to variable memory")
ii) also enforced at check_helper_mem_access() when passing a
memory address (pointer to packet, map value, stack) and length
pair to a helper and the length in this case is an unknown value
defining an access range through min_value/max_value in that
case. The min_value/max_value tracking is /not/ used in the
direct packet access case to track ranges. However, the issue
also affects case ii), for example, the following crafted program
based on the same principle must be rejected as well:

   0: (b7) r2 = 0
   1: (bf) r3 = r10
   2: (07) r3 += -512
   3: (7a) *(u64 *)(r10 -16) = -8
   4: (79) r4 = *(u64 *)(r10 -16)
   5: (b7) r6 = -1
   6: (2d) if r4 > r6 goto pc+5
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
   7: (65) if r4 s> 0x1 goto pc+4
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1
  R10=fp
   8: (07) r4 += 1
   9: (b7) r5 = 0
  10: (6a) *(u16 *)(r10 -512) = 0
  11: (85) call bpf_skb_load_bytes#26
  12: (b7) r0 = 0
  13: (95) exit

Meaning, while we initialize the max_value stack slot that the
verifier thinks we access in the [1,2] range, in reality we
pass -7 as length which is interpreted as u32 in the helper.
Thus, this issue is relevant also for the case of helper ranges.
Resetting both bounds in check_reg_overflow() in case only one
of them exceeds limits is also not enough as similar test can be
created that uses values which are within range, thus also here
learned min value in r1 is incorrect when mixed with later signed
test to create a range:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff880ad081fa00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (65) if r1 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

This leaves us with two options for fixing this: i) to invalidate
all prior learned information once we switch signed context, ii)
to track min/max signed and unsigned boundaries separately as
done in [0]. (Given latter introduces major changes throughout
the whole verifier, it's rather net-next material, thus this
patch follows option i), meaning we can derive bounds either
from only signed tests or only unsigned tests.) There is still the
case of adjust_reg_min_max_vals(), where we adjust bounds on ALU
operations, meaning programs like the following where boundaries
on the reg get mixed in context later on when bounds are merged
on the dst reg must get rejected, too:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff89b2bf87ce00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+6
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (b7) r7 = 1
  12: (65) if r7 s> 0x0 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp
  13: (b7) r0 = 0
  14: (95) exit

  from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp
  15: (0f) r7 += r1
  16: (65) if r7 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  17: (0f) r0 += r7
  18: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  19: (b7) r0 = 0
  20: (95) exit

Meaning, in adjust_reg_min_max_vals() we must also reset range
values on the dst when src/dst registers have mixed signed/
unsigned derived min/max value bounds with one unbounded value
as otherwise they can be added together deducing false boundaries.
Once both boundaries are established from either ALU ops or
compare operations w/o mixing signed/unsigned insns, then they
can safely be added to other regs also having both boundaries
established. Adding regs with one unbounded side to a map value
where the bounded side has been learned w/o mixing ops is
possible, but the resulting map value won't recover from that,
meaning such op is considered invalid on the time of actual
access. Invalid bounds are set on the dst reg in case i) src reg,
or ii) in case dst reg already had them. The only way to recover
would be to perform i) ALU ops but only 'add' is allowed on map
value types or ii) comparisons, but these are disallowed on
pointers in case they span a range. This is fine as only BPF_JEQ
and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers
which potentially turn them into PTR_TO_MAP_VALUE type depending
on the branch, so only here min/max value cannot be invalidated
for them.

In terms of state pruning, value_from_signed is considered
as well in states_equal() when dealing with adjusted map values.
With regards to breaking existing programs, there is a small
risk, but use-cases are rather quite narrow where this could
occur and mixing compares probably unlikely.

Joint work with Josef and Edward.

  [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Reported-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/verifier.c        | 110 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a13b031dc6b8..3101141661a1 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -40,6 +40,7 @@ struct bpf_reg_state {
 	 */
 	s64 min_value;
 	u64 max_value;
+	bool value_from_signed;
 };
 
 enum bpf_stack_slot_type {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df0485bdaccd..4f9fa4c24c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -682,12 +682,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
 	return -EACCES;
 }
 
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+			       const struct bpf_reg_state *reg)
 {
-	if (env->allow_ptr_leaks)
+	if (allow_ptr_leaks)
 		return false;
 
-	switch (env->cur_state.regs[regno].type) {
+	switch (reg->type) {
 	case UNKNOWN_VALUE:
 	case CONST_IMM:
 		return false;
@@ -696,6 +697,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
 static int check_ptr_alignment(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg, int off, int size)
 {
@@ -1592,10 +1598,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	}
 
 	/* We don't know anything about what was done to this register, mark it
-	 * as unknown.
+	 * as unknown. Also, if both derived bounds came from signed/unsigned
+	 * mixed compares and one side is unbounded, we cannot really do anything
+	 * with them as boundaries cannot be trusted. Thus, arithmetic of two
+	 * regs of such kind will get invalidated bounds on the dst side.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE &&
-	    max_val == BPF_REGISTER_MAX_RANGE) {
+	if ((min_val == BPF_REGISTER_MIN_RANGE &&
+	     max_val == BPF_REGISTER_MAX_RANGE) ||
+	    (BPF_SRC(insn->code) == BPF_X &&
+	     ((min_val != BPF_REGISTER_MIN_RANGE &&
+	       max_val == BPF_REGISTER_MAX_RANGE) ||
+	      (min_val == BPF_REGISTER_MIN_RANGE &&
+	       max_val != BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+	     regs[insn->dst_reg].value_from_signed !=
+	     regs[insn->src_reg].value_from_signed)) {
 		reset_reg_range_values(regs, insn->dst_reg);
 		return;
 	}
@@ -1939,38 +1959,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
+		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
 		 */
 		false_reg->max_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val + 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
+		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the mimimum value is val.
 		 */
 		false_reg->max_value = val - 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -1978,6 +2023,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -1987,39 +2038,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
 				u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
+		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/*
 		 * If this is false, then the val is <= the register, if it is
 		 * true the register <= to the val.
 		 */
 		false_reg->min_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val - 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
+		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/* If this is false then constant < register, if it is true then
 		 * the register < constant.
 		 */
 		false_reg->min_value = val + 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2027,6 +2103,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-- 
2.13.5


From 39ee5266e0a067354b0a835e663e9c78c154d53a Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 21 Jul 2017 14:37:34 +0100
Subject: [PATCH 25/25] bpf/verifier: fix min/max handling in BPF_SUB

[ Upstream commit 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 ]

We have to subtract the src max from the dst min, and vice-versa, since
 (e.g.) the smallest result comes from the largest subtrahend.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4f9fa4c24c30..779c871c5dcd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1624,10 +1624,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	 * do our normal operations to the register, we need to set the values
 	 * to the min/max since they are undefined.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE)
-		dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
-	if (max_val == BPF_REGISTER_MAX_RANGE)
-		dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+	if (opcode != BPF_SUB) {
+		if (min_val == BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		if (max_val == BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+	}
 
 	switch (opcode) {
 	case BPF_ADD:
@@ -1637,10 +1639,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->max_value += max_val;
 		break;
 	case BPF_SUB:
+		/* If one of our values was at the end of our ranges, then the
+		 * _opposite_ value in the dst_reg goes to the end of our range.
+		 */
+		if (min_val == BPF_REGISTER_MIN_RANGE)
+			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+		if (max_val == BPF_REGISTER_MAX_RANGE)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
 		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
-			dst_reg->min_value -= min_val;
+			dst_reg->min_value -= max_val;
 		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
-			dst_reg->max_value -= max_val;
+			dst_reg->max_value -= min_val;
 		break;
 	case BPF_MUL:
 		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
-- 
2.13.5


[-- Attachment #3: net_412.mbox --]
[-- Type: Application/Octet-Stream, Size: 110306 bytes --]

From 6d1d438884ad68706bad96d4caf64828275c82ed Mon Sep 17 00:00:00 2001
From: Andreas Born <futur.andy@googlemail.com>
Date: Thu, 10 Aug 2017 06:41:44 +0200
Subject: [PATCH 01/27] bonding: require speed/duplex only for 802.3ad, alb and
 tlb

[ Upstream commit ad729bc9acfb7c47112964b4877ef5404578ed13 ]

The patch c4adfc822bf5 ("bonding: make speed, duplex setting consistent
with link state") puts the link state to down if
bond_update_speed_duplex() cannot retrieve speed and duplex settings.
Assumably the patch was written with 802.3ad mode in mind which relies
on link speed/duplex settings. For other modes like active-backup these
settings are not required. Thus, only for these other modes, this patch
reintroduces support for slaves that do not support reporting speed or
duplex such as wireless devices. This fixes the regression reported in
bug 196547 (https://bugzilla.kernel.org/show_bug.cgi?id=196547).

Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent
with link state")
Signed-off-by: Andreas Born <futur.andy@googlemail.com>
Acked-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 6 ++++--
 include/net/bonding.h           | 5 +++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 224e93aa6d23..a063d28304ff 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1569,7 +1569,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	new_slave->delay = 0;
 	new_slave->link_failure_count = 0;
 
-	if (bond_update_speed_duplex(new_slave))
+	if (bond_update_speed_duplex(new_slave) &&
+	    bond_needs_speed_duplex(bond))
 		new_slave->link = BOND_LINK_DOWN;
 
 	new_slave->last_rx = jiffies -
@@ -2137,7 +2138,8 @@ static void bond_miimon_commit(struct bonding *bond)
 			continue;
 
 		case BOND_LINK_UP:
-			if (bond_update_speed_duplex(slave)) {
+			if (bond_update_speed_duplex(slave) &&
+			    bond_needs_speed_duplex(bond)) {
 				slave->link = BOND_LINK_DOWN;
 				netdev_warn(bond->dev,
 					    "failed to get link speed/duplex for %s\n",
diff --git a/include/net/bonding.h b/include/net/bonding.h
index b00508d22e0a..b2e68657a216 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -277,6 +277,11 @@ static inline bool bond_is_lb(const struct bonding *bond)
 	       BOND_MODE(bond) == BOND_MODE_ALB;
 }
 
+static inline bool bond_needs_speed_duplex(const struct bonding *bond)
+{
+	return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond);
+}
+
 static inline bool bond_is_nondyn_tlb(const struct bonding *bond)
 {
 	return (BOND_MODE(bond) == BOND_MODE_TLB)  &&
-- 
2.13.5


From 0ed9f7149ca15ba15fca944b01c1e2cb0b7a6c23 Mon Sep 17 00:00:00 2001
From: Andreas Born <futur.andy@googlemail.com>
Date: Sat, 12 Aug 2017 00:36:55 +0200
Subject: [PATCH 02/27] bonding: ratelimit failed speed/duplex update warning

[ Upstream commit 11e9d7829dd08dbafb24517fe922f11c3a8a9dc2 ]

bond_miimon_commit() handles the UP transition for each slave of a bond
in the case of MII. It is triggered 10 times per second for the default
MII Polling interval of 100ms. For device drivers that do not implement
__ethtool_get_link_ksettings() the call to bond_update_speed_duplex()
fails persistently while the MII status could remain UP. That is, in
this and other cases where the speed/duplex update keeps failing over a
longer period of time while the MII state is UP, a warning is printed
every MII polling interval.

To address these excessive warnings net_ratelimit() should be used.
Printing a warning once would not be sufficient since the call to
bond_update_speed_duplex() could recover to succeed and fail again
later. In that case there would be no new indication what went wrong.

Fixes: b5bf0f5b16b9c (bonding: correctly update link status during mii-commit phase)
Signed-off-by: Andreas Born <futur.andy@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a063d28304ff..510a580e0348 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2141,9 +2141,10 @@ static void bond_miimon_commit(struct bonding *bond)
 			if (bond_update_speed_duplex(slave) &&
 			    bond_needs_speed_duplex(bond)) {
 				slave->link = BOND_LINK_DOWN;
-				netdev_warn(bond->dev,
-					    "failed to get link speed/duplex for %s\n",
-					    slave->dev->name);
+				if (net_ratelimit())
+					netdev_warn(bond->dev,
+						    "failed to get link speed/duplex for %s\n",
+						    slave->dev->name);
 				continue;
 			}
 			bond_set_slave_link_state(slave, BOND_LINK_UP,
-- 
2.13.5


From f57c8c34ca10de921254e0e26f7fd9919b74a37e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Aug 2017 10:16:45 -0700
Subject: [PATCH 03/27] af_key: do not use GFP_KERNEL in atomic contexts

[ Upstream commit 36f41f8fc6d8aa9f8c9072d66ff7cf9055f5e69b ]

pfkey_broadcast() might be called from non process contexts,
we can not use GFP_KERNEL in these cases [1].

This patch partially reverts commit ba51b6be38c1 ("net: Fix RCU splat in
af_key"), only keeping the GFP_ATOMIC forcing under rcu_read_lock()
section.

[1] : syzkaller reported :

in_atomic(): 1, irqs_disabled(): 0, pid: 2932, name: syzkaller183439
3 locks held by syzkaller183439/2932:
 #0:  (&net->xfrm.xfrm_cfg_mutex){+.+.+.}, at: [<ffffffff83b43888>] pfkey_sendmsg+0x4c8/0x9f0 net/key/af_key.c:3649
 #1:  (&pfk->dump_lock){+.+.+.}, at: [<ffffffff83b467f6>] pfkey_do_dump+0x76/0x3f0 net/key/af_key.c:293
 #2:  (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [<ffffffff83957632>] spin_lock_bh include/linux/spinlock.h:304 [inline]
 #2:  (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [<ffffffff83957632>] xfrm_policy_walk+0x192/0xa30 net/xfrm/xfrm_policy.c:1028
CPU: 0 PID: 2932 Comm: syzkaller183439 Not tainted 4.13.0-rc4+ #24
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:5994
 __might_sleep+0x95/0x190 kernel/sched/core.c:5947
 slab_pre_alloc_hook mm/slab.h:416 [inline]
 slab_alloc mm/slab.c:3383 [inline]
 kmem_cache_alloc+0x24b/0x6e0 mm/slab.c:3559
 skb_clone+0x1a0/0x400 net/core/skbuff.c:1037
 pfkey_broadcast_one+0x4b2/0x6f0 net/key/af_key.c:207
 pfkey_broadcast+0x4ba/0x770 net/key/af_key.c:281
 dump_sp+0x3d6/0x500 net/key/af_key.c:2685
 xfrm_policy_walk+0x2f1/0xa30 net/xfrm/xfrm_policy.c:1042
 pfkey_dump_sp+0x42/0x50 net/key/af_key.c:2695
 pfkey_do_dump+0xaa/0x3f0 net/key/af_key.c:299
 pfkey_spddump+0x1a0/0x210 net/key/af_key.c:2722
 pfkey_process+0x606/0x710 net/key/af_key.c:2814
 pfkey_sendmsg+0x4d6/0x9f0 net/key/af_key.c:3650
sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 ___sys_sendmsg+0x755/0x890 net/socket.c:2035
 __sys_sendmsg+0xe5/0x210 net/socket.c:2069
 SYSC_sendmsg net/socket.c:2080 [inline]
 SyS_sendmsg+0x2d/0x50 net/socket.c:2076
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x445d79
RSP: 002b:00007f32447c1dc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000445d79
RDX: 0000000000000000 RSI: 000000002023dfc8 RDI: 0000000000000008
RBP: 0000000000000086 R08: 00007f32447c2700 R09: 00007f32447c2700
R10: 00007f32447c2700 R11: 0000000000000202 R12: 0000000000000000
R13: 00007ffe33edec4f R14: 00007f32447c29c0 R15: 0000000000000000

Fixes: ba51b6be38c1 ("net: Fix RCU splat in af_key")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: David Ahern <dsa@cumulusnetworks.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c | 48 ++++++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index b1432b668033..166e32c93038 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
 #define BROADCAST_ONE		1
 #define BROADCAST_REGISTERED	2
 #define BROADCAST_PROMISC_ONLY	4
-static int pfkey_broadcast(struct sk_buff *skb,
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
 			   int broadcast_flags, struct sock *one_sk,
 			   struct net *net)
 {
@@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb,
 	rcu_read_unlock();
 
 	if (one_sk != NULL)
-		err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk);
+		err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
 
 	kfree_skb(skb2);
 	kfree_skb(skb);
@@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
 		hdr = (struct sadb_msg *) pfk->dump.skb->data;
 		hdr->sadb_msg_seq = 0;
 		hdr->sadb_msg_errno = rc;
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 		pfk->dump.skb = NULL;
 	}
@@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
 			     sizeof(uint64_t));
 
-	pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk));
 
 	return 0;
 }
@@ -1396,7 +1396,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
 
 	xfrm_state_put(x);
 
-	pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net);
+	pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net);
 
 	return 0;
 }
@@ -1483,7 +1483,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
 	hdr->sadb_msg_seq = c->seq;
 	hdr->sadb_msg_pid = c->portid;
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x));
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
 
 	return 0;
 }
@@ -1596,7 +1596,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg
 	out_hdr->sadb_msg_reserved = 0;
 	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
 	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
 
 	return 0;
 }
@@ -1701,8 +1701,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad
 		return -ENOBUFS;
 	}
 
-	pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk));
-
+	pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk,
+			sock_net(sk));
 	return 0;
 }
 
@@ -1720,7 +1720,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
 	hdr->sadb_msg_errno = (uint8_t) 0;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 
-	return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk,
+			       sock_net(sk));
 }
 
 static int key_notify_sa_flush(const struct km_event *c)
@@ -1741,7 +1742,7 @@ static int key_notify_sa_flush(const struct km_event *c)
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 	hdr->sadb_msg_reserved = 0;
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net);
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 
 	return 0;
 }
@@ -1798,7 +1799,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr)
 	out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
 
 	if (pfk->dump.skb)
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 	pfk->dump.skb = out_skb;
 
@@ -1886,7 +1887,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb
 		new_hdr->sadb_msg_errno = 0;
 	}
 
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk));
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
 	return 0;
 }
 
@@ -2219,7 +2220,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_seq = c->seq;
 	out_hdr->sadb_msg_pid = c->portid;
-	pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
 	return 0;
 
 }
@@ -2439,7 +2440,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
 	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp));
 	err = 0;
 
 out:
@@ -2695,7 +2696,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
 	out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
 
 	if (pfk->dump.skb)
-		pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE,
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
 				&pfk->sk, sock_net(&pfk->sk));
 	pfk->dump.skb = out_skb;
 
@@ -2752,7 +2753,7 @@ static int key_notify_policy_flush(const struct km_event *c)
 	hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
 	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
 	hdr->sadb_msg_reserved = 0;
-	pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net);
+	pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 	return 0;
 
 }
@@ -2816,7 +2817,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb
 	void *ext_hdrs[SADB_EXT_MAX];
 	int err;
 
-	pfkey_broadcast(skb_clone(skb, GFP_KERNEL),
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
 			BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
 
 	memset(ext_hdrs, 0, sizeof(ext_hdrs));
@@ -3038,7 +3039,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
 	out_hdr->sadb_msg_seq = 0;
 	out_hdr->sadb_msg_pid = 0;
 
-	pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			xs_net(x));
 	return 0;
 }
 
@@ -3228,7 +3230,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
 		       xfrm_ctx->ctx_len);
 	}
 
-	return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			       xs_net(x));
 }
 
 static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
@@ -3426,7 +3429,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
 	n_port->sadb_x_nat_t_port_port = sport;
 	n_port->sadb_x_nat_t_port_reserved = 0;
 
-	return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x));
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL,
+			       xs_net(x));
 }
 
 #ifdef CONFIG_NET_KEY_MIGRATE
@@ -3618,7 +3622,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	}
 
 	/* broadcast migrate message to sockets */
-	pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net);
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
 
 	return 0;
 
-- 
2.13.5


From 5560159ebbad0c3e054baba2861f9bccd4f6a8ed Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Aug 2017 14:10:25 -0700
Subject: [PATCH 04/27] dccp: purge write queue in dccp_destroy_sock()

[ Upstream commit 7749d4ff88d31b0be17c8683143135adaaadc6a7 ]

syzkaller reported that DCCP could have a non empty
write queue at dismantle time.

WARNING: CPU: 1 PID: 2953 at net/core/stream.c:199 sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 2953 Comm: syz-executor0 Not tainted 4.13.0-rc4+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 panic+0x1e4/0x417 kernel/panic.c:180
 __warn+0x1c4/0x1d9 kernel/panic.c:541
 report_bug+0x211/0x2d0 lib/bug.c:183
 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190
 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline]
 do_trap+0x260/0x390 arch/x86/kernel/traps.c:273
 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323
 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846
RIP: 0010:sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199
RSP: 0018:ffff8801d182f108 EFLAGS: 00010297
RAX: ffff8801d1144140 RBX: ffff8801d13cb280 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff85137b00 RDI: ffff8801d13cb280
RBP: ffff8801d182f148 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d13cb4d0
R13: ffff8801d13cb3b8 R14: ffff8801d13cb300 R15: ffff8801d13cb3b8
 inet_csk_destroy_sock+0x175/0x3f0 net/ipv4/inet_connection_sock.c:835
 dccp_close+0x84d/0xc10 net/dccp/proto.c:1067
 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425
 sock_release+0x8d/0x1e0 net/socket.c:597
 sock_close+0x16/0x20 net/socket.c:1126
 __fput+0x327/0x7e0 fs/file_table.c:210
 ____fput+0x15/0x20 fs/file_table.c:246
 task_work_run+0x18a/0x260 kernel/task_work.c:116
 exit_task_work include/linux/task_work.h:21 [inline]
 do_exit+0xa32/0x1b10 kernel/exit.c:865
 do_group_exit+0x149/0x400 kernel/exit.c:969
 get_signal+0x7e8/0x17e0 kernel/signal.c:2330
 do_signal+0x94/0x1ee0 arch/x86/kernel/signal.c:808
 exit_to_usermode_loop+0x21c/0x2d0 arch/x86/entry/common.c:157
 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
 syscall_return_slowpath+0x3a7/0x450 arch/x86/entry/common.c:263

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9fe25bf63296..86bc40ba6ba5 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -201,10 +201,7 @@ void dccp_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	/*
-	 * DCCP doesn't use sk_write_queue, just sk_send_head
-	 * for retransmissions
-	 */
+	__skb_queue_purge(&sk->sk_write_queue);
 	if (sk->sk_send_head != NULL) {
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
-- 
2.13.5


From bef8f171a698c0eb7a02fafac04964c86e60b27b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 07:03:15 -0700
Subject: [PATCH 05/27] dccp: defer ccid_hc_tx_delete() at dismantle time

[ Upstream commit 120e9dabaf551c6dc03d3a10a1f026376cb1811c ]

syszkaller team reported another problem in DCCP [1]

Problem here is that the structure holding RTO timer
(ccid2_hc_tx_rto_expire() handler) is freed too soon.

We can not use del_timer_sync() to cancel the timer
since this timer wants to grab socket lock (that would risk a dead lock)

Solution is to defer the freeing of memory when all references to
the socket were released. Socket timers do own a reference, so this
should fix the issue.

[1]

==================================================================
BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365

CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429
 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268
 expire_timers kernel/time/timer.c:1307 [inline]
 __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601
 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:638 [inline]
 smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044
 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702
RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline]
RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline]
RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343
RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10
RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006
RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98
RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60
R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0
 </IRQ>
 release_task+0xe9e/0x1a40 kernel/exit.c:220
 wait_task_zombie kernel/exit.c:1162 [inline]
 wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389
 do_wait_thread kernel/exit.c:1452 [inline]
 do_wait+0x441/0xa90 kernel/exit.c:1523
 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665
 SYSC_wait4+0x134/0x140 kernel/exit.c:1677
 SyS_wait4+0x2c/0x40 kernel/exit.c:1673
 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline]
 call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323
 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097
 worker_thread+0x223/0x1860 kernel/workqueue.c:2231
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425

Allocated by task 21267:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561
 ccid_new+0x20e/0x390 net/dccp/ccid.c:151
 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44
 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344
 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538
 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline]
 dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2269
 release_sock+0xa4/0x2a0 net/core/sock.c:2784
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1642
 SyS_connect+0x24/0x30 net/socket.c:1623
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3049:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190
 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225
 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833
 dccp_done+0xb7/0xd0 net/dccp/proto.c:145
 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72
 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160
 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521
 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:477 [inline]
 ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455
 process_backlog+0x203/0x740 net/core/dev.c:5130
 napi_poll net/core/dev.c:5527 [inline]
 net_rx_action+0x792/0x1910 net/core/dev.c:5593
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284

The buggy address belongs to the object at ffff8801d2660100
 which belongs to the cache ccid2_hc_tx_sock of size 1240
The buggy address is located 1088 bytes inside of
 1240-byte region [ffff8801d2660100, ffff8801d26605d8)
The buggy address belongs to the page:
page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0
flags: 0x200000000008100(slab|head)
raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005
raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
 ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 86bc40ba6ba5..b68168fcc06a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,6 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_sock.h>
+#include <net/inet_common.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 
@@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type)
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
+static void dccp_sk_destruct(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	inet_sock_destruct(sk);
+}
+
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
+	sk->sk_destruct		= dccp_sk_destruct;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
 	dp->dccps_mss_cache	= 536;
 	dp->dccps_rate_last	= jiffies;
@@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk)
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
 
 	/* clean up feature negotiation state */
 	dccp_feat_list_purge(&dp->dccps_featneg);
-- 
2.13.5


From 168a84f006de09a6c5e67cedacd386a9a3c0ec2b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Aug 2017 05:26:17 -0700
Subject: [PATCH 06/27] ipv4: fix NULL dereference in free_fib_info_rcu()

[ Upstream commit 187e5b3ac84d3421d2de3aca949b2791fbcad554 ]

If fi->fib_metrics could not be allocated in fib_create_info()
we attempt to dereference a NULL pointer in free_fib_info_rcu() :

    m = fi->fib_metrics;
    if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
            kfree(m);

Before my recent patch, we used to call kfree(NULL) and nothing wrong
happened.

Instead of using RCU to defer freeing while we are under memory stress,
it seems better to take immediate action.

This was reported by syzkaller team.

Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ce7bc2e5175a..ac9a8fbbacfd 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1033,15 +1033,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 	if (!fi)
 		goto failure;
-	fib_info_cnt++;
 	if (cfg->fc_mx) {
 		fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
-		if (!fi->fib_metrics)
-			goto failure;
+		if (unlikely(!fi->fib_metrics)) {
+			kfree(fi);
+			return ERR_PTR(err);
+		}
 		atomic_set(&fi->fib_metrics->refcnt, 1);
-	} else
+	} else {
 		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
-
+	}
+	fib_info_cnt++;
 	fi->fib_net = net;
 	fi->fib_protocol = cfg->fc_protocol;
 	fi->fib_scope = cfg->fc_scope;
-- 
2.13.5


From d758e8c1af1c62eabe40a3fdd4648d89b8c9e0f3 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Tue, 15 Aug 2017 16:37:04 +0300
Subject: [PATCH 07/27] net_sched/sfq: update hierarchical backlog when drop
 packet

[ Upstream commit 325d5dc3f7e7c2840b65e4a2988c082c2c0025c5 ]

When sfq_enqueue() drops head packet or packet from another queue it
have to update backlog at upper qdiscs too.

Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too")
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_sfq.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 332d94be6e1c..22451a9eb89d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -435,6 +435,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 		qdisc_drop(head, sch, to_free);
 
 		slot_queue_add(slot, skb);
+		qdisc_tree_reduce_backlog(sch, 0, delta);
 		return NET_XMIT_CN;
 	}
 
@@ -466,8 +467,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 	/* Return Congestion Notification only if we dropped a packet
 	 * from this flow.
 	 */
-	if (qlen != slot->qlen)
+	if (qlen != slot->qlen) {
+		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
 		return NET_XMIT_CN;
+	}
 
 	/* As we dropped a packet, better let upper stack know this */
 	qdisc_tree_reduce_backlog(sch, 1, dropped);
-- 
2.13.5


From 79543be0b28fae543650dbf86bd54251b5c3d472 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Tue, 15 Aug 2017 16:39:05 +0300
Subject: [PATCH 08/27] net_sched: remove warning from qdisc_hash_add

[ Upstream commit c90e95147c27b1780e76c6e8fea1b5c78d7d387f ]

It was added in commit e57a784d8cae ("pkt_sched: set root qdisc
before change() in attach_default_qdiscs()") to hide duplicates
from "tc qdisc show" for incative deivices.

After 59cc1f61f ("net: sched: convert qdisc linked list to hashtable")
it triggered when classful qdisc is added to inactive device because
default qdiscs are added before switching root qdisc.

Anyway after commit ea3274695353 ("net: sched: avoid duplicates in
qdisc dump") duplicates are filtered right in dumper.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index cfdbfa18a95e..fdbbdfd8e9a8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -286,9 +286,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 void qdisc_hash_add(struct Qdisc *q, bool invisible)
 {
 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
-		struct Qdisc *root = qdisc_dev(q)->qdisc;
-
-		WARN_ON_ONCE(root == &noop_qdisc);
 		ASSERT_RTNL();
 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 		if (invisible)
-- 
2.13.5


From 91d3e96ab5907cc4f7c6e0483845179996db8c7b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 16 Aug 2017 01:45:33 +0200
Subject: [PATCH 09/27] bpf: fix bpf_trace_printk on 32 bit archs

[ Upstream commit 88a5c690b66110ad255380d8f629c629cf6ca559 ]

James reported that on MIPS32 bpf_trace_printk() is currently
broken while MIPS64 works fine:

  bpf_trace_printk() uses conditional operators to attempt to
  pass different types to __trace_printk() depending on the
  format operators. This doesn't work as intended on 32-bit
  architectures where u32 and long are passed differently to
  u64, since the result of C conditional operators follows the
  "usual arithmetic conversions" rules, such that the values
  passed to __trace_printk() will always be u64 [causing issues
  later in the va_list handling for vscnprintf()].

  For example the samples/bpf/tracex5 test printed lines like
  below on MIPS32, where the fd and buf have come from the u64
  fd argument, and the size from the buf argument:

    [...] 1180.941542: 0x00000001: write(fd=1, buf=  (null), size=6258688)

  Instead of this:

    [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512)

One way to get it working is to expand various combinations
of argument types into 8 different combinations for 32 bit
and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64
as well that it resolves the issue.

Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()")
Reported-by: James Hogan <james.hogan@imgtec.com>
Tested-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..d521b301dee9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -203,10 +203,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 		fmt_cnt++;
 	}
 
-	return __trace_printk(1/* fake ip will not be printed */, fmt,
-			      mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
-			      mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
-			      mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT()	__BPF_ARG3_TP()
+#define __BPF_TP(...)							\
+	__trace_printk(1 /* Fake ip will not be printed. */,		\
+		       fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...)						\
+	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\
+	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\
+	      : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...)						\
+	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\
+	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\
+	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...)						\
+	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\
+	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\
+	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+	return __BPF_TP_EMIT();
 }
 
 static const struct bpf_func_proto bpf_trace_printk_proto = {
-- 
2.13.5


From a748e13990ec10436015a1039ec8b6df90bde2a8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 15 Aug 2017 18:38:42 -0700
Subject: [PATCH 10/27] net: igmp: Use ingress interface rather than vrf device

[ Upstream commit c7b725be84985532161bcb4fbecd056326998a77 ]

Anuradha reported that statically added groups for interfaces enslaved
to a VRF device were not persisting. The problem is that igmp queries
and reports need to use the data in the in_dev for the real ingress
device rather than the VRF device. Update igmp_rcv accordingly.

Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast")
Reported-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3db1adb6b7a0..abdbe79ee175 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb)
 {
 	/* This basically follows the spec line by line -- see RFC1112 */
 	struct igmphdr *ih;
-	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
 	int len = skb->len;
 	bool dropped = true;
 
+	if (netif_is_l3_master(dev)) {
+		dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+		if (!dev)
+			goto drop;
+	}
+
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
 		goto drop;
 
-- 
2.13.5


From 92290c0d678175d8cfc362658c737d705bfb305c Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Wed, 16 Aug 2017 13:30:07 +0800
Subject: [PATCH 11/27] openvswitch: fix skb_panic due to the incorrect actions
 attrlen

[ Upstream commit 494bea39f3201776cdfddc232705f54a0bd210c4 ]

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
convertions, so it's true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
  ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
  ------------[ cut here ]------------
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   <IRQ>
   [<ffffffff8148be82>] skb_put+0x43/0x44
   [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
   [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
   [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
   [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
   [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much little than the orig_len:
  crash> struct sw_flow_actions 0xffff8812f539d000
  struct sw_flow_actions {
    rcu = {
      next = 0xffff8812f5398800,
      func = 0xffffe3b00035db32
    },
    orig_len = 1384,
    actions_len = 592,
    actions = 0xffff8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relative old kernel, but
the same risk still exists on the mainline, since we use the wrong
actions_len from the beginning.

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c  | 1 +
 net/openvswitch/datapath.c | 7 ++++---
 net/openvswitch/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e4610676299b..a54a556fcdb5 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		goto out;
 	}
 
+	OVS_CB(skb)->acts_origlen = acts->orig_len;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7b17da9a94a0..57ce10b6cf6b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-			      unsigned int hdrlen)
+			      unsigned int hdrlen, int actions_attrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
 	/* OVS_PACKET_ATTR_ACTIONS */
 	if (upcall_info->actions_len)
-		size += nla_total_size(upcall_info->actions_len);
+		size += nla_total_size(actions_attrlen);
 
 	/* OVS_PACKET_ATTR_MRU */
 	if (upcall_info->mru)
@@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	else
 		hlen = skb->len;
 
-	len = upcall_msg_size(upcall_info, hlen - cutlen);
+	len = upcall_msg_size(upcall_info, hlen - cutlen,
+			      OVS_CB(skb)->acts_origlen);
 	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index da931bdef8a7..98a28f78aff2 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -98,12 +98,14 @@ struct datapath {
  * @input_vport: The original vport packet came in on. This value is cached
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  * fragmented.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
 	u16			mru;
+	u16			acts_origlen;
 	u32			cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
-- 
2.13.5


From 10ce377e2fdbca671f764b691d8f8537f558d136 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 10:36:47 -0700
Subject: [PATCH 12/27] ptr_ring: use kmalloc_array()

[ Upstream commit 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 ]

As found by syzkaller, malicious users can set whatever tx_queue_len
on a tun device and eventually crash the kernel.

Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small
ring buffer is not fast anyway.

Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h  | 9 +++++----
 include/linux/skb_array.h | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6b2e0dd88569..feff771e8ea0 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -371,9 +371,9 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
 	__PTR_RING_PEEK_CALL_v; \
 })
 
-static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp);
+	return kcalloc(size, sizeof(void *), gfp);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -462,7 +462,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
+					   unsigned int nrings,
 					   int size,
 					   gfp_t gfp, void (*destroy)(void *))
 {
@@ -470,7 +471,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 	void ***queues;
 	int i;
 
-	queues = kmalloc(nrings * sizeof *queues, gfp);
+	queues = kmalloc_array(nrings, sizeof(*queues), gfp);
 	if (!queues)
 		goto noqueues;
 
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade428f0..be8b902b5845 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -162,7 +162,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 }
 
 static inline int skb_array_resize_multiple(struct skb_array **rings,
-					    int nrings, int size, gfp_t gfp)
+					    int nrings, unsigned int size,
+					    gfp_t gfp)
 {
 	BUILD_BUG_ON(offsetof(struct skb_array, ring));
 	return ptr_ring_resize_multiple((struct ptr_ring **)rings,
-- 
2.13.5


From 0ddde403729bd9b98f956e1db5926c2d0d99809a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 11:09:12 -0700
Subject: [PATCH 13/27] ipv4: better IP_MAX_MTU enforcement

[ Upstream commit c780a049f9bf442314335372c9abc4548bfe3e44 ]

While working on yet another syzkaller report, I found
that our IP_MAX_MTU enforcements were not properly done.

gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and
final result can be bigger than IP_MAX_MTU :/

This is a problem because device mtu can be changed on other cpus or
threads.

While this patch does not fix the issue I am working on, it is
probably worth addressing it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++--
 net/ipv4/route.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 821cedcc8e73..0cf7f5a65fe6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    !forwarding)
 		return dst_mtu(dst);
 
-	return min(dst->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
@@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
 	}
 
-	return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
 u32 ip_idents_reserve(u32 hash, int segs);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6883b3d4ba8f..22ba873546c3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1268,7 +1268,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	if (mtu)
 		return mtu;
 
-	mtu = dst->dev->mtu;
+	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
 		if (rt->rt_uses_gateway && mtu > 576)
-- 
2.13.5


From 0622af29e530d327687af157dd0c158fb261cac2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 12:11:50 +0100
Subject: [PATCH 14/27] nfp: fix infinite loop on umapping cleanup

[ Upstream commit eac2c68d663effb077210218788952b5a0c1f60e ]

The while loop that performs the dma page unmapping never decrements
index counter f and hence loops forever. Fix this with a pre-decrement
on f.

Detected by CoverityScan, CID#1357309 ("Infinite loop")

Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 82bd6b0935f1..fd4a785431ac 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -881,8 +881,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 
 err_unmap:
-	--f;
-	while (f >= 0) {
+	while (--f >= 0) {
 		frag = &skb_shinfo(skb)->frags[f];
 		dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr,
 			       skb_frag_size(frag), DMA_TO_DEVICE);
-- 
2.13.5


From 4c348dde87047d562cebc14e3b68b1cf1d0533b9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Aug 2017 13:39:56 -0700
Subject: [PATCH 15/27] tun: handle register_netdevice() failures properly

[ Upstream commit ff244c6b29b176f3f448bc75e55df297225e1b3a ]

syzkaller reported a double free [1], caused by the fact
that tun driver was not updated properly when priv_destructor
was added.

When/if register_netdevice() fails, priv_destructor() must have been
called already.

[1]
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023

CPU: 0 PID: 2919 Comm: syzkaller227220 Not tainted 4.13.0-rc4+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x7f/0x260 mm/kasan/report.c:252
 kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333
 kasan_slab_free+0xa0/0xc0 mm/kasan/kasan.c:514
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_set_iff drivers/net/tun.c:1884 [inline]
 __tun_chr_ioctl+0x2ce6/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x443ff9
RSP: 002b:00007ffc34271f68 EFLAGS: 00000217 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000443ff9
RDX: 0000000020533000 RSI: 00000000400454ca RDI: 0000000000000003
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401ce0
R13: 0000000000401d70 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:551
 kmem_cache_alloc_trace+0x101/0x6f0 mm/slab.c:3627
 kmalloc include/linux/slab.h:493 [inline]
 kzalloc include/linux/slab.h:666 [inline]
 selinux_tun_dev_alloc_security+0x49/0x170 security/selinux/hooks.c:5012
 security_tun_dev_alloc_security+0x6d/0xa0 security/security.c:1506
 tun_set_iff drivers/net/tun.c:1839 [inline]
 __tun_chr_ioctl+0x1730/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x6e/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_free_netdev+0x13b/0x1b0 drivers/net/tun.c:1563
 register_netdevice+0x8d0/0xee0 net/core/dev.c:7605
 tun_set_iff drivers/net/tun.c:1859 [inline]
 __tun_chr_ioctl+0x1caf/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801d2843b40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 0 bytes inside of
 32-byte region [ffff8801d2843b40, ffff8801d2843b60)
The buggy address belongs to the page:
page:ffffea000660cea8 count:1 mapcount:0 mapping:ffff8801d2843000 index:0xffff8801d2843fc1
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801d2843000 ffff8801d2843fc1 000000010000003f
raw: ffffea0006626a40 ffffea00066141a0 ffff8801dbc00100
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2843a00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843a80: 00 00 00 fc fc fc fc fc fb fb fb fb fc fc fc fc
>ffff8801d2843b00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc
                                           ^
 ffff8801d2843b80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843c00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc

==================================================================

Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9ee7d4275640..5bd954d12541 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1876,6 +1876,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_detach:
 	tun_detach_all(dev);
+	/* register_netdevice() already called tun_free_netdev() */
+	goto err_free_dev;
+
 err_free_flow:
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
-- 
2.13.5


From 7bfce86e706f57734a21ea85ac82f810932045d0 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 16 Aug 2017 20:16:40 +0200
Subject: [PATCH 16/27] sctp: fully initialize the IPv6 address in
 sctp_v6_to_addr()

[ Upstream commit 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d ]

KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and
sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below).
Make sure all fields of an IPv6 address are initialized, which
guarantees that the IPv4 fields are also initialized.

==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================

Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f5b45b8b8b16..0de5f5f8ddbc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -510,7 +510,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
 {
 	addr->sa.sa_family = AF_INET6;
 	addr->v6.sin6_port = port;
+	addr->v6.sin6_flowinfo = 0;
 	addr->v6.sin6_addr = *saddr;
+	addr->v6.sin6_scope_id = 0;
 }
 
 /* Compare addresses exactly.
-- 
2.13.5


From dea35c23c4fbe3b65430465163531c0e31ba5f94 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 09:41:54 -0700
Subject: [PATCH 17/27] tipc: fix use-after-free

[ Upstream commit 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 ]

syszkaller reported use-after-free in tipc [1]

When msg->rep skb is freed, set the pointer to NULL,
so that caller does not free it again.

[1]

==================================================================
BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466
Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115

CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 skb_push+0xd4/0xe0 net/core/skbuff.c:1466
 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9
RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76
R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651
 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219
 alloc_skb include/linux/skbuff.h:903 [inline]
 tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148
 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb+0x165/0x4c0 net/core/skbuff.c:699
 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801c6e71dc0
 which belongs to the cache skbuff_head_cache of size 224
The buggy address is located 208 bytes inside of
 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0)
The buggy address belongs to the page:
page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c
raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
 ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
                         ^
 ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov  <dvyukov@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 9bfe886ab330..750949dfc1d7 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 	arg = nlmsg_new(0, GFP_KERNEL);
 	if (!arg) {
 		kfree_skb(msg->rep);
+		msg->rep = NULL;
 		return -ENOMEM;
 	}
 
 	err = __tipc_nl_compat_dumpit(cmd, msg, arg);
-	if (err)
+	if (err) {
 		kfree_skb(msg->rep);
-
+		msg->rep = NULL;
+	}
 	kfree_skb(arg);
 
 	return err;
-- 
2.13.5


From c90956996a7f3549184b28c437634dc7128ba188 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 16 Aug 2017 11:18:09 -0700
Subject: [PATCH 18/27] ipv6: reset fn->rr_ptr when replacing route

[ Upstream commit 383143f31d7d3525a1dbff733d52fff917f82f15 ]

syzcaller reported the following use-after-free issue in rt6_select():
BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8
BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8
Read of size 4 by task syz-executor1/439628
CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00
 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0
 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380
Call Trace:
 [<ffffffff81ca384d>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff81ca384d>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option.
Use struct sctp_sack_info instead
 [<ffffffff81735751>] kasan_object_err+0x21/0x70 mm/kasan/report.c:158
 [<ffffffff817359c4>] print_address_description mm/kasan/report.c:196 [inline]
 [<ffffffff817359c4>] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285
 [<ffffffff81735d93>] kasan_report mm/kasan/report.c:305 [inline]
 [<ffffffff81735d93>] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325
 [<ffffffff82a28e39>] rt6_select net/ipv6/route.c:755 [inline]
 [<ffffffff82a28e39>] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084
 [<ffffffff82a28fb1>] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203
 [<ffffffff82ab0a50>] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95
 [<ffffffff8265cbb6>] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223
 [<ffffffff82ab1430>] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41
 [<ffffffff82a22006>] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224
 [<ffffffff829e83d2>] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943
 [<ffffffff829e889a>] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079
 [<ffffffff82a9f7d8>] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91
 [<ffffffff82aa0978>] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline]
 [<ffffffff82aa0978>] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272
 [<ffffffff82aa1313>] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284
 [<ffffffff8292f790>] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564
 [<ffffffff82565547>] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582
 [<ffffffff8256a649>] SyS_connect+0x29/0x30 net/socket.c:1563
 [<ffffffff82c72032>] entry_SYSCALL_64_fastpath+0x12/0x17
Object at ffff8800bc699380, in cache ip6_dst_cache size: 384

The root cause of it is that in fib6_add_rt2node(), when it replaces an
existing route with the new one, it does not update fn->rr_ptr.
This commit resets fn->rr_ptr to NULL when it points to a route which is
replaced in fib6_add_rt2node().

Fixes: 27596472473a ("ipv6: fix ECMP route replacement")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e4e9f752ebbf..e93932d70620 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -912,6 +912,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		}
 		nsiblings = iter->rt6i_nsiblings;
 		fib6_purge_rt(iter, fn, info->nl_net);
+		if (fn->rr_ptr == iter)
+			fn->rr_ptr = NULL;
 		rt6_release(iter);
 
 		if (nsiblings) {
@@ -924,6 +926,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->dst.rt6_next;
 					fib6_purge_rt(iter, fn, info->nl_net);
+					if (fn->rr_ptr == iter)
+						fn->rr_ptr = NULL;
 					rt6_release(iter);
 					nsiblings--;
 				} else {
-- 
2.13.5


From ca2ec1ce5afff4d0dc09a57b00e8917d52872eb4 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 18 Aug 2017 17:14:49 -0700
Subject: [PATCH 19/27] ipv6: repair fib6 tree in failure case

[ Upstream commit 348a4002729ccab8b888b38cbc099efa2f2a2036 ]

In fib6_add(), it is possible that fib6_add_1() picks an intermediate
node and sets the node's fn->leaf to NULL in order to add this new
route. However, if fib6_add_rt2node() fails to add the new
route for some reason, fn->leaf will be left as NULL and could
potentially cause crash when fn->leaf is accessed in fib6_locate().
This patch makes sure fib6_repair_tree() is called to properly repair
fn->leaf in the above failure case.

Here is the syzkaller reported general protection fault in fib6_locate:
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] ipv6_prefix_equal include/net/ipv6.h:492 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233
RSP: 0018:ffff8801d01a36a8  EFLAGS: 00010202
RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000
RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100
RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000
R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000
FS:  00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack:
 ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0
 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0
 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988
Call Trace:
 [<ffffffff82a223d6>] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109
 [<ffffffff82a23f9d>] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075
 [<ffffffff82621359>] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450
 [<ffffffff8274c1d1>] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281
 [<ffffffff82613ddf>] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456
 [<ffffffff8274ad38>] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline]
 [<ffffffff8274ad38>] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232
 [<ffffffff8274b83e>] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778
 [<ffffffff82564aff>] sock_sendmsg_nosec net/socket.c:609 [inline]
 [<ffffffff82564aff>] sock_sendmsg+0xcf/0x110 net/socket.c:619
 [<ffffffff82564d62>] sock_write_iter+0x222/0x3a0 net/socket.c:834
 [<ffffffff8178523d>] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478
 [<ffffffff817853f4>] __vfs_write+0xe4/0x110 fs/read_write.c:491
 [<ffffffff81786c38>] vfs_write+0x178/0x4b0 fs/read_write.c:538
 [<ffffffff817892a9>] SYSC_write fs/read_write.c:585 [inline]
 [<ffffffff817892a9>] SyS_write+0xd9/0x1b0 fs/read_write.c:577
 [<ffffffff82c71e32>] entry_SYSCALL_64_fastpath+0x12/0x17

Note: there is no "Fixes" tag as this seems to be a bug introduced
very early.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e93932d70620..cd8dd8c4e819 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1016,7 +1016,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			/* Create subtree root node */
 			sfn = node_alloc();
 			if (!sfn)
-				goto st_failure;
+				goto failure;
 
 			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
@@ -1032,12 +1032,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				/* If it is failed, discard just allocated
-				   root, and then (in st_failure) stale node
+				   root, and then (in failure) stale node
 				   in main tree.
 				 */
 				node_free(sfn);
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 
 			/* Now link new subtree to main tree */
@@ -1051,7 +1051,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 		}
 
@@ -1093,22 +1093,22 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			atomic_inc(&pn->leaf->rt6i_ref);
 		}
 #endif
-		if (!(rt->dst.flags & DST_NOCACHE))
-			dst_free(&rt->dst);
+		goto failure;
 	}
 	return err;
 
-#ifdef CONFIG_IPV6_SUBTREES
-	/* Subtree creation failed, probably main tree node
-	   is orphan. If it is, shoot it.
+failure:
+	/* fn->leaf could be NULL if fn is an intermediate node and we
+	 * failed to add the new route to it in both subtree creation
+	 * failure and fib6_add_rt2node() failure case.
+	 * In both cases, fib6_repair_tree() should be called to fix
+	 * fn->leaf.
 	 */
-st_failure:
 	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
 		fib6_repair_tree(info->nl_net, fn);
 	if (!(rt->dst.flags & DST_NOCACHE))
 		dst_free(&rt->dst);
 	return err;
-#endif
 }
 
 /*
-- 
2.13.5


From f5b505ce91819dfa6a85568befd9097d501d6424 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 16 Aug 2017 17:53:36 -0400
Subject: [PATCH 20/27] tcp: when rearming RTO, if RTO time is in past then
 fire RTO ASAP

[ Upstream commit cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 ]

In some situations tcp_send_loss_probe() can realize that it's unable
to send a loss probe (TLP), and falls back to calling tcp_rearm_rto()
to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto()
realizes that the RTO was eligible to fire immediately or at some
point in the past (delta_us <= 0). Previously in such cases
tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now +
icsk_rto, which caused needless delays of hundreds of milliseconds
(and non-linear behavior that made reproducible testing
difficult). This commit changes the logic to schedule "overdue" RTOs
ASAP, rather than at now + icsk_rto.

Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)")
Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57bcae81fe42..fbaac4423a99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3007,8 +3007,7 @@ void tcp_rearm_rto(struct sock *sk)
 			/* delta may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
-			if (delta > 0)
-				rto = delta;
+			rto = max(delta, 1);
 		}
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);
-- 
2.13.5


From 826efe95a4e93539a35ffeb392b0cf899972332d Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Aug 2017 18:29:52 +0300
Subject: [PATCH 21/27] net/mlx4_core: Enable 4K UAR if SRIOV module parameter
 is not enabled

[ Upstream commit ca3d89a3ebe79367bd41b6b8ba37664478ae2dba ]

enable_4k_uar module parameter was added in patch cited below to
address the backward compatibility issue in SRIOV when the VM has
system's PAGE_SIZE uar implementation and the Hypervisor has 4k uar
implementation.

The above compatibility issue does not exist in the non SRIOV case.
In this patch, we always enable 4k uar implementation if SRIOV
is not enabled on mlx4's supported cards.

Fixes: 76e39ccf9c36 ("net/mlx4_core: Fix backward compatibility on VFs")
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 83aab1e4c8c8..9f214f9fb48c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -430,7 +430,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		/* Virtual PCI function needs to determine UAR page size from
 		 * firmware. Only master PCI function can set the uar page size
 		 */
-		if (enable_4k_uar)
+		if (enable_4k_uar || !dev->persist->num_vfs)
 			dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
 		else
 			dev->uar_page_shift = PAGE_SHIFT;
@@ -2275,7 +2275,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
 		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
 
-		if (enable_4k_uar) {
+		if (enable_4k_uar || !dev->persist->num_vfs) {
 			init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
 						    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
 			init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
-- 
2.13.5


From 6b093ce76d0ed3de3949fc4edf9139726a4abacd Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 17 Aug 2017 23:14:58 +0100
Subject: [PATCH 22/27] irda: do not leak initialized list.dev to userspace

[ Upstream commit b024d949a3c24255a7ef1a470420eb478949aa4c ]

list.dev has not been initialized and so the copy_to_user is copying
data from the stack back to user space which is a potential
information leak. Fix this ensuring all of list is initialized to
zero.

Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/af_irda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 8d77ad5cadaf..4cadc29f547c 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2225,7 +2225,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct irda_sock *self = irda_sk(sk);
-	struct irda_device_list list;
+	struct irda_device_list list = { 0 };
 	struct irda_device_info *discoveries;
 	struct irda_ias_set *	ias_opt;	/* IAS get/query params */
 	struct ias_object *	ias_obj;	/* Object in IAS */
-- 
2.13.5


From f7c8f55a33f335fea5ee2648710d860061067665 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 18 Aug 2017 11:01:36 +0800
Subject: [PATCH 23/27] net: sched: fix NULL pointer dereference when action
 calls some targets

[ Upstream commit 4f8a881acc9d1adaf1e552349a0b1df28933a04c ]

As we know in some target's checkentry it may dereference par.entryinfo
to check entry stuff inside. But when sched action calls xt_check_target,
par.entryinfo is set with NULL. It would cause kernel panic when calling
some targets.

It can be reproduce with:
  # tc qd add dev eth1 ingress handle ffff:
  # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \
    -j ECN --ecn-tcp-remove

It could also crash kernel when using target CLUSTERIP or TPROXY.

By now there's no proper value for par.entryinfo in ipt_init_target,
but it can not be set with NULL. This patch is to void all these
panics by setting it with an ipt_entry obj with all members = 0.

Note that this issue has been there since the very beginning.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ipt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d516ba8178b8..541707802a23 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
+	struct ipt_entry e = {};
 	int ret = 0;
 
 	target = xt_request_find_target(AF_INET, t->u.user.name,
@@ -52,6 +53,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 	memset(&par, 0, sizeof(par));
 	par.net       = net;
 	par.table     = table;
+	par.entryinfo = &e;
 	par.target    = target;
 	par.targinfo  = t->data;
 	par.hook_mask = hook;
-- 
2.13.5


From 1d8bda32a0968ce03ab8f5189fd52d25c5aeb193 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sat, 19 Aug 2017 15:37:07 +0300
Subject: [PATCH 24/27] net_sched: fix order of queue length updates in
 qdisc_replace()

[ Upstream commit 68a66d149a8c78ec6720f268597302883e48e9fa ]

This important to call qdisc_tree_reduce_backlog() after changing queue
length. Parent qdisc should deactivate class in ->qlen_notify() called from
qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero.

Missed class deactivations leads to crashes/warnings at picking packets
from empty qdisc and corrupting state at reactivating this class in future.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper")
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 22e52093bfda..db5b6b6346b3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -785,8 +785,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+		unsigned int qlen = old->q.qlen;
+		unsigned int backlog = old->qstats.backlog;
+
 		qdisc_reset(old);
+		qdisc_tree_reduce_backlog(old, qlen, backlog);
 	}
 	sch_tree_unlock(sch);
 
-- 
2.13.5


From ba75231c8ff0e1dd9cb4beb9414d6650bd3ad32f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 2 Jul 2017 02:13:30 +0200
Subject: [PATCH 25/27] bpf, verifier: add additional patterns to
 evaluate_reg_imm_alu

[ Upstream commit 43188702b3d98d2792969a3377a30957f05695e6 ]

Currently the verifier does not track imm across alu operations when
the source register is of unknown type. This adds additional pattern
matching to catch this and track imm. We've seen LLVM generating this
pattern while working on cilium.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..03f8d02849be 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1650,6 +1650,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+					struct bpf_insn *insn)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+	/* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+	if (src_reg->imm > 0 && dst_reg->imm) {
+		switch (opcode) {
+		case BPF_ADD:
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			break;
+		case BPF_AND:
+			/* dreg &= sreg
+			 * AND can not extend zero bits only shrink
+			 * Ex.  0x00..00ffffff
+			 *    & 0x0f..ffffffff
+			 *     ----------------
+			 *      0x00..00ffffff
+			 */
+			dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_OR:
+			/* dreg |= sreg
+			 * OR can only extend zero bits
+			 * Ex.  0x00..00ffffff
+			 *    | 0x0f..ffffffff
+			 *     ----------------
+			 *      0x0f..00ffffff
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_SUB:
+		case BPF_MUL:
+		case BPF_RSH:
+		case BPF_LSH:
+			/* These may be flushed out later */
+		default:
+			mark_reg_unknown_value(regs, insn->dst_reg);
+		}
+	} else {
+		mark_reg_unknown_value(regs, insn->dst_reg);
+	}
+
+	dst_reg->type = UNKNOWN_VALUE;
+	return 0;
+}
+
 static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 				struct bpf_insn *insn)
 {
@@ -1659,6 +1718,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	u8 opcode = BPF_OP(insn->code);
 	u64 dst_imm = dst_reg->imm;
 
+	if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+		return evaluate_reg_imm_alu_unknown(env, insn);
+
 	/* dst_reg->type == CONST_IMM here. Simulate execution of insns
 	 * containing ALU ops. Don't care about overflow or negative
 	 * values, just add/sub/... them; registers are in u64.
-- 
2.13.5


From 6702c5e8722bb43f9c5c1c4f6da633e28313aa92 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Jul 2017 00:00:21 +0200
Subject: [PATCH 26/27] bpf: fix mixed signed/unsigned derived min/max value
 bounds

[ Upstream commit 4cabc5b186b5427b9ee5a7495172542af105f02b ]

Edward reported that there's an issue in min/max value bounds
tracking when signed and unsigned compares both provide hints
on limits when having unknown variables. E.g. a program such
as the following should have been rejected:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff8a94cda93400
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  11: (65) if r1 s> 0x1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

What happens is that in the first part ...

   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3

... r1 carries an unsigned value, and is compared as unsigned
against a register carrying an immediate. Verifier deduces in
reg_set_min_max() that since the compare is unsigned and operation
is greater than (>), that in the fall-through/false case, r1's
minimum bound must be 0 and maximum bound must be r2. Latter is
larger than the bound and thus max value is reset back to being
'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now
'R1=inv,min_value=0'. The subsequent test ...

  11: (65) if r1 s> 0x1 goto pc+2

... is a signed compare of r1 with immediate value 1. Here,
verifier deduces in reg_set_min_max() that since the compare
is signed this time and operation is greater than (>), that
in the fall-through/false case, we can deduce that r1's maximum
bound must be 1, meaning with prior test, we result in r1 having
the following state: R1=inv,min_value=0,max_value=1. Given that
the actual value this holds is -8, the bounds are wrongly deduced.
When this is being added to r0 which holds the map_value(_adj)
type, then subsequent store access in above case will go through
check_mem_access() which invokes check_map_access_adj(), that
will then probe whether the map memory is in bounds based
on the min_value and max_value as well as access size since
the actual unknown value is min_value <= x <= max_value; commit
fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{,
_adj} register types") provides some more explanation on the
semantics.

It's worth to note in this context that in the current code,
min_value and max_value tracking are used for two things, i)
dynamic map value access via check_map_access_adj() and since
commit 06c1c049721a ("bpf: allow helpers access to variable memory")
ii) also enforced at check_helper_mem_access() when passing a
memory address (pointer to packet, map value, stack) and length
pair to a helper and the length in this case is an unknown value
defining an access range through min_value/max_value in that
case. The min_value/max_value tracking is /not/ used in the
direct packet access case to track ranges. However, the issue
also affects case ii), for example, the following crafted program
based on the same principle must be rejected as well:

   0: (b7) r2 = 0
   1: (bf) r3 = r10
   2: (07) r3 += -512
   3: (7a) *(u64 *)(r10 -16) = -8
   4: (79) r4 = *(u64 *)(r10 -16)
   5: (b7) r6 = -1
   6: (2d) if r4 > r6 goto pc+5
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
   7: (65) if r4 s> 0x1 goto pc+4
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1
  R10=fp
   8: (07) r4 += 1
   9: (b7) r5 = 0
  10: (6a) *(u16 *)(r10 -512) = 0
  11: (85) call bpf_skb_load_bytes#26
  12: (b7) r0 = 0
  13: (95) exit

Meaning, while we initialize the max_value stack slot that the
verifier thinks we access in the [1,2] range, in reality we
pass -7 as length which is interpreted as u32 in the helper.
Thus, this issue is relevant also for the case of helper ranges.
Resetting both bounds in check_reg_overflow() in case only one
of them exceeds limits is also not enough as similar test can be
created that uses values which are within range, thus also here
learned min value in r1 is incorrect when mixed with later signed
test to create a range:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff880ad081fa00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (65) if r1 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

This leaves us with two options for fixing this: i) to invalidate
all prior learned information once we switch signed context, ii)
to track min/max signed and unsigned boundaries separately as
done in [0]. (Given latter introduces major changes throughout
the whole verifier, it's rather net-next material, thus this
patch follows option i), meaning we can derive bounds either
from only signed tests or only unsigned tests.) There is still the
case of adjust_reg_min_max_vals(), where we adjust bounds on ALU
operations, meaning programs like the following where boundaries
on the reg get mixed in context later on when bounds are merged
on the dst reg must get rejected, too:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff89b2bf87ce00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+6
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (b7) r7 = 1
  12: (65) if r7 s> 0x0 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp
  13: (b7) r0 = 0
  14: (95) exit

  from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp
  15: (0f) r7 += r1
  16: (65) if r7 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  17: (0f) r0 += r7
  18: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  19: (b7) r0 = 0
  20: (95) exit

Meaning, in adjust_reg_min_max_vals() we must also reset range
values on the dst when src/dst registers have mixed signed/
unsigned derived min/max value bounds with one unbounded value
as otherwise they can be added together deducing false boundaries.
Once both boundaries are established from either ALU ops or
compare operations w/o mixing signed/unsigned insns, then they
can safely be added to other regs also having both boundaries
established. Adding regs with one unbounded side to a map value
where the bounded side has been learned w/o mixing ops is
possible, but the resulting map value won't recover from that,
meaning such op is considered invalid on the time of actual
access. Invalid bounds are set on the dst reg in case i) src reg,
or ii) in case dst reg already had them. The only way to recover
would be to perform i) ALU ops but only 'add' is allowed on map
value types or ii) comparisons, but these are disallowed on
pointers in case they span a range. This is fine as only BPF_JEQ
and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers
which potentially turn them into PTR_TO_MAP_VALUE type depending
on the branch, so only here min/max value cannot be invalidated
for them.

In terms of state pruning, value_from_signed is considered
as well in states_equal() when dealing with adjusted map values.
With regards to breaking existing programs, there is a small
risk, but use-cases are rather quite narrow where this could
occur and mixing compares probably unlikely.

Joint work with Josef and Edward.

  [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Reported-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/verifier.c        | 108 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 95 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d5093b52b485..88f4289e7eee 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -43,6 +43,7 @@ struct bpf_reg_state {
 	u32 min_align;
 	u32 aux_off;
 	u32 aux_off_align;
+	bool value_from_signed;
 };
 
 enum bpf_stack_slot_type {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 03f8d02849be..56c0257e0026 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
 {
 	regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
 	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+	regs[regno].value_from_signed = false;
 	regs[regno].min_align = 0;
 }
 
@@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
 	return -EACCES;
 }
 
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+			       const struct bpf_reg_state *reg)
 {
-	if (env->allow_ptr_leaks)
+	if (allow_ptr_leaks)
 		return false;
 
-	switch (env->cur_state.regs[regno].type) {
+	switch (reg->type) {
 	case UNKNOWN_VALUE:
 	case CONST_IMM:
 		return false;
@@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
@@ -1825,10 +1832,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	dst_align = dst_reg->min_align;
 
 	/* We don't know anything about what was done to this register, mark it
-	 * as unknown.
+	 * as unknown. Also, if both derived bounds came from signed/unsigned
+	 * mixed compares and one side is unbounded, we cannot really do anything
+	 * with them as boundaries cannot be trusted. Thus, arithmetic of two
+	 * regs of such kind will get invalidated bounds on the dst side.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE &&
-	    max_val == BPF_REGISTER_MAX_RANGE) {
+	if ((min_val == BPF_REGISTER_MIN_RANGE &&
+	     max_val == BPF_REGISTER_MAX_RANGE) ||
+	    (BPF_SRC(insn->code) == BPF_X &&
+	     ((min_val != BPF_REGISTER_MIN_RANGE &&
+	       max_val == BPF_REGISTER_MAX_RANGE) ||
+	      (min_val == BPF_REGISTER_MIN_RANGE &&
+	       max_val != BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+	     regs[insn->dst_reg].value_from_signed !=
+	     regs[insn->src_reg].value_from_signed)) {
 		reset_reg_range_values(regs, insn->dst_reg);
 		return;
 	}
@@ -2015,6 +2036,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			regs[insn->dst_reg].max_value = insn->imm;
 			regs[insn->dst_reg].min_value = insn->imm;
 			regs[insn->dst_reg].min_align = calc_align(insn->imm);
+			regs[insn->dst_reg].value_from_signed = false;
 		}
 
 	} else if (opcode > BPF_END) {
@@ -2190,40 +2212,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
 		 */
 		false_reg->max_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val + 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the mimimum value is val.
 		 */
 		false_reg->max_value = val - 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2231,6 +2276,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2240,41 +2291,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
 				u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/*
 		 * If this is false, then the val is <= the register, if it is
 		 * true the register <= to the val.
 		 */
 		false_reg->min_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val - 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/* If this is false then constant < register, if it is true then
 		 * the register < constant.
 		 */
 		false_reg->min_value = val + 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2282,6 +2356,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-- 
2.13.5


From a44f11588c8506059c6843a2a0221979e547f86e Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 21 Jul 2017 14:37:34 +0100
Subject: [PATCH 27/27] bpf/verifier: fix min/max handling in BPF_SUB

[ Upstream commit 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 ]

We have to subtract the src max from the dst min, and vice-versa, since
 (e.g.) the smallest result comes from the largest subtrahend.

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 56c0257e0026..1e64ee3dd650 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1858,10 +1858,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	 * do our normal operations to the register, we need to set the values
 	 * to the min/max since they are undefined.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE)
-		dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
-	if (max_val == BPF_REGISTER_MAX_RANGE)
-		dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+	if (opcode != BPF_SUB) {
+		if (min_val == BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		if (max_val == BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+	}
 
 	switch (opcode) {
 	case BPF_ADD:
@@ -1872,10 +1874,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		dst_reg->min_align = min(src_align, dst_align);
 		break;
 	case BPF_SUB:
+		/* If one of our values was at the end of our ranges, then the
+		 * _opposite_ value in the dst_reg goes to the end of our range.
+		 */
+		if (min_val == BPF_REGISTER_MIN_RANGE)
+			dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+		if (max_val == BPF_REGISTER_MAX_RANGE)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
 		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
-			dst_reg->min_value -= min_val;
+			dst_reg->min_value -= max_val;
 		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
-			dst_reg->max_value -= max_val;
+			dst_reg->max_value -= min_val;
 		dst_reg->min_align = min(src_align, dst_align);
 		break;
 	case BPF_MUL:
-- 
2.13.5