All of lore.kernel.org
 help / color / mirror / Atom feed
* [bpf PATCH v2 0/3] sockmap/ktls fixes
@ 2019-04-25 16:02 John Fastabend
  2019-04-25 16:03 ` [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED John Fastabend
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: John Fastabend @ 2019-04-25 16:02 UTC (permalink / raw)
  To: jakub.kicinski, john.fastabend, ast, daniel; +Cc: netdev, bpf

Series of fixes for sockmap and ktls, see patches for descriptions.

v2: fix build issue for CONFIG_TLS_DEVICE and fixup couple comments from
    Jakub.

---

John Fastabend (3):
      bpf: tls, implement unhash to avoid transition out of ESTABLISHED
      bpf: sockmap remove duplicate queue free
      bpf: sockmap fix msg->sg.size account on ingress skb


 include/net/tls.h  |   14 ++++++++++++-
 net/core/skmsg.c   |    1 +
 net/ipv4/tcp_bpf.c |    2 --
 net/tls/tls_main.c |   55 +++++++++++++++++++++++++++++++++++++++-------------
 net/tls/tls_sw.c   |   13 +++++++++---
 5 files changed, 64 insertions(+), 21 deletions(-)

--
Signature

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED
  2019-04-25 16:02 [bpf PATCH v2 0/3] sockmap/ktls fixes John Fastabend
@ 2019-04-25 16:03 ` John Fastabend
  2019-04-25 19:29   ` Jakub Kicinski
  2019-04-25 16:03 ` [bpf PATCH v2 2/3] bpf: sockmap remove duplicate queue free John Fastabend
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: John Fastabend @ 2019-04-25 16:03 UTC (permalink / raw)
  To: jakub.kicinski, john.fastabend, ast, daniel; +Cc: netdev, bpf

It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE
state via tcp_disconnect() without calling into close callback. This
would allow a kTLS enabled socket to exist outside of ESTABLISHED
state which is not supported.

Solve this the same way we solved the sock{map|hash} case by adding
an unhash hook to tear down the TLS state.

In the process we also make the close hook more robust. We add a put
call into the close path, also in the unhash path, to remove the
reference to ulp data after free. It's no longer valid and may confuse
things later if the socket (re)enters kTLS code paths. Second we add
an 'if(ctx)' check to ensure the ctx is still valid and not released
from a previous unhash/close path.

Fixes: d91c3e17f75f2 ("net/tls: Only attach to sockets in ESTABLISHED state")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tls.h  |   14 ++++++++++++-
 net/tls/tls_main.c |   55 +++++++++++++++++++++++++++++++++++++++-------------
 net/tls/tls_sw.c   |   13 +++++++++---
 3 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index d9d0ac66f040..ae13ea19b375 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -266,6 +266,8 @@ struct tls_context {
 	void (*sk_write_space)(struct sock *sk);
 	void (*sk_destruct)(struct sock *sk);
 	void (*sk_proto_close)(struct sock *sk, long timeout);
+	void (*sk_proto_unhash)(struct sock *sk);
+	struct proto *sk_proto;
 
 	int  (*setsockopt)(struct sock *sk, int level,
 			   int optname, char __user *optval,
@@ -303,7 +305,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
 		    int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_resources_tx(struct sock *sk);
+void tls_sw_free_resources_tx(struct sock *sk, bool locked);
 void tls_sw_free_resources_rx(struct sock *sk);
 void tls_sw_release_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -504,6 +506,16 @@ static inline void xor_iv_with_seq(int version, char *iv, char *seq)
 	}
 }
 
+static inline void tls_put_ctx(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tls_context *ctx = icsk->icsk_ulp_data;
+
+	if (!ctx)
+		return;
+	sk->sk_prot = ctx->sk_proto;
+	icsk->icsk_ulp_data = NULL;
+}
 
 static inline struct tls_sw_context_rx *tls_sw_ctx_rx(
 		const struct tls_context *tls_ctx)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 7e546b8ec000..54842d0ddbb5 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -261,23 +261,16 @@ static void tls_ctx_free(struct tls_context *ctx)
 	kfree(ctx);
 }
 
-static void tls_sk_proto_close(struct sock *sk, long timeout)
+static bool tls_sk_proto_destroy(struct sock *sk,
+				 struct tls_context *ctx, bool locked)
 {
-	struct tls_context *ctx = tls_get_ctx(sk);
 	long timeo = sock_sndtimeo(sk, 0);
-	void (*sk_proto_close)(struct sock *sk, long timeout);
-	bool free_ctx = false;
-
-	lock_sock(sk);
-	sk_proto_close = ctx->sk_proto_close;
 
 	if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
-		goto skip_tx_cleanup;
+		return false;
 
-	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) {
-		free_ctx = true;
-		goto skip_tx_cleanup;
-	}
+	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
+		return true;
 
 	if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
 		tls_handle_open_record(sk, 0);
@@ -286,7 +279,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	if (ctx->tx_conf == TLS_SW) {
 		kfree(ctx->tx.rec_seq);
 		kfree(ctx->tx.iv);
-		tls_sw_free_resources_tx(sk);
+		tls_sw_free_resources_tx(sk, locked);
 #ifdef CONFIG_TLS_DEVICE
 	} else if (ctx->tx_conf == TLS_HW) {
 		tls_device_free_resources_tx(sk);
@@ -310,8 +303,39 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		tls_ctx_free(ctx);
 		ctx = NULL;
 	}
+	return false;
+}
+
+static void tls_sk_proto_unhash(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	void (*sk_proto_unhash)(struct sock *sk);
+	bool free_ctx;
+
+	if (!ctx)
+		return sk->sk_prot->unhash(sk);
+	sk_proto_unhash = ctx->sk_proto_unhash;
+	free_ctx = tls_sk_proto_destroy(sk, ctx, false);
+	tls_put_ctx(sk);
+	if (sk_proto_unhash)
+		sk_proto_unhash(sk);
+	if (free_ctx)
+		tls_ctx_free(ctx);
+}
 
-skip_tx_cleanup:
+static void tls_sk_proto_close(struct sock *sk, long timeout)
+{
+	void (*sk_proto_close)(struct sock *sk, long timeout);
+	struct tls_context *ctx = tls_get_ctx(sk);
+	bool free_ctx;
+
+	if (!ctx)
+		return sk->sk_prot->destroy(sk);
+
+	lock_sock(sk);
+	sk_proto_close = ctx->sk_proto_close;
+	free_ctx = tls_sk_proto_destroy(sk, ctx, true);
+	tls_put_ctx(sk);
 	release_sock(sk);
 	sk_proto_close(sk, timeout);
 	/* free ctx for TLS_HW_RECORD, used by tcp_set_state
@@ -609,6 +633,8 @@ static struct tls_context *create_ctx(struct sock *sk)
 	ctx->setsockopt = sk->sk_prot->setsockopt;
 	ctx->getsockopt = sk->sk_prot->getsockopt;
 	ctx->sk_proto_close = sk->sk_prot->close;
+	ctx->sk_proto_unhash = sk->sk_prot->unhash;
+	ctx->sk_proto = sk->sk_prot;
 	return ctx;
 }
 
@@ -732,6 +758,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 	prot[TLS_BASE][TLS_BASE].setsockopt	= tls_setsockopt;
 	prot[TLS_BASE][TLS_BASE].getsockopt	= tls_getsockopt;
 	prot[TLS_BASE][TLS_BASE].close		= tls_sk_proto_close;
+	prot[TLS_BASE][TLS_BASE].unhash		= tls_sk_proto_unhash;
 
 	prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
 	prot[TLS_SW][TLS_BASE].sendmsg		= tls_sw_sendmsg;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f780b473827b..0577633c319b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2044,7 +2044,7 @@ static void tls_data_ready(struct sock *sk)
 	}
 }
 
-void tls_sw_free_resources_tx(struct sock *sk)
+void tls_sw_free_resources_tx(struct sock *sk, bool locked)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
@@ -2055,9 +2055,11 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	if (atomic_read(&ctx->encrypt_pending))
 		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
 
-	release_sock(sk);
+	if (locked)
+		release_sock(sk);
 	cancel_delayed_work_sync(&ctx->tx_work.work);
-	lock_sock(sk);
+	if (locked)
+		lock_sock(sk);
 
 	/* Tx whatever records we can transmit and abandon the rest */
 	tls_tx_records(sk, -1);
@@ -2080,7 +2082,10 @@ void tls_sw_free_resources_tx(struct sock *sk)
 		kfree(rec);
 	}
 
-	crypto_free_aead(ctx->aead_send);
+	if (ctx->aead_send) {
+		crypto_free_aead(ctx->aead_send);
+		ctx->aead_send = NULL;
+	}
 	tls_free_open_rec(sk);
 
 	kfree(ctx);


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [bpf PATCH v2 2/3] bpf: sockmap remove duplicate queue free
  2019-04-25 16:02 [bpf PATCH v2 0/3] sockmap/ktls fixes John Fastabend
  2019-04-25 16:03 ` [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED John Fastabend
@ 2019-04-25 16:03 ` John Fastabend
  2019-04-25 16:03 ` [bpf PATCH v2 3/3] bpf: sockmap fix msg->sg.size account on ingress skb John Fastabend
  2019-04-25 18:30 ` [bpf PATCH v2 0/3] sockmap/ktls fixes Jakub Kicinski
  3 siblings, 0 replies; 11+ messages in thread
From: John Fastabend @ 2019-04-25 16:03 UTC (permalink / raw)
  To: jakub.kicinski, john.fastabend, ast, daniel; +Cc: netdev, bpf

In tcp bpf remove we free the cork list and purge the ingress msg
list. However we do this before the ref count reaches zero so it
could be possible some other access is in progress. In this case
(tcp close and/or tcp_unhash) we happen to also hold the sock
lock, so no such path exists, but let's fix it anyway; otherwise it is
extremely fragile and breaks the reference counting rules. Also, we
already check the cork list and ingress msg queue and free them once
the ref count reaches zero, so it's wasteful to check twice.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/ipv4/tcp_bpf.c |    2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 1bb7321a256d..4a619c85daed 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -528,8 +528,6 @@ static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
 {
 	struct sk_psock_link *link;
 
-	sk_psock_cork_free(psock);
-	__sk_psock_purge_ingress_msg(psock);
 	while ((link = sk_psock_link_pop(psock))) {
 		sk_psock_unlink(sk, link);
 		sk_psock_free_link(link);


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [bpf PATCH v2 3/3] bpf: sockmap fix msg->sg.size account on ingress skb
  2019-04-25 16:02 [bpf PATCH v2 0/3] sockmap/ktls fixes John Fastabend
  2019-04-25 16:03 ` [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED John Fastabend
  2019-04-25 16:03 ` [bpf PATCH v2 2/3] bpf: sockmap remove duplicate queue free John Fastabend
@ 2019-04-25 16:03 ` John Fastabend
  2019-04-25 18:30 ` [bpf PATCH v2 0/3] sockmap/ktls fixes Jakub Kicinski
  3 siblings, 0 replies; 11+ messages in thread
From: John Fastabend @ 2019-04-25 16:03 UTC (permalink / raw)
  To: jakub.kicinski, john.fastabend, ast, daniel; +Cc: netdev, bpf

When converting a skb to msg->sg we forget to set the size after the
latest ktls/tls code conversion. This path can be reached by doing
a redir into ingress path from BPF skb sock recv hook. Then trying to
read the size fails.

Fix this by setting the size.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/core/skmsg.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index cc94d921476c..782ae9eb4dce 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -411,6 +411,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	sk_mem_charge(sk, skb->len);
 	copied = skb->len;
 	msg->sg.start = 0;
+	msg->sg.size = copied;
 	msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
 	msg->skb = skb;
 


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 0/3] sockmap/ktls fixes
  2019-04-25 16:02 [bpf PATCH v2 0/3] sockmap/ktls fixes John Fastabend
                   ` (2 preceding siblings ...)
  2019-04-25 16:03 ` [bpf PATCH v2 3/3] bpf: sockmap fix msg->sg.size account on ingress skb John Fastabend
@ 2019-04-25 18:30 ` Jakub Kicinski
  2019-04-25 18:49   ` John Fastabend
  3 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2019-04-25 18:30 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev, bpf

On Thu, 25 Apr 2019 09:02:50 -0700, John Fastabend wrote:
> Series of fixes for sockmap and ktls, see patches for descriptions.
> 
> v2: fix build issue for CONFIG_TLS_DEVICE and fixup couple comments from
>     Jakub.

Ah, right my comment about the rx side sleeping was fairly nonsensical,
the locking issue is that the work queue tries to lock the same socket.

But I'm hitting some nasties, there is a UAF on a non-offload socket,
and offload dies fairly hard.  It _could_ be my offload patches on top,
but "they worked yesterday".  Digging deeper on the offload side,
here's the UAF:

[  258.559962] =================================================================
[  258.568212] BUG: KASAN: use-after-free in tls_sk_proto_close+0x1a9/0x1e0 [tl]
[  258.576398] Read of size 8 at addr ffff88871d1edf18 by task ktls_source/2542
[  258.584369] 
[  258.586121] CPU: 18 PID: 2542 Comm: ktls_source Not tainted 5.1.0-rc5-debug-7
[  258.596445] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/177
[  258.604968] Call Trace:
[  258.607796]  dump_stack+0x7c/0xc0
[  258.611594]  print_address_description.cold.2+0x9/0x239
[  258.617528]  kasan_report.cold.3+0x78/0x92
[  258.622200]  ? tls_sk_proto_close+0x1a9/0x1e0 [tls]
[  258.627745]  ? tcp_check_oom+0x390/0x390
[  258.632221]  tls_sk_proto_close+0x1a9/0x1e0 [tls]
[  258.637573]  inet_release+0xd6/0x1b0
[  258.641661]  __sock_release+0xc0/0x290
[  258.645942]  sock_close+0x11/0x20
[  258.649735]  __fput+0x244/0x730
[  258.653341]  task_work_run+0xfe/0x180
[  258.657530]  exit_to_usermode_loop+0x10d/0x130
[  258.662589]  do_syscall_64+0x2ff/0x400
[  258.666875]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  258.672630] RIP: 0033:0x7fb42bbe2421
[  258.676723] Code: f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00
[  258.697857] RSP: 002b:00007fffaabd9428 EFLAGS: 00000246 ORIG_RAX: 00000000003
[  258.706526] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00007fb42bbe2421
[  258.714595] RDX: 00007fb41ffbf000 RSI: 000000000bebd000 RDI: 0000000000000003
[  258.722664] RBP: 0000000000000003 R08: 00000000ffffffff R09: 0000000000000000
[  258.730735] R10: 0000000000000022 R11: 0000000000000246 R12: 00007fb42b7df210
[  258.738805] R13: 00007fb41f923010 R14: 0000000000004113 R15: 0000000000000000
[  258.746875] 
[  258.748645] Allocated by task 2542:
[  258.752655]  create_ctx+0x46/0x2d0 [tls]
[  258.757129]  tls_init+0xd2/0x470 [tls]
[  258.761410]  tcp_set_ulp+0x235/0x4bf
[  258.765499]  do_tcp_setsockopt.isra.5+0x28b/0x1d90
[  258.770944]  __sys_setsockopt+0x10e/0x1d0
[  258.775514]  __x64_sys_setsockopt+0xba/0x150
[  258.780378]  do_syscall_64+0x96/0x400
[  258.784578]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  258.790308] 
[  258.792057] Freed by task 2542:
[  258.795656]  kfree+0xe5/0x300
[  258.799060]  tls_sk_proto_destroy+0x1c7/0x400 [tls]
[  258.804615]  tls_sk_proto_close+0x8a/0x1e0 [tls]
[  258.809870]  inet_release+0xd6/0x1b0
[  258.813953]  __sock_release+0xc0/0x290
[  258.818231]  sock_close+0x11/0x20
[  258.822023]  __fput+0x244/0x730
[  258.825620]  task_work_run+0xfe/0x180
[  258.829799]  exit_to_usermode_loop+0x10d/0x130
[  258.834855]  do_syscall_64+0x2ff/0x400
[  258.839136]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  258.844880] 
[  258.846649] The buggy address belongs to the object at ffff88871d1ede88
[  258.846649]  which belongs to the cache kmalloc-512 of size 512
[  258.860764] The buggy address is located 144 bytes inside of
[  258.860764]  512-byte region [ffff88871d1ede88, ffff88871d1ee088)
[  258.874002] The buggy address belongs to the page:
[  258.879450] page:ffffea001c747a00 count:1 mapcount:0 mapping:ffff88881e411080
[  258.892014] flags: 0x2ffff0000010200(slab|head)
[  258.897169] raw: 02ffff0000010200 ffffea001c88b208 ffffea00204bb208 ffff88880
[  258.905940] raw: ffff88871d1ed7c8 0000000000250019 00000001ffffffff 000000000
[  258.914711] page dumped because: kasan: bad access detected
[  258.921048] 
[  258.922797] Memory state around the buggy address:
[  258.928245]  ffff88871d1ede00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc c
[  258.936435]  ffff88871d1ede80: fc fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
[  258.944635] >ffff88871d1edf00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
[  258.952830]                             ^
[  258.957401]  ffff88871d1edf80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
[  258.965591]  ffff88871d1ee000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
[  258.973778] =================================================================

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 0/3] sockmap/ktls fixes
  2019-04-25 18:30 ` [bpf PATCH v2 0/3] sockmap/ktls fixes Jakub Kicinski
@ 2019-04-25 18:49   ` John Fastabend
  2019-04-25 19:12     ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: John Fastabend @ 2019-04-25 18:49 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: ast, daniel, netdev, bpf

On 4/25/19 11:30 AM, Jakub Kicinski wrote:
> On Thu, 25 Apr 2019 09:02:50 -0700, John Fastabend wrote:
>> Series of fixes for sockmap and ktls, see patches for descriptions.
>>
>> v2: fix build issue for CONFIG_TLS_DEVICE and fixup couple comments from
>>     Jakub.
> 
> Ah, right my comment about the rx side sleeping was fairly nonsensical,
> the locking issues is that the work queue tries to lock the same socket.
> 

Right.

> But I'm hitting some nasties, there is a UAF on a non-offload socket,
> and offload dies fairly hard.  It _could_ be my offload patches on top,
> but "they worked yesterday".  Digging deeper on the offload side,
> here's the UAF:

hmm OK I see what is happening. I could also only enable the unhash for
SW/SW  base proto. So only with,

  prot[TLS_SW][TLS_SW].unhash

There is this on the offload side did I smash it somehow?

   prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash       = tls_hw_unhash;

Also I have this in my stack,

commit 01628cbabdf2fbf0b710a399f54ae005d0963f3f (HEAD -> ktls-fixes,
refs/patches/ktls-fixes/bpf-sockmap-only-stop-strp-if)
Author: John Fastabend <john.fastabend@gmail.com>
Date:   Wed Apr 24 15:55:55 2019 -0700

    bpf: sockmap, only stop/flush strp if it was enabled at some point

    If we try to call strp_done on a parser that has never been
    initialized, because the sockmap user is only using TX side for
    example we get the following error.


      [  883.422081] WARNING: CPU: 1 PID: 208 at kernel/workqueue.c:3030
__flush_work+0x1ca/0x1e0
      ...
      [  883.422095] Workqueue: events sk_psock_destroy_deferred
      [  883.422097] RIP: 0010:__flush_work+0x1ca/0x1e0


    This had been wrapped in a 'if (psock->parser.enabled)' logic which
    was broken because the strp_done() was never actually being called
    because we do a strp_stop() earlier in the tear down logic will
    set parser.enabled to false. This could result in a use after free
    if work was still in the queue and was resolved by the patch here,
    1d79895aef18f ("sk_msg: Always cancel strp work before freeing the
    psock"). However, calling strp_stop(), done by the patch marked in
    the fixes tag, only is useful if we never initialized a strp parser
    program and never initialized the strp to start with. Because if
    we had initialized a stream parser strp_stop() would have been called
    by sk_psock_drop() earlier in the tear down process.  By forcing the
    strp to stop we get past the WARNING in strp_done that checks
    the stopped flag but calling cancel_work_sync on work that has never
    been initialized is also wrong and generates the warning above.

    To fix check if the parser program exists. If the program exists
    then the strp work has been initialized and must be sync'd and
    cancelled before free'ing any structures. If no program exists we
    never initialized the stream parser in the first place so skip the
    sync/cancel logic implemented by strp_done.

    Finally, remove the strp_done; it's not needed, and in the case
    where we are using the stream parser it has already been called.

    Fixes: e8e3437762ad9 ("bpf: Stop the psock parser before canceling
its work")
    Signed-off-by: John Fastabend <john.fastabend@gmail.com>

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 782ae9eb4dce..4b4b9ad4bb86 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -555,8 +555,12 @@ static void sk_psock_destroy_deferred(struct
work_struct *gc)
        struct sk_psock *psock = container_of(gc, struct sk_psock, gc);

        /* No sk_callback_lock since already detached. */
-       strp_stop(&psock->parser.strp);
-       strp_done(&psock->parser.strp);
+
+       /* Parser has been stopped */
+       if (psock->progs.skb_parser)
+               strp_stop(&psock->parser.strp);
+               strp_done(&psock->parser.strp);
+       }

        cancel_work_sync(&psock->work);


> 
> [  258.559962] =================================================================
> [  258.568212] BUG: KASAN: use-after-free in tls_sk_proto_close+0x1a9/0x1e0 [tl]
> [  258.576398] Read of size 8 at addr ffff88871d1edf18 by task ktls_source/2542
> [  258.584369] 
> [  258.586121] CPU: 18 PID: 2542 Comm: ktls_source Not tainted 5.1.0-rc5-debug-7
> [  258.596445] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/177
> [  258.604968] Call Trace:
> [  258.607796]  dump_stack+0x7c/0xc0
> [  258.611594]  print_address_description.cold.2+0x9/0x239
> [  258.617528]  kasan_report.cold.3+0x78/0x92
> [  258.622200]  ? tls_sk_proto_close+0x1a9/0x1e0 [tls]
> [  258.627745]  ? tcp_check_oom+0x390/0x390
> [  258.632221]  tls_sk_proto_close+0x1a9/0x1e0 [tls]
> [  258.637573]  inet_release+0xd6/0x1b0
> [  258.641661]  __sock_release+0xc0/0x290
> [  258.645942]  sock_close+0x11/0x20
> [  258.649735]  __fput+0x244/0x730
> [  258.653341]  task_work_run+0xfe/0x180
> [  258.657530]  exit_to_usermode_loop+0x10d/0x130
> [  258.662589]  do_syscall_64+0x2ff/0x400
> [  258.666875]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  258.672630] RIP: 0033:0x7fb42bbe2421
> [  258.676723] Code: f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00
> [  258.697857] RSP: 002b:00007fffaabd9428 EFLAGS: 00000246 ORIG_RAX: 00000000003
> [  258.706526] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00007fb42bbe2421
> [  258.714595] RDX: 00007fb41ffbf000 RSI: 000000000bebd000 RDI: 0000000000000003
> [  258.722664] RBP: 0000000000000003 R08: 00000000ffffffff R09: 0000000000000000
> [  258.730735] R10: 0000000000000022 R11: 0000000000000246 R12: 00007fb42b7df210
> [  258.738805] R13: 00007fb41f923010 R14: 0000000000004113 R15: 0000000000000000
> [  258.746875] 
> [  258.748645] Allocated by task 2542:
> [  258.752655]  create_ctx+0x46/0x2d0 [tls]
> [  258.757129]  tls_init+0xd2/0x470 [tls]
> [  258.761410]  tcp_set_ulp+0x235/0x4bf
> [  258.765499]  do_tcp_setsockopt.isra.5+0x28b/0x1d90
> [  258.770944]  __sys_setsockopt+0x10e/0x1d0
> [  258.775514]  __x64_sys_setsockopt+0xba/0x150
> [  258.780378]  do_syscall_64+0x96/0x400
> [  258.784578]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  258.790308] 
> [  258.792057] Freed by task 2542:
> [  258.795656]  kfree+0xe5/0x300
> [  258.799060]  tls_sk_proto_destroy+0x1c7/0x400 [tls]
> [  258.804615]  tls_sk_proto_close+0x8a/0x1e0 [tls]
> [  258.809870]  inet_release+0xd6/0x1b0
> [  258.813953]  __sock_release+0xc0/0x290
> [  258.818231]  sock_close+0x11/0x20
> [  258.822023]  __fput+0x244/0x730
> [  258.825620]  task_work_run+0xfe/0x180
> [  258.829799]  exit_to_usermode_loop+0x10d/0x130
> [  258.834855]  do_syscall_64+0x2ff/0x400
> [  258.839136]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [  258.844880] 
> [  258.846649] The buggy address belongs to the object at ffff88871d1ede88
> [  258.846649]  which belongs to the cache kmalloc-512 of size 512
> [  258.860764] The buggy address is located 144 bytes inside of
> [  258.860764]  512-byte region [ffff88871d1ede88, ffff88871d1ee088)
> [  258.874002] The buggy address belongs to the page:
> [  258.879450] page:ffffea001c747a00 count:1 mapcount:0 mapping:ffff88881e411080
> [  258.892014] flags: 0x2ffff0000010200(slab|head)
> [  258.897169] raw: 02ffff0000010200 ffffea001c88b208 ffffea00204bb208 ffff88880
> [  258.905940] raw: ffff88871d1ed7c8 0000000000250019 00000001ffffffff 000000000
> [  258.914711] page dumped because: kasan: bad access detected
> [  258.921048] 
> [  258.922797] Memory state around the buggy address:
> [  258.928245]  ffff88871d1ede00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc c
> [  258.936435]  ffff88871d1ede80: fc fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
> [  258.944635] >ffff88871d1edf00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
> [  258.952830]                             ^
> [  258.957401]  ffff88871d1edf80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
> [  258.965591]  ffff88871d1ee000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb b
> [  258.973778] =================================================================
> 


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 0/3] sockmap/ktls fixes
  2019-04-25 18:49   ` John Fastabend
@ 2019-04-25 19:12     ` Jakub Kicinski
  0 siblings, 0 replies; 11+ messages in thread
From: Jakub Kicinski @ 2019-04-25 19:12 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev, bpf

On Thu, 25 Apr 2019 11:49:18 -0700, John Fastabend wrote:
> On 4/25/19 11:30 AM, Jakub Kicinski wrote:
> > On Thu, 25 Apr 2019 09:02:50 -0700, John Fastabend wrote:  
> >> Series of fixes for sockmap and ktls, see patches for descriptions.
> >>
> >> v2: fix build issue for CONFIG_TLS_DEVICE and fixup couple comments from
> >>     Jakub.  
> > 
> > Ah, right my comment about the rx side sleeping was fairly nonsensical,
> > the locking issues is that the work queue tries to lock the same socket.
> >   
> 
> Right.
> 
> > But I'm hitting some nasties, there is a UAF on a non-offload socket,
> > and offload dies fairly hard.  It _could_ be my offload patches on top,
> > but "they worked yesterday".  Digging deeper on the offload side,
> > here's the UAF:  
> 
> hmm OK I see what is happening. I could also only enable the unhash for
> SW/SW  base proto. So only with,
> 
>   prot[TLS_SW][TLS_SW].unhash
> 
> There is this on the offload side did I smash it somehow?
> 
>    prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash       = tls_hw_unhash;


Um, I think you're good there, note that the TLS_HW_RECORD thing is not
the nice packet-based offload, it's the TOE stuff from Chelsio.  I'm
using TLS_HW.

> Also I have this in my stack,

Thanks, I will toss this in.

> commit 01628cbabdf2fbf0b710a399f54ae005d0963f3f (HEAD -> ktls-fixes,
> refs/patches/ktls-fixes/bpf-sockmap-only-stop-strp-if)
> Author: John Fastabend <john.fastabend@gmail.com>
> Date:   Wed Apr 24 15:55:55 2019 -0700
> 
>     bpf: sockmap, only stop/flush strp if it was enabled at some point
> 
>     If we try to call strp_done on a parser that has never been
>     initialized, because the sockmap user is only using TX side for
>     example we get the following error.
> 
> 
>       [  883.422081] WARNING: CPU: 1 PID: 208 at kernel/workqueue.c:3030
> __flush_work+0x1ca/0x1e0
>       ...
>       [  883.422095] Workqueue: events sk_psock_destroy_deferred
>       [  883.422097] RIP: 0010:__flush_work+0x1ca/0x1e0
> 
> 
>     This had been wrapped in a 'if (psock->parser.enabled)' logic which
>     was broken because the strp_done() was never actually being called
>     because we do a strp_stop() earlier in the tear down logic will
>     set parser.enabled to false. This could result in a use after free
>     if work was still in the queue and was resolved by the patch here,
>     1d79895aef18f ("sk_msg: Always cancel strp work before freeing the
>     psock"). However, calling strp_stop(), done by the patch marked in
>     the fixes tag, only is useful if we never initialized a strp parser
>     program and never initialized the strp to start with. Because if
>     we had initialized a stream parser strp_stop() would have beencalled
>     by sk_psock_drop() earlier in the tear down process.  By forcing the
>     strp to stop we get past the WARNING in strp_done that checks
>     the stopped flag but calling cancel_work_sync on work that has never
>     been initialized is also wrong and generates the warning above.
> 
>     To fix check if the parser program exists. If the program exists
>     then the strp work has been initialized and must be sync'd and
>     cancelled before free'ing any structures. If no program exists we
>     never initialized the stream parser in the first place so skip the
>     sync/cancel logic implemented by strp_done.
> 
>     Finally, remove the strp_done its not needed and in the case where
>     we are using the
>     stream parser has already been called.
> 
>     Fixes: e8e3437762ad9 ("bpf: Stop the psock parser before canceling
> its work")
>     Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> 
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 782ae9eb4dce..4b4b9ad4bb86 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -555,8 +555,12 @@ static void sk_psock_destroy_deferred(struct
> work_struct *gc)
>         struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
> 
>         /* No sk_callback_lock since already detached. */
> -       strp_stop(&psock->parser.strp);
> -       strp_done(&psock->parser.strp);
> +
> +       /* Parser has been stopped */
> +       if (psock->progs.skb_parser)
> +               strp_stop(&psock->parser.strp);
> +               strp_done(&psock->parser.strp);
> +       }
> 
>         cancel_work_sync(&psock->work);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED
  2019-04-25 16:03 ` [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED John Fastabend
@ 2019-04-25 19:29   ` Jakub Kicinski
  2019-04-25 19:32     ` John Fastabend
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2019-04-25 19:29 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev, bpf

On Thu, 25 Apr 2019 09:03:08 -0700, John Fastabend wrote:
> +static void tls_sk_proto_unhash(struct sock *sk)
> +{
> +	struct tls_context *ctx = tls_get_ctx(sk);
> +	void (*sk_proto_unhash)(struct sock *sk);
> +	bool free_ctx;
> +
> +	if (!ctx)
> +		return sk->sk_prot->unhash(sk);
> +	sk_proto_unhash = ctx->sk_proto_unhash;
> +	free_ctx = tls_sk_proto_destroy(sk, ctx, false);
> +	tls_put_ctx(sk);

Oh, I think you can't put_ctx() unconditionally,
when free_ctx is false, tls_device_sk_destruct() 
needs the ctx pointer.

I think this explains the offload crashing.

> +	if (sk_proto_unhash)
> +		sk_proto_unhash(sk);
> +	if (free_ctx)
> +		tls_ctx_free(ctx);
> +}
>  
> -skip_tx_cleanup:
> +static void tls_sk_proto_close(struct sock *sk, long timeout)
> +{
> +	void (*sk_proto_close)(struct sock *sk, long timeout);
> +	struct tls_context *ctx = tls_get_ctx(sk);
> +	bool free_ctx;
> +
> +	if (!ctx)
> +		return sk->sk_prot->destroy(sk);
> +
> +	lock_sock(sk);
> +	sk_proto_close = ctx->sk_proto_close;
> +	free_ctx = tls_sk_proto_destroy(sk, ctx, true);
> +	tls_put_ctx(sk);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED
  2019-04-25 19:29   ` Jakub Kicinski
@ 2019-04-25 19:32     ` John Fastabend
  2019-04-25 19:35       ` John Fastabend
  0 siblings, 1 reply; 11+ messages in thread
From: John Fastabend @ 2019-04-25 19:32 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: ast, daniel, netdev, bpf

On 4/25/19 12:29 PM, Jakub Kicinski wrote:
> On Thu, 25 Apr 2019 09:03:08 -0700, John Fastabend wrote:
>> +static void tls_sk_proto_unhash(struct sock *sk)
>> +{
>> +	struct tls_context *ctx = tls_get_ctx(sk);
>> +	void (*sk_proto_unhash)(struct sock *sk);
>> +	bool free_ctx;
>> +
>> +	if (!ctx)
>> +		return sk->sk_prot->unhash(sk);
>> +	sk_proto_unhash = ctx->sk_proto_unhash;
>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, false);
>> +	tls_put_ctx(sk);
> 
> Oh, I think you can't put_ctx() unconditionally,
> when free_ctx is false, tls_device_sk_destruct() 
> needs it the ctx pointer.
> 
> I think this explains the offload crashing.
> 

ugh yeah. So we need to _not_ free it from tls_sk_proto_destroy,
do the put_ctx, and then finally free it. Otherwise we can't
restore the sk_proto fields. v3 on its way. Thanks.

>> +	if (sk_proto_unhash)
>> +		sk_proto_unhash(sk);
>> +	if (free_ctx)
>> +		tls_ctx_free(ctx);
>> +}
>>  
>> -skip_tx_cleanup:
>> +static void tls_sk_proto_close(struct sock *sk, long timeout)
>> +{
>> +	void (*sk_proto_close)(struct sock *sk, long timeout);
>> +	struct tls_context *ctx = tls_get_ctx(sk);
>> +	bool free_ctx;
>> +
>> +	if (!ctx)
>> +		return sk->sk_prot->destroy(sk);
>> +
>> +	lock_sock(sk);
>> +	sk_proto_close = ctx->sk_proto_close;
>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, true);
>> +	tls_put_ctx(sk);


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED
  2019-04-25 19:32     ` John Fastabend
@ 2019-04-25 19:35       ` John Fastabend
  2019-04-25 19:41         ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: John Fastabend @ 2019-04-25 19:35 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: ast, daniel, netdev, bpf

On 4/25/19 12:32 PM, John Fastabend wrote:
> On 4/25/19 12:29 PM, Jakub Kicinski wrote:
>> On Thu, 25 Apr 2019 09:03:08 -0700, John Fastabend wrote:
>>> +static void tls_sk_proto_unhash(struct sock *sk)
>>> +{
>>> +	struct tls_context *ctx = tls_get_ctx(sk);
>>> +	void (*sk_proto_unhash)(struct sock *sk);
>>> +	bool free_ctx;
>>> +
>>> +	if (!ctx)
>>> +		return sk->sk_prot->unhash(sk);
>>> +	sk_proto_unhash = ctx->sk_proto_unhash;
>>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, false);
>>> +	tls_put_ctx(sk);
>>
>> Oh, I think you can't put_ctx() unconditionally,
>> when free_ctx is false, tls_device_sk_destruct() 
>> needs it the ctx pointer.
>>
>> I think this explains the offload crashing.
>>
> 
> ugh yeah. So we need to _not_ free it from tls_sk_proto_destroy
> do the put_ctx and then finally free it. Otherwise we can't
> restore the sk_proto fields. v3 on its way. Thanks.
> 

I'm going to throw that patch I sent earlier in this thread
on the series as well. It's the minimal set to get things working
again for me. Will follow up with some selftests so we don't get
here again.

>>> +	if (sk_proto_unhash)
>>> +		sk_proto_unhash(sk);
>>> +	if (free_ctx)
>>> +		tls_ctx_free(ctx);
>>> +}
>>>  
>>> -skip_tx_cleanup:
>>> +static void tls_sk_proto_close(struct sock *sk, long timeout)
>>> +{
>>> +	void (*sk_proto_close)(struct sock *sk, long timeout);
>>> +	struct tls_context *ctx = tls_get_ctx(sk);
>>> +	bool free_ctx;
>>> +
>>> +	if (!ctx)
>>> +		return sk->sk_prot->destroy(sk);
>>> +
>>> +	lock_sock(sk);
>>> +	sk_proto_close = ctx->sk_proto_close;
>>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, true);
>>> +	tls_put_ctx(sk);
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED
  2019-04-25 19:35       ` John Fastabend
@ 2019-04-25 19:41         ` Jakub Kicinski
  0 siblings, 0 replies; 11+ messages in thread
From: Jakub Kicinski @ 2019-04-25 19:41 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev, bpf

On Thu, 25 Apr 2019 12:35:58 -0700, John Fastabend wrote:
> On 4/25/19 12:32 PM, John Fastabend wrote:
> > On 4/25/19 12:29 PM, Jakub Kicinski wrote:  
> >> On Thu, 25 Apr 2019 09:03:08 -0700, John Fastabend wrote:  
> >>> +static void tls_sk_proto_unhash(struct sock *sk)
> >>> +{
> >>> +	struct tls_context *ctx = tls_get_ctx(sk);
> >>> +	void (*sk_proto_unhash)(struct sock *sk);
> >>> +	bool free_ctx;
> >>> +
> >>> +	if (!ctx)
> >>> +		return sk->sk_prot->unhash(sk);
> >>> +	sk_proto_unhash = ctx->sk_proto_unhash;
> >>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, false);
> >>> +	tls_put_ctx(sk);  
> >>
> >> Oh, I think you can't put_ctx() unconditionally,
> >> when free_ctx is false, tls_device_sk_destruct() 
> >> needs it the ctx pointer.
> >>
> >> I think this explains the offload crashing.
> >>  
> > 
> > ugh yeah. So we need to _not_ free it from tls_sk_proto_destroy
> > do the put_ctx and then finally free it. Otherwise we can't
> > restore the sk_proto fields. v3 on its way. Thanks.
> >   
> 
> I'm going to throw that patch I sent earlier in this thread
> on the series as well. Its the minimal set to get things working
> again for me. Will follow up some selftests so we don't get
> here again.

SGTM, I've been racking my brain trying to come up with a good test for
the offload stuff, because it's really hard to test that without actual
HW.  I don't see any other way than adding full on per-packet crypto
logic into netdevsim or such :/  Trying to lie about having offloaded
the crypto breaks down in corner cases.

> >>> +	if (sk_proto_unhash)
> >>> +		sk_proto_unhash(sk);
> >>> +	if (free_ctx)
> >>> +		tls_ctx_free(ctx);
> >>> +}
> >>>  
> >>> -skip_tx_cleanup:
> >>> +static void tls_sk_proto_close(struct sock *sk, long timeout)
> >>> +{
> >>> +	void (*sk_proto_close)(struct sock *sk, long timeout);
> >>> +	struct tls_context *ctx = tls_get_ctx(sk);
> >>> +	bool free_ctx;
> >>> +
> >>> +	if (!ctx)
> >>> +		return sk->sk_prot->destroy(sk);
> >>> +
> >>> +	lock_sock(sk);
> >>> +	sk_proto_close = ctx->sk_proto_close;
> >>> +	free_ctx = tls_sk_proto_destroy(sk, ctx, true);
> >>> +	tls_put_ctx(sk);  

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-04-25 19:41 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-25 16:02 [bpf PATCH v2 0/3] sockmap/ktls fixes John Fastabend
2019-04-25 16:03 ` [bpf PATCH v2 1/3] bpf: tls, implement unhash to avoid transition out of ESTABLISHED John Fastabend
2019-04-25 19:29   ` Jakub Kicinski
2019-04-25 19:32     ` John Fastabend
2019-04-25 19:35       ` John Fastabend
2019-04-25 19:41         ` Jakub Kicinski
2019-04-25 16:03 ` [bpf PATCH v2 2/3] bpf: sockmap remove duplicate queue free John Fastabend
2019-04-25 16:03 ` [bpf PATCH v2 3/3] bpf: sockmap fix msg->sg.size account on ingress skb John Fastabend
2019-04-25 18:30 ` [bpf PATCH v2 0/3] sockmap/ktls fixes Jakub Kicinski
2019-04-25 18:49   ` John Fastabend
2019-04-25 19:12     ` Jakub Kicinski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.