All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data
@ 2018-11-23  1:38 John Fastabend
  2018-11-23  1:38 ` [PATCH bpf-next 1/3] bpf: helper to pop data from messages John Fastabend
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: John Fastabend @ 2018-11-23  1:38 UTC (permalink / raw)
  To: ast, daniel; +Cc: john.fastabend, netdev

After being able to add metadata to messages with sk_msg_push_data we
have also found it useful to be able to "pop" this metadata off before
sending it to applications in some cases. This series adds a new helper
sk_msg_pop_data() and the associated patches to add tests and tools/lib
support.

Thanks!

John Fastabend (3):
  bpf: helper to pop data from messages
  bpf: add msg_pop_data helper to tools
  bpf: test_sockmap, add options for msg_pop_data() helper usage

 include/uapi/linux/bpf.h                        |  13 +-
 net/core/filter.c                               | 169 ++++++++++++++++++++++++
 net/ipv4/tcp_bpf.c                              |  14 +-
 tools/include/uapi/linux/bpf.h                  |  13 +-
 tools/testing/selftests/bpf/bpf_helpers.h       |   2 +
 tools/testing/selftests/bpf/test_sockmap.c      | 127 +++++++++++++++++-
 tools/testing/selftests/bpf/test_sockmap_kern.h |  70 ++++++++--
 7 files changed, 386 insertions(+), 22 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH bpf-next 1/3] bpf: helper to pop data from messages
  2018-11-23  1:38 [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data John Fastabend
@ 2018-11-23  1:38 ` John Fastabend
  2018-11-26  1:05   ` Daniel Borkmann
  2018-11-23  1:38 ` [PATCH bpf-next 2/3] bpf: add msg_pop_data helper to tools John Fastabend
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: John Fastabend @ 2018-11-23  1:38 UTC (permalink / raw)
  To: ast, daniel; +Cc: john.fastabend, netdev

This adds a BPF SK_MSG program helper so that we can pop data from a
msg. We use this to pop metadata from a previous push data call.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/uapi/linux/bpf.h |  13 +++-
 net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_bpf.c       |  14 +++-
 3 files changed, 192 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c1554aa..64681f8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,6 +2268,16 @@ union bpf_attr {
  *
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	 Description
+ *		Will remove 'pop' bytes from a msg starting at byte 'start'.
+ *		This result in ENOMEM errors under certain situations where
+ *		a allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either do to start byte not being valid part of msg
+ *		payload and/or pop value being to large.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2360,7 +2370,8 @@ union bpf_attr {
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index f6ca38a..c6b35b5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2428,6 +2428,173 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+	int prev;
+
+	do {
+		prev = i;
+		sk_msg_iter_var_next(i);
+		msg->sg.data[prev] = msg->sg.data[i];
+	} while (i != msg->sg.end);
+
+	sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+	struct scatterlist tmp, sge;
+
+	sk_msg_iter_next(msg, end);
+	sge = sk_msg_elem_cpy(msg, i);
+	sk_msg_iter_var_next(i);
+	tmp = sk_msg_elem_cpy(msg, i);
+
+	while (i != msg->sg.end) {
+		msg->sg.data[i] = sge;
+		sk_msg_iter_var_next(i);
+		sge = tmp;
+		tmp = sk_msg_elem_cpy(msg, i);
+	}
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	u32 i = 0, l, space, offset = 0;
+	u64 last = start + len;
+	int pop;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	/* Bounds checks: start and pop must be inside message */
+	if (start >= offset + l || last >= msg->sg.size)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	pop = len;
+	/* --------------| offset
+	 * -| start      |------- len ------|
+	 *
+	 *  |----- a ----|-------- pop -------|----- b ----|
+	 *  |______________________________________________| length
+	 *
+	 *
+	 * a:   region at front of scatter element to save
+	 * b:   region at back of scatter element to save when length > A + pop
+	 * pop: region to pop from element, same as input 'pop' here will be
+	 *      decremented below per iteration.
+	 *
+	 * Two top-level cases to handle when start != offset, first B is non
+	 * zero and second B is zero corresponding to when a pop includes more
+	 * than one element.
+	 *
+	 * Then if B is non-zero AND there is no space allocate space and
+	 * compact A, B regions into page. If there is space shift ring to
+	 * the rigth free'ing the next element in ring to place B, leaving
+	 * A untouched except to reduce length.
+	 */
+	if (start != offset) {
+		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+		int a = start;
+		int b = sge->length - pop - a;
+
+		sk_msg_iter_var_next(i);
+
+		if (pop < sge->length - a) {
+			if (space) {
+				sge->length = a;
+				sk_msg_shift_right(msg, i);
+				nsge = sk_msg_elem(msg, i);
+				get_page(sg_page(sge));
+				sg_set_page(nsge,
+					    sg_page(sge), b, offset + pop);
+			} else {
+				struct page *page, *orig;
+				u8 *to, *from;
+
+				page = alloc_pages(__GFP_NOWARN |
+						   __GFP_COMP   | GFP_ATOMIC,
+						   get_order(a + b));
+				if (unlikely(!page))
+					return -ENOMEM;
+
+				sge->length = a;
+				orig = sg_page(sge);
+				from = sg_virt(sge);
+				to = page_address(page);
+				memcpy(to, from, a);
+				memcpy(to + a, from + a + pop, b);
+				sg_set_page(sge, page, a + b, 0);
+				put_page(orig);
+			}
+			pop = 0;
+		} else if (pop >= sge->length - a) {
+			sge->length = a;
+			pop -= (sge->length - a);
+		}
+	}
+
+	/* From above the current layout _must_ be as follows,
+	 *
+	 * -| offset
+	 * -| start
+	 *
+	 *  |---- pop ---|---------------- b ------------|
+	 *  |____________________________________________| length
+	 *
+	 * Offset and start of the current msg elem are equal because in the
+	 * previous case we handled offset != start and either consumed the
+	 * entire element and advanced to the next element OR pop == 0.
+	 *
+	 * Two cases to handle here are first pop is less than the length
+	 * leaving some remainder b above. Simply adjust the element's layout
+	 * in this case. Or pop >= length of the element so that b = 0. In this
+	 * case advance to next element decrementing pop.
+	 */
+	while (pop) {
+		struct scatterlist *sge = sk_msg_elem(msg, i);
+
+		if (pop < sge->length) {
+			sge->length -= pop;
+			sge->offset += pop;
+			pop = 0;
+		} else {
+			pop -= sge->length;
+			sk_msg_shift_left(msg, i);
+		}
+		sk_msg_iter_var_next(i);
+	}
+
+	sk_mem_uncharge(msg->sk, len - pop);
+	msg->sg.size -= (len - pop);
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+	.func		= bpf_msg_pop_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -5397,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_pull_data_proto;
 	case BPF_FUNC_msg_push_data:
 		return &bpf_msg_push_data_proto;
+	case BPF_FUNC_msg_pop_data:
+		return &bpf_msg_pop_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3b45fe5..0286a18 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -289,12 +289,20 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 {
 	bool cork = false, enospc = msg->sg.start == msg->sg.end;
 	struct sock *sk_redir;
-	u32 tosend;
+	u32 tosend, delta = 0;
 	int ret;
 
 more_data:
-	if (psock->eval == __SK_NONE)
+	if (psock->eval == __SK_NONE) {
+		/* Track delta in msg size to add/subtract it on SK_DROP from
+		 * returned to user copied size. This ensures user doesn't
+		 * get a positive return code with msg_cut_data and SK_DROP
+		 * verdict.
+		 */
+		delta = msg->sg.size;
 		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		delta -= msg->sg.size;
+	}
 
 	if (msg->cork_bytes &&
 	    msg->cork_bytes > msg->sg.size && !enospc) {
@@ -350,7 +358,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 	default:
 		sk_msg_free_partial(sk, msg, tosend);
 		sk_msg_apply_bytes(psock, tosend);
-		*copied -= tosend;
+		*copied -= (tosend + delta);
 		return -EACCES;
 	}
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH bpf-next 2/3] bpf: add msg_pop_data helper to tools
  2018-11-23  1:38 [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data John Fastabend
  2018-11-23  1:38 ` [PATCH bpf-next 1/3] bpf: helper to pop data from messages John Fastabend
@ 2018-11-23  1:38 ` John Fastabend
  2018-11-23  1:38 ` [PATCH bpf-next 3/3] bpf: test_sockmap, add options for msg_pop_data() John Fastabend
  2018-11-26  0:45 ` [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data Daniel Borkmann
  3 siblings, 0 replies; 9+ messages in thread
From: John Fastabend @ 2018-11-23  1:38 UTC (permalink / raw)
  To: ast, daniel; +Cc: john.fastabend, netdev

Add the necessary header definitions to tools for new
msg_pop_data_helper.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/include/uapi/linux/bpf.h            | 13 ++++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c1554aa..95cf7a5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2268,6 +2268,16 @@ union bpf_attr {
  *
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ * 	Description
+ * 		Will remove 'pop' bytes from a msg starting at byte 'start'.
+ * 		This result in ENOMEM errors under certain situations where
+ * 		a allocation and copy are required due to a full ring buffer.
+ * 		However, the helper will try to avoid doing the allocation
+ * 		if possible. Other errors can occur if input parameters are
+ * 		invalid either do to start byte not being valid part of msg
+ * 		payload and/or pop value being to large.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2360,7 +2370,8 @@ union bpf_attr {
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 686e57c..7b69519 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -113,6 +113,8 @@ static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
 static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_push_data;
+static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) =
+	(void *) BPF_FUNC_msg_pop_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH bpf-next 3/3] bpf: test_sockmap, add options for msg_pop_data()
  2018-11-23  1:38 [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data John Fastabend
  2018-11-23  1:38 ` [PATCH bpf-next 1/3] bpf: helper to pop data from messages John Fastabend
  2018-11-23  1:38 ` [PATCH bpf-next 2/3] bpf: add msg_pop_data helper to tools John Fastabend
@ 2018-11-23  1:38 ` John Fastabend
  2018-11-26  0:45 ` [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data Daniel Borkmann
  3 siblings, 0 replies; 9+ messages in thread
From: John Fastabend @ 2018-11-23  1:38 UTC (permalink / raw)
  To: ast, daniel; +Cc: john.fastabend, netdev

Similar to msg_pull_data and msg_push_data add a set of options to
have msg_pop_data() exercised.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/testing/selftests/bpf/test_sockmap.c      | 127 +++++++++++++++++++++++-
 tools/testing/selftests/bpf/test_sockmap_kern.h |  70 ++++++++++---
 2 files changed, 180 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 622ade0..e85a771 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -79,6 +79,8 @@ int txmsg_start;
 int txmsg_end;
 int txmsg_start_push;
 int txmsg_end_push;
+int txmsg_start_pop;
+int txmsg_pop;
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
@@ -104,6 +106,8 @@ static const struct option long_options[] = {
 	{"txmsg_end",	required_argument,	NULL, 'e'},
 	{"txmsg_start_push", required_argument,	NULL, 'p'},
 	{"txmsg_end_push",   required_argument,	NULL, 'q'},
+	{"txmsg_start_pop",  required_argument,	NULL, 'w'},
+	{"txmsg_pop",	     required_argument,	NULL, 'x'},
 	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
 	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
 	{"ktls", no_argument,			&ktls, 1 },
@@ -473,13 +477,27 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
 		int slct, recvp = 0, recv, max_fd = fd;
+		float total_bytes, txmsg_pop_total;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
-		float total_bytes;
 		fd_set w;
 
 		fcntl(fd, fd_flags);
+		/* Account for pop bytes noting each iteration of apply will
+		 * call msg_pop_data helper so we need to account for this
+		 * by calculating the number of apply iterations. Note user
+		 * of the tool can create cases where no data is sent by
+		 * manipulating pop/push/pull/etc. For example txmsg_apply 1
+		 * with txmsg_pop 1 will try to apply 1B at a time but each
+		 * iteration will then pop 1B so no data will ever be sent.
+		 * This is really only useful for testing edge cases in code
+		 * paths.
+		 */
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		txmsg_pop_total = txmsg_pop;
+		if (txmsg_apply)
+			txmsg_pop_total *= (total_bytes / txmsg_apply);
+		total_bytes -= txmsg_pop_total;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
 		if (err < 0)
 			perror("recv start time: ");
@@ -488,7 +506,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				timeout.tv_sec = 0;
 				timeout.tv_usec = 300000;
 			} else {
-				timeout.tv_sec = 1;
+				timeout.tv_sec = 3;
 				timeout.tv_usec = 0;
 			}
 
@@ -503,7 +521,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				goto out_errno;
 			} else if (!slct) {
 				if (opt->verbose)
-					fprintf(stderr, "unexpected timeout\n");
+					fprintf(stderr, "unexpected timeout: recved %zu/%f pop_total %f\n", s->bytes_recvd, total_bytes, txmsg_pop_total);
 				errno = -EIO;
 				clock_gettime(CLOCK_MONOTONIC, &s->end);
 				goto out_errno;
@@ -619,7 +637,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 			iov_count = 1;
 		err = msg_loop(rx_fd, iov_count, iov_buf,
 			       cnt, &s, false, opt);
-		if (err && opt->verbose)
+		if (opt->verbose)
 			fprintf(stderr,
 				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
 				iov_count, iov_buf, cnt, err);
@@ -931,6 +949,39 @@ static int run_options(struct sockmap_options *options, int cg_fd,  int test)
 			}
 		}
 
+		if (txmsg_start_pop) {
+			i = 4;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start_pop, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem %i@%i (txmsg_start_pop):  %d (%s)\n",
+					txmsg_start_pop, i, err, strerror(errno));
+				goto out;
+			}
+		} else {
+			i = 4;
+			bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start_pop, BPF_ANY);
+		}
+
+		if (txmsg_pop) {
+			i = 5;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_pop, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem %i@%i (txmsg_pop):  %d (%s)\n",
+					txmsg_pop, i, err, strerror(errno));
+				goto out;
+			}
+		} else {
+			i = 5;
+			bpf_map_update_elem(map_fd[5],
+					    &i, &txmsg_pop, BPF_ANY);
+
+		}
+
 		if (txmsg_ingress) {
 			int in = BPF_F_INGRESS;
 
@@ -1082,6 +1133,11 @@ static void test_options(char *options)
 		snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
 		strncat(options, tstr, OPTSTRING);
 	}
+	if (txmsg_start_pop) {
+		snprintf(tstr, OPTSTRING, "pop (%d,%d),",
+			 txmsg_start_pop, txmsg_start_pop + txmsg_pop);
+		strncat(options, tstr, OPTSTRING);
+	}
 	if (txmsg_ingress)
 		strncat(options, "ingress,", OPTSTRING);
 	if (txmsg_skb)
@@ -1264,6 +1320,7 @@ static int test_mixed(int cgrp)
 	txmsg_apply = txmsg_cork = 0;
 	txmsg_start = txmsg_end = 0;
 	txmsg_start_push = txmsg_end_push = 0;
+	txmsg_start_pop = txmsg_pop = 0;
 
 	/* Test small and large iov_count values with pass/redir/apply/cork */
 	txmsg_pass = 1;
@@ -1383,6 +1440,19 @@ static int test_start_end(int cgrp)
 	txmsg_end = 2;
 	txmsg_start_push = 1;
 	txmsg_end_push = 2;
+	txmsg_start_pop = 1;
+	txmsg_pop = 1;
+	err = test_txmsg(cgrp);
+	if (err)
+		goto out;
+
+	/* Cut a byte of pushed data but leave reamining in place */
+	txmsg_start = 1;
+	txmsg_end = 2;
+	txmsg_start_push = 1;
+	txmsg_end_push = 3;
+	txmsg_start_pop = 1;
+	txmsg_pop = 1;
 	err = test_txmsg(cgrp);
 	if (err)
 		goto out;
@@ -1393,6 +1463,9 @@ static int test_start_end(int cgrp)
 	opt.iov_length = 100;
 	txmsg_cork = 1600;
 
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	for (i = 99; i <= 1600; i += 500) {
 		txmsg_start = 0;
 		txmsg_end = i;
@@ -1403,6 +1476,17 @@ static int test_start_end(int cgrp)
 			goto out;
 	}
 
+	/* Test pop data in middle of cork */
+	for (i = 99; i <= 1600; i += 500) {
+		txmsg_start_pop = 10;
+		txmsg_pop = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	/* Test start/end with cork but pull data in middle */
 	for (i = 199; i <= 1600; i += 500) {
 		txmsg_start = 100;
@@ -1423,6 +1507,15 @@ static int test_start_end(int cgrp)
 	if (err)
 		goto out;
 
+	/* Test pop with cork pulling last sg entry */
+	txmsg_start_pop = 1500;
+	txmsg_pop = 1600;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
 	/* Test start/end pull of single byte in last page */
 	txmsg_start = 1111;
 	txmsg_end = 1112;
@@ -1432,6 +1525,13 @@ static int test_start_end(int cgrp)
 	if (err)
 		goto out;
 
+	/* Test pop of single byte in last page */
+	txmsg_start_pop = 1111;
+	txmsg_pop = 1112;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
 	/* Test start/end with end < start */
 	txmsg_start = 1111;
 	txmsg_end = 0;
@@ -1456,7 +1556,20 @@ static int test_start_end(int cgrp)
 	txmsg_start_push = 1601;
 	txmsg_end_push = 1600;
 	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test pop with start > data */
+	txmsg_start_pop = 1601;
+	txmsg_pop = 1;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
 
+	/* Test pop with pop range > data */
+	txmsg_start_pop = 1599;
+	txmsg_pop = 10;
+	err = test_exec(cgrp, &opt);
 out:
 	txmsg_start = 0;
 	txmsg_end = 0;
@@ -1641,6 +1754,12 @@ int main(int argc, char **argv)
 		case 'q':
 			txmsg_end_push = atoi(optarg);
 			break;
+		case 'w':
+			txmsg_start_pop = atoi(optarg);
+			break;
+		case 'x':
+			txmsg_pop = atoi(optarg);
+			break;
 		case 'a':
 			txmsg_apply = atoi(optarg);
 			break;
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index 14b8bba..e7639f6 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -74,7 +74,7 @@ struct bpf_map_def SEC("maps") sock_bytes = {
 	.type = BPF_MAP_TYPE_ARRAY,
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.max_entries = 4
+	.max_entries = 6
 };
 
 struct bpf_map_def SEC("maps") sock_redir_flags = {
@@ -181,8 +181,8 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 SEC("sk_msg1")
 int bpf_prog4(struct sk_msg_md *msg)
 {
-	int *bytes, zero = 0, one = 1, two = 2, three = 3;
-	int *start, *end, *start_push, *end_push;
+	int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int *start, *end, *start_push, *end_push, *start_pop, *pop;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@@ -198,15 +198,19 @@ int bpf_prog4(struct sk_msg_md *msg)
 	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
 	return SK_PASS;
 }
 
 SEC("sk_msg2")
 int bpf_prog5(struct sk_msg_md *msg)
 {
-	int zero = 0, one = 1, two = 2, three = 3;
-	int *start, *end, *start_push, *end_push;
-	int *bytes, len1, len2 = 0, len3;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int *start, *end, *start_push, *end_push, *start_pop, *pop;
+	int *bytes, len1, len2 = 0, len3, len4;
 	int err1 = -1, err2 = -1;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
@@ -247,6 +251,20 @@ int bpf_prog5(struct sk_msg_md *msg)
 		bpf_printk("sk_msg2: length push_update %i->%i\n",
 			   len2 ? len2 : len1, len3);
 	}
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop) {
+		int err;
+
+		bpf_printk("sk_msg2: pop(%i@%i)\n",
+			   start_pop, pop);
+		err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+		if (err)
+			bpf_printk("sk_msg2: pop_data err %i\n", err);
+		len4 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length pop_data %i->%i\n",
+			   len1 ? len1 : 0,  len4);
+	}
 
 	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
 		   len1, err1, err2);
@@ -256,8 +274,8 @@ int bpf_prog5(struct sk_msg_md *msg)
 SEC("sk_msg3")
 int bpf_prog6(struct sk_msg_md *msg)
 {
-	int *bytes, *start, *end, *start_push, *end_push, *f;
-	int zero = 0, one = 1, two = 2, three = 3, key = 0;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
 	__u64 flags = 0;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
@@ -277,6 +295,11 @@ int bpf_prog6(struct sk_msg_md *msg)
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);
 
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@@ -292,8 +315,9 @@ int bpf_prog6(struct sk_msg_md *msg)
 SEC("sk_msg4")
 int bpf_prog7(struct sk_msg_md *msg)
 {
-	int zero = 0, one = 1, two = 2, three = 3, len1, len2 = 0, len3;
-	int *bytes, *start, *end, *start_push, *end_push, *f;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+	int len1, len2 = 0, len3, len4;
 	int err1 = 0, err2 = 0, key = 0;
 	__u64 flags = 0;
 
@@ -335,6 +359,22 @@ int bpf_prog7(struct sk_msg_md *msg)
 			   len2 ? len2 : len1, len3);
 	}
 
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop) {
+		int err;
+
+		bpf_printk("sk_msg4: pop(%i@%i)\n",
+			   start_pop, pop);
+		err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+		if (err)
+			bpf_printk("sk_msg4: pop_data err %i\n", err);
+		len4 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg4: length pop_data %i->%i\n",
+			   len1 ? len1 : 0,  len4);
+	}
+
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@@ -389,8 +429,8 @@ int bpf_prog9(struct sk_msg_md *msg)
 SEC("sk_msg7")
 int bpf_prog10(struct sk_msg_md *msg)
 {
-	int *bytes, *start, *end, *start_push, *end_push;
-	int zero = 0, one = 1, two = 2, three = 3;
+	int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
+	int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@@ -406,7 +446,11 @@ int bpf_prog10(struct sk_msg_md *msg)
 	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
 	if (start_push && end_push)
 		bpf_msg_push_data(msg, *start_push, *end_push, 0);
-
+	start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+	pop = bpf_map_lookup_elem(&sock_bytes, &five);
+	if (start_pop && pop)
+		bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+	bpf_printk("return sk drop\n");
 	return SK_DROP;
 }
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data
  2018-11-23  1:38 [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data John Fastabend
                   ` (2 preceding siblings ...)
  2018-11-23  1:38 ` [PATCH bpf-next 3/3] bpf: test_sockmap, add options for msg_pop_data() John Fastabend
@ 2018-11-26  0:45 ` Daniel Borkmann
  3 siblings, 0 replies; 9+ messages in thread
From: Daniel Borkmann @ 2018-11-26  0:45 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev

On 11/23/2018 02:38 AM, John Fastabend wrote:
> After being able to add metadata to messages with sk_msg_push_data we
> have also found it useful to be able to "pop" this metadata off before
> sending it to applications in some cases. This series adds a new helper
> sk_msg_pop_data() and the associated patches to add tests and tools/lib
> support.
> 
> Thanks!
> 
> John Fastabend (3):
>   bpf: helper to pop data from messages
>   bpf: add msg_pop_data helper to tools
>   bpf: test_sockmap, add options for msg_pop_data() helper usage
> 
>  include/uapi/linux/bpf.h                        |  13 +-
>  net/core/filter.c                               | 169 ++++++++++++++++++++++++
>  net/ipv4/tcp_bpf.c                              |  14 +-
>  tools/include/uapi/linux/bpf.h                  |  13 +-
>  tools/testing/selftests/bpf/bpf_helpers.h       |   2 +
>  tools/testing/selftests/bpf/test_sockmap.c      | 127 +++++++++++++++++-
>  tools/testing/selftests/bpf/test_sockmap_kern.h |  70 ++++++++--
>  7 files changed, 386 insertions(+), 22 deletions(-)
> 

Applied to bpf-next, thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: helper to pop data from messages
  2018-11-23  1:38 ` [PATCH bpf-next 1/3] bpf: helper to pop data from messages John Fastabend
@ 2018-11-26  1:05   ` Daniel Borkmann
  2018-11-26 11:16     ` Quentin Monnet
  2018-11-26 15:43     ` John Fastabend
  0 siblings, 2 replies; 9+ messages in thread
From: Daniel Borkmann @ 2018-11-26  1:05 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev

On 11/23/2018 02:38 AM, John Fastabend wrote:
> This adds a BPF SK_MSG program helper so that we can pop data from a
> msg. We use this to pop metadata from a previous push data call.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/uapi/linux/bpf.h |  13 +++-
>  net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_bpf.c       |  14 +++-
>  3 files changed, 192 insertions(+), 4 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c1554aa..64681f8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2268,6 +2268,16 @@ union bpf_attr {
>   *
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
> + *	 Description
> + *		Will remove 'pop' bytes from a msg starting at byte 'start'.
> + *		This result in ENOMEM errors under certain situations where
> + *		a allocation and copy are required due to a full ring buffer.
> + *		However, the helper will try to avoid doing the allocation
> + *		if possible. Other errors can occur if input parameters are
> + *		invalid either do to start byte not being valid part of msg
> + *		payload and/or pop value being to large.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2360,7 +2370,8 @@ union bpf_attr {
>  	FN(map_push_elem),		\
>  	FN(map_pop_elem),		\
>  	FN(map_peek_elem),		\
> -	FN(msg_push_data),
> +	FN(msg_push_data),		\
> +	FN(msg_pop_data),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index f6ca38a..c6b35b5 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2428,6 +2428,173 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>  
> +static void sk_msg_shift_left(struct sk_msg *msg, int i)
> +{
> +	int prev;
> +
> +	do {
> +		prev = i;
> +		sk_msg_iter_var_next(i);
> +		msg->sg.data[prev] = msg->sg.data[i];
> +	} while (i != msg->sg.end);
> +
> +	sk_msg_iter_prev(msg, end);
> +}
> +
> +static void sk_msg_shift_right(struct sk_msg *msg, int i)
> +{
> +	struct scatterlist tmp, sge;
> +
> +	sk_msg_iter_next(msg, end);
> +	sge = sk_msg_elem_cpy(msg, i);
> +	sk_msg_iter_var_next(i);
> +	tmp = sk_msg_elem_cpy(msg, i);
> +
> +	while (i != msg->sg.end) {
> +		msg->sg.data[i] = sge;
> +		sk_msg_iter_var_next(i);
> +		sge = tmp;
> +		tmp = sk_msg_elem_cpy(msg, i);
> +	}
> +}
> +
> +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
> +	   u32, len, u64, flags)
> +{
> +	u32 i = 0, l, space, offset = 0;
> +	u64 last = start + len;
> +	int pop;
> +
> +	if (unlikely(flags))
> +		return -EINVAL;
> +
> +	/* First find the starting scatterlist element */
> +	i = msg->sg.start;
> +	do {
> +		l = sk_msg_elem(msg, i)->length;
> +
> +		if (start < offset + l)
> +			break;
> +		offset += l;
> +		sk_msg_iter_var_next(i);
> +	} while (i != msg->sg.end);
> +
> +	/* Bounds checks: start and pop must be inside message */
> +	if (start >= offset + l || last >= msg->sg.size)
> +		return -EINVAL;
> +
> +	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
> +
> +	pop = len;
> +	/* --------------| offset
> +	 * -| start      |------- len ------|
> +	 *
> +	 *  |----- a ----|-------- pop -------|----- b ----|
> +	 *  |______________________________________________| length
> +	 *
> +	 *
> +	 * a:   region at front of scatter element to save
> +	 * b:   region at back of scatter element to save when length > A + pop
> +	 * pop: region to pop from element, same as input 'pop' here will be
> +	 *      decremented below per iteration.
> +	 *
> +	 * Two top-level cases to handle when start != offset, first B is non
> +	 * zero and second B is zero corresponding to when a pop includes more
> +	 * than one element.
> +	 *
> +	 * Then if B is non-zero AND there is no space allocate space and
> +	 * compact A, B regions into page. If there is space shift ring to
> +	 * the rigth free'ing the next element in ring to place B, leaving
> +	 * A untouched except to reduce length.
> +	 */
> +	if (start != offset) {
> +		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
> +		int a = start;
> +		int b = sge->length - pop - a;
> +
> +		sk_msg_iter_var_next(i);
> +
> +		if (pop < sge->length - a) {
> +			if (space) {
> +				sge->length = a;
> +				sk_msg_shift_right(msg, i);
> +				nsge = sk_msg_elem(msg, i);
> +				get_page(sg_page(sge));
> +				sg_set_page(nsge,
> +					    sg_page(sge), b, offset + pop);
> +			} else {
> +				struct page *page, *orig;
> +				u8 *to, *from;
> +
> +				page = alloc_pages(__GFP_NOWARN |
> +						   __GFP_COMP   | GFP_ATOMIC,
> +						   get_order(a + b));
> +				if (unlikely(!page))
> +					return -ENOMEM;
> +
> +				sge->length = a;
> +				orig = sg_page(sge);
> +				from = sg_virt(sge);
> +				to = page_address(page);
> +				memcpy(to, from, a);
> +				memcpy(to + a, from + a + pop, b);
> +				sg_set_page(sge, page, a + b, 0);
> +				put_page(orig);
> +			}
> +			pop = 0;
> +		} else if (pop >= sge->length - a) {
> +			sge->length = a;
> +			pop -= (sge->length - a);
> +		}
> +	}
> +
> +	/* From above the current layout _must_ be as follows,
> +	 *
> +	 * -| offset
> +	 * -| start
> +	 *
> +	 *  |---- pop ---|---------------- b ------------|
> +	 *  |____________________________________________| length
> +	 *
> +	 * Offset and start of the current msg elem are equal because in the
> +	 * previous case we handled offset != start and either consumed the
> +	 * entire element and advanced to the next element OR pop == 0.
> +	 *
> +	 * Two cases to handle here are first pop is less than the length
> +	 * leaving some remainder b above. Simply adjust the element's layout
> +	 * in this case. Or pop >= length of the element so that b = 0. In this
> +	 * case advance to next element decrementing pop.
> +	 */
> +	while (pop) {
> +		struct scatterlist *sge = sk_msg_elem(msg, i);
> +
> +		if (pop < sge->length) {
> +			sge->length -= pop;
> +			sge->offset += pop;
> +			pop = 0;
> +		} else {
> +			pop -= sge->length;
> +			sk_msg_shift_left(msg, i);
> +		}
> +		sk_msg_iter_var_next(i);
> +	}
> +
> +	sk_mem_uncharge(msg->sk, len - pop);
> +	msg->sg.size -= (len - pop);
> +	sk_msg_compute_data_pointers(msg);
> +	return 0;
> +}

Hmm, had to revert. I noticed this will have a use after free. While you do the
sk_msg_compute_data_pointers() there is nothing from verifier that forces a
reload of the payload pointers that were set up before the helper call, so they
can still be used after the bpf_msg_pop_data() even though we dropped ref from
page etc.

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: helper to pop data from messages
  2018-11-26  1:05   ` Daniel Borkmann
@ 2018-11-26 11:16     ` Quentin Monnet
  2018-11-26 15:38       ` John Fastabend
  2018-11-26 15:43     ` John Fastabend
  1 sibling, 1 reply; 9+ messages in thread
From: Quentin Monnet @ 2018-11-26 11:16 UTC (permalink / raw)
  To: John Fastabend; +Cc: Daniel Borkmann, ast, netdev

2018-11-26 02:05 UTC+0100 ~ Daniel Borkmann <daniel@iogearbox.net>
> On 11/23/2018 02:38 AM, John Fastabend wrote:
>> This adds a BPF SK_MSG program helper so that we can pop data from a
>> msg. We use this to pop metadata from a previous push data call.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  include/uapi/linux/bpf.h |  13 +++-
>>  net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp_bpf.c       |  14 +++-
>>  3 files changed, 192 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c1554aa..64681f8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2268,6 +2268,16 @@ union bpf_attr {
>>   *
>>   *	Return
>>   *		0 on success, or a negative error in case of failure.
>> + *
>> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
>> + *	 Description
>> + *		Will remove 'pop' bytes from a msg starting at byte 'start'.
>> + *		This result in ENOMEM errors under certain situations where
>> + *		a allocation and copy are required due to a full ring buffer.
>> + *		However, the helper will try to avoid doing the allocation
>> + *		if possible. Other errors can occur if input parameters are
>> + *		invalid either do to start byte not being valid part of msg
>> + *		payload and/or pop value being to large.
>>   */

Hi John,

If you respin could you please update the helper documentation to use
RST syntax for argument and constant names (*pop* instead of 'pop',
*msg*, *start*, *flags*, **ENOMEM**), and document the return value from
the helper?

Thanks a lot,
Quentin

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: helper to pop data from messages
  2018-11-26 11:16     ` Quentin Monnet
@ 2018-11-26 15:38       ` John Fastabend
  0 siblings, 0 replies; 9+ messages in thread
From: John Fastabend @ 2018-11-26 15:38 UTC (permalink / raw)
  To: Quentin Monnet; +Cc: Daniel Borkmann, ast, netdev

On 11/26/18 3:16 AM, Quentin Monnet wrote:
> 2018-11-26 02:05 UTC+0100 ~ Daniel Borkmann <daniel@iogearbox.net>
>> On 11/23/2018 02:38 AM, John Fastabend wrote:
>>> This adds a BPF SK_MSG program helper so that we can pop data from a
>>> msg. We use this to pop metadata from a previous push data call.
>>>
>>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>>> ---
>>>  include/uapi/linux/bpf.h |  13 +++-
>>>  net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
>>>  net/ipv4/tcp_bpf.c       |  14 +++-
>>>  3 files changed, 192 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index c1554aa..64681f8 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2268,6 +2268,16 @@ union bpf_attr {
>>>   *
>>>   *	Return
>>>   *		0 on success, or a negative error in case of failure.
>>> + *
>>> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
>>> + *	 Description
>>> + *		Will remove 'pop' bytes from a msg starting at byte 'start'.
>>> + *		This result in ENOMEM errors under certain situations where
>>> + *		a allocation and copy are required due to a full ring buffer.
>>> + *		However, the helper will try to avoid doing the allocation
>>> + *		if possible. Other errors can occur if input parameters are
>>> + *		invalid either do to start byte not being valid part of msg
>>> + *		payload and/or pop value being to large.
>>>   */
> 
> Hi John,
> 
> If you respin could you please update the helper documentation to use
> RST syntax for argument and constant names (*pop* instead of 'pop',
> *msg*, *start*, *flags*, **ENOMEM**), and document the return value from
> the helper?
> 

Sure no problem.

> Thanks a lot,
> Quentin
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH bpf-next 1/3] bpf: helper to pop data from messages
  2018-11-26  1:05   ` Daniel Borkmann
  2018-11-26 11:16     ` Quentin Monnet
@ 2018-11-26 15:43     ` John Fastabend
  1 sibling, 0 replies; 9+ messages in thread
From: John Fastabend @ 2018-11-26 15:43 UTC (permalink / raw)
  To: Daniel Borkmann, ast; +Cc: netdev

On 11/25/18 5:05 PM, Daniel Borkmann wrote:
> On 11/23/2018 02:38 AM, John Fastabend wrote:
>> This adds a BPF SK_MSG program helper so that we can pop data from a
>> msg. We use this to pop metadata from a previous push data call.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  include/uapi/linux/bpf.h |  13 +++-
>>  net/core/filter.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp_bpf.c       |  14 +++-
>>  3 files changed, 192 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c1554aa..64681f8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2268,6 +2268,16 @@ union bpf_attr {
>>   *
>>   *	Return
>>   *		0 on success, or a negative error in case of failure.
>> + *
>> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
>> + *	 Description
>> + *		Will remove 'pop' bytes from a msg starting at byte 'start'.
>> + *		This result in ENOMEM errors under certain situations where
>> + *		a allocation and copy are required due to a full ring buffer.
>> + *		However, the helper will try to avoid doing the allocation
>> + *		if possible. Other errors can occur if input parameters are
>> + *		invalid either do to start byte not being valid part of msg
>> + *		payload and/or pop value being to large.
>>   */
>>  #define __BPF_FUNC_MAPPER(FN)		\
>>  	FN(unspec),			\
>> @@ -2360,7 +2370,8 @@ union bpf_attr {
>>  	FN(map_push_elem),		\
>>  	FN(map_pop_elem),		\
>>  	FN(map_peek_elem),		\
>> -	FN(msg_push_data),
>> +	FN(msg_push_data),		\
>> +	FN(msg_pop_data),
>>  
>>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>   * function eBPF program intends to call
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index f6ca38a..c6b35b5 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2428,6 +2428,173 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
>>  	.arg4_type	= ARG_ANYTHING,
>>  };
>>  
>> +static void sk_msg_shift_left(struct sk_msg *msg, int i)
>> +{
>> +	int prev;
>> +
>> +	do {
>> +		prev = i;
>> +		sk_msg_iter_var_next(i);
>> +		msg->sg.data[prev] = msg->sg.data[i];
>> +	} while (i != msg->sg.end);
>> +
>> +	sk_msg_iter_prev(msg, end);
>> +}
>> +
>> +static void sk_msg_shift_right(struct sk_msg *msg, int i)
>> +{
>> +	struct scatterlist tmp, sge;
>> +
>> +	sk_msg_iter_next(msg, end);
>> +	sge = sk_msg_elem_cpy(msg, i);
>> +	sk_msg_iter_var_next(i);
>> +	tmp = sk_msg_elem_cpy(msg, i);
>> +
>> +	while (i != msg->sg.end) {
>> +		msg->sg.data[i] = sge;
>> +		sk_msg_iter_var_next(i);
>> +		sge = tmp;
>> +		tmp = sk_msg_elem_cpy(msg, i);
>> +	}
>> +}
>> +
>> +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
>> +	   u32, len, u64, flags)
>> +{
>> +	u32 i = 0, l, space, offset = 0;
>> +	u64 last = start + len;
>> +	int pop;
>> +
>> +	if (unlikely(flags))
>> +		return -EINVAL;
>> +
>> +	/* First find the starting scatterlist element */
>> +	i = msg->sg.start;
>> +	do {
>> +		l = sk_msg_elem(msg, i)->length;
>> +
>> +		if (start < offset + l)
>> +			break;
>> +		offset += l;
>> +		sk_msg_iter_var_next(i);
>> +	} while (i != msg->sg.end);
>> +
>> +	/* Bounds checks: start and pop must be inside message */
>> +	if (start >= offset + l || last >= msg->sg.size)
>> +		return -EINVAL;
>> +
>> +	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
>> +
>> +	pop = len;
>> +	/* --------------| offset
>> +	 * -| start      |------- len ------|
>> +	 *
>> +	 *  |----- a ----|-------- pop -------|----- b ----|
>> +	 *  |______________________________________________| length
>> +	 *
>> +	 *
>> +	 * a:   region at front of scatter element to save
>> +	 * b:   region at back of scatter element to save when length > A + pop
>> +	 * pop: region to pop from element, same as input 'pop' here will be
>> +	 *      decremented below per iteration.
>> +	 *
>> +	 * Two top-level cases to handle when start != offset, first B is non
>> +	 * zero and second B is zero corresponding to when a pop includes more
>> +	 * than one element.
>> +	 *
>> +	 * Then if B is non-zero AND there is no space allocate space and
>> +	 * compact A, B regions into page. If there is space shift ring to
>> +	 * the rigth free'ing the next element in ring to place B, leaving
>> +	 * A untouched except to reduce length.
>> +	 */
>> +	if (start != offset) {
>> +		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
>> +		int a = start;
>> +		int b = sge->length - pop - a;
>> +
>> +		sk_msg_iter_var_next(i);
>> +
>> +		if (pop < sge->length - a) {
>> +			if (space) {
>> +				sge->length = a;
>> +				sk_msg_shift_right(msg, i);
>> +				nsge = sk_msg_elem(msg, i);
>> +				get_page(sg_page(sge));
>> +				sg_set_page(nsge,
>> +					    sg_page(sge), b, offset + pop);
>> +			} else {
>> +				struct page *page, *orig;
>> +				u8 *to, *from;
>> +
>> +				page = alloc_pages(__GFP_NOWARN |
>> +						   __GFP_COMP   | GFP_ATOMIC,
>> +						   get_order(a + b));
>> +				if (unlikely(!page))
>> +					return -ENOMEM;
>> +
>> +				sge->length = a;
>> +				orig = sg_page(sge);
>> +				from = sg_virt(sge);
>> +				to = page_address(page);
>> +				memcpy(to, from, a);
>> +				memcpy(to + a, from + a + pop, b);
>> +				sg_set_page(sge, page, a + b, 0);
>> +				put_page(orig);
>> +			}
>> +			pop = 0;
>> +		} else if (pop >= sge->length - a) {
>> +			sge->length = a;
>> +			pop -= (sge->length - a);
>> +		}
>> +	}
>> +
>> +	/* From above the current layout _must_ be as follows,
>> +	 *
>> +	 * -| offset
>> +	 * -| start
>> +	 *
>> +	 *  |---- pop ---|---------------- b ------------|
>> +	 *  |____________________________________________| length
>> +	 *
>> +	 * Offset and start of the current msg elem are equal because in the
>> +	 * previous case we handled offset != start and either consumed the
>> +	 * entire element and advanced to the next element OR pop == 0.
>> +	 *
>> +	 * Two cases to handle here are first pop is less than the length
>> +	 * leaving some remainder b above. Simply adjust the element's layout
>> +	 * in this case. Or pop >= length of the element so that b = 0. In this
>> +	 * case advance to next element decrementing pop.
>> +	 */
>> +	while (pop) {
>> +		struct scatterlist *sge = sk_msg_elem(msg, i);
>> +
>> +		if (pop < sge->length) {
>> +			sge->length -= pop;
>> +			sge->offset += pop;
>> +			pop = 0;
>> +		} else {
>> +			pop -= sge->length;
>> +			sk_msg_shift_left(msg, i);
>> +		}
>> +		sk_msg_iter_var_next(i);
>> +	}
>> +
>> +	sk_mem_uncharge(msg->sk, len - pop);
>> +	msg->sg.size -= (len - pop);
>> +	sk_msg_compute_data_pointers(msg);
>> +	return 0;
>> +}
> 
> Hmm, had to revert. I noticed this will have a use after free. While you do the
> sk_msg_compute_data_pointers() there is nothing from verifier that forces a
> reload of the payload pointers that were set up before the helper call, so they
> can still be used after the bpf_msg_pop_data() even though we dropped ref from
> page etc.

Thanks for catching this. I will send a v2 with bpf_msg_pop_data() added to
the list in bpf_helper_changes_pkt_data.

> Thanks,
> Daniel
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-11-27  2:37 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-23  1:38 [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data John Fastabend
2018-11-23  1:38 ` [PATCH bpf-next 1/3] bpf: helper to pop data from messages John Fastabend
2018-11-26  1:05   ` Daniel Borkmann
2018-11-26 11:16     ` Quentin Monnet
2018-11-26 15:38       ` John Fastabend
2018-11-26 15:43     ` John Fastabend
2018-11-23  1:38 ` [PATCH bpf-next 2/3] bpf: add msg_pop_data helper to tools John Fastabend
2018-11-23  1:38 ` [PATCH bpf-next 3/3] bpf: test_sockmap, add options for msg_pop_data() John Fastabend
2018-11-26  0:45 ` [PATCH bpf-next 0/3] bpf: add sk_msg helper sk_msg_pop_data Daniel Borkmann

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.