All of lore.kernel.org
 help / color / mirror / Atom feed
From: Francis Laniel <flaniel@linux.microsoft.com>
To: bpf@vger.kernel.org
Cc: Francis Laniel <flaniel@linux.microsoft.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Song Liu <song@kernel.org>, Yonghong Song <yhs@fb.com>,
	John Fastabend <john.fastabend@gmail.com>,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@google.com>, Hao Luo <haoluo@google.com>,
	Jiri Olsa <jolsa@kernel.org>, Jonathan Corbet <corbet@lwn.net>,
	Mykola Lysenko <mykolal@fb.com>, Shuah Khan <shuah@kernel.org>,
	Joanne Koong <joannelkoong@gmail.com>,
	Dave Marchevsky <davemarchevsky@fb.com>,
	Lorenzo Bianconi <lorenzo@kernel.org>,
	Maxim Mikityanskiy <maximmi@nvidia.com>,
	Geliang Tang <geliang.tang@suse.com>,
	"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-kselftest@vger.kernel.org
Subject: [RFC PATCH v2 1/5] bpf: Make ring buffer overwritable.
Date: Tue,  6 Sep 2022 21:56:42 +0200	[thread overview]
Message-ID: <20220906195656.33021-2-flaniel@linux.microsoft.com> (raw)
In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com>

By default, a BPF ring buffer is size bounded: once producers have filled the
buffer, they need to wait for the consumer to read that data before adding new
records.
In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full.

This patch makes it possible to create an overwritable BPF ring buffer.
Once producers have written as much data as the buffer can hold, they begin to
overwrite existing data, so the oldest records are replaced.
As a result, bpf_ringbuf_reserve() never returns NULL.

To avoid extra memory consumption, this patch writes data backward, like the
overwritable perf ring buffer added in
commit 9ecda41acb97 ("perf/core: Add ::write_backward attribute to perf event").

Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
---
 include/uapi/linux/bpf.h |  3 +++
 kernel/bpf/ringbuf.c     | 43 ++++++++++++++++++++++++++++++----------
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59a217ca2dfd..c87a667649ab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1227,6 +1227,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create an overwritable BPF_RINGBUF */
+	BFP_F_RB_OVERWRITABLE	= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index ded4faeca192..369c61cfe8aa 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -12,7 +12,7 @@
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
 
-#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BFP_F_RB_OVERWRITABLE)
 
 /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
 #define RINGBUF_PGOFF \
@@ -37,6 +37,8 @@ struct bpf_ringbuf {
 	u64 mask;
 	struct page **pages;
 	int nr_pages;
+	__u8 overwritable: 1,
+	     __reserved:    7;
 	spinlock_t spinlock ____cacheline_aligned_in_smp;
 	/* Consumer and producer counters are put into separate pages to allow
 	 * mapping consumer page as r/w, but restrict producer page to r/o.
@@ -127,7 +129,12 @@ static void bpf_ringbuf_notify(struct irq_work *work)
 	wake_up_all(&rb->waitq);
 }
 
-static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+static inline bool is_overwritable(struct bpf_ringbuf *rb)
+{
+	return !!rb->overwritable;
+}
+
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, __u32 flags)
 {
 	struct bpf_ringbuf *rb;
 
@@ -142,6 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
 	rb->mask = data_sz - 1;
 	rb->consumer_pos = 0;
 	rb->producer_pos = 0;
+	rb->overwritable = !!(flags & BFP_F_RB_OVERWRITABLE);
 
 	return rb;
 }
@@ -170,7 +178,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
 
 	bpf_map_init_from_attr(&rb_map->map, attr);
 
-	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, attr->map_flags);
 	if (!rb_map->rb) {
 		kfree(rb_map);
 		return ERR_PTR(-ENOMEM);
@@ -248,6 +256,7 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
 
 	cons_pos = smp_load_acquire(&rb->consumer_pos);
 	prod_pos = smp_load_acquire(&rb->producer_pos);
+
 	return prod_pos - cons_pos;
 }
 
@@ -325,14 +334,24 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
 	}
 
 	prod_pos = rb->producer_pos;
-	new_prod_pos = prod_pos + len;
 
-	/* check for out of ringbuf space by ensuring producer position
-	 * doesn't advance more than (ringbuf_size - 1) ahead
-	 */
-	if (new_prod_pos - cons_pos > rb->mask) {
-		spin_unlock_irqrestore(&rb->spinlock, flags);
-		return NULL;
+	if (!is_overwritable(rb)) {
+		new_prod_pos = prod_pos + len;
+
+		/* check for out of ringbuf space by ensuring producer position
+		 * doesn't advance more than (ringbuf_size - 1) ahead
+		 */
+		if (new_prod_pos - cons_pos > rb->mask) {
+			spin_unlock_irqrestore(&rb->spinlock, flags);
+			return NULL;
+		}
+	} else {
+		/*
+		 * With overwritable ring buffer we go from the end toward the
+		 * beginning.
+		 */
+		prod_pos -= len;
+		new_prod_pos = prod_pos;
 	}
 
 	hdr = (void *)rb->data + (prod_pos & rb->mask);
@@ -457,10 +476,14 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
 
 	switch (flags) {
 	case BPF_RB_AVAIL_DATA:
+		if (is_overwritable(rb))
+			return -EINVAL;
 		return ringbuf_avail_data_sz(rb);
 	case BPF_RB_RING_SIZE:
 		return rb->mask + 1;
 	case BPF_RB_CONS_POS:
+		if (is_overwritable(rb))
+			return -EINVAL;
 		return smp_load_acquire(&rb->consumer_pos);
 	case BPF_RB_PROD_POS:
 		return smp_load_acquire(&rb->producer_pos);
-- 
2.25.1


  reply	other threads:[~2022-09-06 20:10 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-06 19:56 [RFC PATCH v2 0/5] Make BPF ring buffer overwritable Francis Laniel
2022-09-06 19:56 ` Francis Laniel [this message]
2022-09-06 19:56 ` [RFC PATCH v2 2/5] selftests: Add BPF overwritable ring buffer self tests Francis Laniel
2022-09-06 19:56 ` [RFC PATCH v2 3/5] docs/bpf: Add documentation for overwritable ring buffer Francis Laniel
2022-09-06 19:56 ` [RFC PATCH v2 4/5] libbpf: Add implementation to consume overwritable BPF " Francis Laniel
2022-09-06 19:56 ` [RFC PATCH v2 5/5] for test purpose only: Add toy to play with BPF ring Francis Laniel
2022-09-28  0:12 ` [RFC PATCH v2 0/5] Make BPF ring buffer overwritable Andrii Nakryiko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220906195656.33021-2-flaniel@linux.microsoft.com \
    --to=flaniel@linux.microsoft.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=daniel@iogearbox.net \
    --cc=davemarchevsky@fb.com \
    --cc=geliang.tang@suse.com \
    --cc=haoluo@google.com \
    --cc=joannelkoong@gmail.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=lorenzo@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=maximmi@nvidia.com \
    --cc=mykolal@fb.com \
    --cc=naveen.n.rao@linux.vnet.ibm.com \
    --cc=sdf@google.com \
    --cc=shuah@kernel.org \
    --cc=song@kernel.org \
    --cc=yhs@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.