* [PATCH 0/2] Introducing trace buffer mapping by user-space
@ 2023-03-17 14:33 Vincent Donnefort
  2023-03-17 14:33 ` [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions Vincent Donnefort
  2023-03-17 14:33 ` [PATCH 2/2] tracing: Allow user-space mapping of the ring-buffer Vincent Donnefort
  0 siblings, 2 replies; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-17 14:33 UTC (permalink / raw)
  To: rostedt, mhiramat, linux-kernel, linux-trace-kernel
  Cc: kernel-team, Vincent Donnefort

The tracing ring-buffers can be stored on disk or sent to network without any
copy via splice. However, the latter doesn't allow real-time processing of the
traces. A solution is to give userspace direct access to the ring-buffer pages
via a mapping. A piece of software can now become a reader of the ring-buffer,
and drive a consuming or non-consuming read in a similar fashion to what trace
and trace_pipe offer.

Attached to this cover letter is an example of a consuming read of a
ring-buffer, using libtracefs.

Vincent

--

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <signal.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>	/* O_RDONLY */
#include <tracefs.h>
#include <kbuffer.h>
#include <event-parse.h>

#include <asm/types.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#define TRACE_MMAP_IOCTL_GET_READER_PAGE	_IO('T', 0x1)

struct ring_buffer_meta_page {
	__u64		entries;
	__u64		overrun;
	__u32		pages_touched;
	__u32		reader_page;
	__u32		nr_data_pages;
	__u32		data_page_head;
	__u32		data_pages[];
};

/* Need to access private struct to save counters */
struct kbuffer {
	unsigned long long 	timestamp;
	long long		lost_events;
	unsigned long		flags;
	void			*subbuffer;
	void			*data;
	unsigned int		index;
	unsigned int		curr;
	unsigned int		next;
	unsigned int		size;
	unsigned int		start;
	unsigned int		first;

	unsigned int (*read_4)(void *ptr);
	unsigned long long (*read_8)(void *ptr);
	unsigned long long (*read_long)(struct kbuffer *kbuf, void *ptr);
	int (*next_event)(struct kbuffer *kbuf);
};

static char *argv0;
static bool need_exit;

static char *get_this_name(void)
{
	static char *this_name;
	char *arg;
	char *p;

	if (this_name)
		return this_name;

	arg = argv0;
	p = arg+strlen(arg);

	while (p >= arg && *p != '/')
		p--;
	p++;

	this_name = p;
	return p;
}

static void __vdie(const char *fmt, va_list ap, int err)
{
	int ret = errno;
	char *p = get_this_name();

	if (err && errno)
		perror(p);
	else
		ret = -1;

	fprintf(stderr, "  ");
	vfprintf(stderr, fmt, ap);

	fprintf(stderr, "\n");
	exit(ret);
}

void pdie(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	__vdie(fmt, ap, 1);
	va_end(ap);
}

static void read_page(struct tep_handle *tep, struct kbuffer *kbuf,
		      void *data, int page)
{
	static struct trace_seq seq;
	struct tep_record record;

	if (seq.buffer)
		trace_seq_reset(&seq);
	else
		trace_seq_init(&seq);

	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
		kbuffer_next_event(kbuf, NULL);
		tep_print_event(tep, &seq, &record,
				"%s-%d %9d\t%s\n", TEP_PRINT_COMM,
				TEP_PRINT_PID, TEP_PRINT_TIME, TEP_PRINT_NAME);
		trace_seq_do_printf(&seq);
		trace_seq_reset(&seq);
	}
}

static int next_reader_page(int fd, struct ring_buffer_meta_page *meta,
			    struct kbuffer *kbuf)
{
	int prev_reader_page = meta->reader_page;

	if (ioctl(fd, TRACE_MMAP_IOCTL_GET_READER_PAGE) < 0)
		pdie("ioctl");

	return meta->reader_page;
}

static void signal_handler(int unused)
{
	printf("Exit!\n");
	need_exit = true;
}

int main(int argc, char **argv)
{
	int page_size, data_len, page, fd, start = -1;
	struct ring_buffer_meta_page *map;
	struct kbuffer *kbuf, prev_kbuf;
	struct tep_handle *tep;
	__u64 prev_entries;
	void *meta, *data;
	char *buf, path[32];
	int cpu;

	argv0 = argv[0];

	if (argc != 2) {
		fprintf(stderr, "Usage: %s <cpu>\n", argv0);
		return -1;
	}
	cpu = atoi(argv[1]);
	snprintf(path, sizeof(path), "per_cpu/cpu%d/trace_pipe_raw", cpu);

	tep = tracefs_local_events(NULL);
	kbuf = tep_kbuffer(tep);
	page_size = getpagesize();

	fd = tracefs_instance_file_open(NULL, path, O_RDONLY);
	if (fd < 0)
		pdie("raw");

	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
	if (meta == MAP_FAILED)
		pdie("mmap");
	map = meta;

	printf("entries:	%llu\n", map->entries);
	printf("overrun:	%llu\n", map->overrun);
	printf("pages_touched:	%u\n", map->pages_touched);
	printf("reader_page:	%u\n", map->reader_page);
	printf("nr_data_pages:	%u\n\n", map->nr_data_pages);

	data_len = page_size * (map->nr_data_pages + 1);

	data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, fd, page_size);
	if (data == MAP_FAILED)
		pdie("mmap data");

	signal(SIGINT, signal_handler);

	page = ((struct ring_buffer_meta_page*)meta)->reader_page;
again:
	do {
		kbuffer_load_subbuffer(kbuf, data + page_size * page);

		if (page != start) {
			printf("READER PAGE: %d\n", map->reader_page);
		} else {
			kbuf->curr = prev_kbuf.curr;
			kbuf->index = prev_kbuf.index;
			kbuf->next = prev_kbuf.next;
			kbuf->timestamp = prev_kbuf.timestamp;
			kbuffer_next_event(kbuf, NULL);
		}

		prev_entries = map->entries;
		start = page;

		read_page(tep, kbuf, data, page);
	} while ((page = next_reader_page(fd, meta, kbuf)) != start);

	prev_kbuf.curr = kbuf->curr;
	prev_kbuf.index = kbuf->index;
	prev_kbuf.next = kbuf->next;
	prev_kbuf.timestamp = kbuf->timestamp;

	while (prev_entries == *(volatile __u64 *)&map->entries && !need_exit)
		usleep(100000);

	if (!need_exit)
		goto again;

	munmap(data, data_len);
	munmap(meta, page_size);
	close(fd);

	return 0;
}

Vincent Donnefort (2):
  ring-buffer: Introducing ring-buffer mapping functions
  tracing: Allow user-space mapping of the ring-buffer

 include/linux/ring_buffer.h     |   8 +
 include/uapi/linux/trace_mmap.h |  27 +++
 kernel/trace/ring_buffer.c      | 334 +++++++++++++++++++++++++++++++-
 kernel/trace/trace.c            |  73 ++++++-
 4 files changed, 436 insertions(+), 6 deletions(-)
 create mode 100644 include/uapi/linux/trace_mmap.h

-- 
2.40.0.rc1.284.g88254d51c5-goog


* [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-17 14:33 [PATCH 0/2] Introducing trace buffer mapping by user-space Vincent Donnefort
@ 2023-03-17 14:33 ` Vincent Donnefort
  2023-03-21  1:45   ` Steven Rostedt
  2023-03-17 14:33 ` [PATCH 2/2] tracing: Allow user-space mapping of the ring-buffer Vincent Donnefort
  1 sibling, 1 reply; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-17 14:33 UTC (permalink / raw)
  To: rostedt, mhiramat, linux-kernel, linux-trace-kernel
  Cc: kernel-team, Vincent Donnefort

In preparation for allowing user-space to map a ring-buffer, add
a set of mapping functions:

  ring_buffer_{map,unmap}()
  ring_buffer_map_fault()

And controls on the ring-buffer:

  ring_buffer_get_reader_page()  /* swap reader and head */
  ring_buffer_update_meta_page()

Mapping the ring-buffer also involves:

  A unique ID for each page of the ring-buffer, as currently the pages
  are only identified through their in-kernel VA.

  A meta-page, where statistics about the ring-buffer and an ordered
  list of page IDs are stored. One field indicates which page is the
  reader page and another gives where the ring-buffer starts in the
  list of data pages.

The linear mapping exposes the meta-page, followed by each page of the
ring-buffer, ordered by their unique IDs, which are assigned during the
first mapping.

Once mapped, no page can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice-enabling functions will in
reality simply memcpy the data instead of swapping pages.

Also, the meta-page being... a single page, this limits at the moment the
number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
system.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..4897e17ebdde 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -6,6 +6,8 @@
 #include <linux/seq_file.h>
 #include <linux/poll.h>
 
+#include <uapi/linux/trace_mmap.h>
+
 struct trace_buffer;
 struct ring_buffer_iter;
 
@@ -211,4 +213,10 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
 #define trace_rb_cpu_prepare	NULL
 #endif
 
+int ring_buffer_map(struct trace_buffer *buffer, int cpu);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+				   unsigned long pgoff);
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu);
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu);
 #endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
new file mode 100644
index 000000000000..b5caed17a066
--- /dev/null
+++ b/include/uapi/linux/trace_mmap.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_TRACE_MMAP_H_
+#define _UAPI_TRACE_MMAP_H_
+
+#include <asm/bitsperlong.h>
+
+#include <linux/types.h>
+
+struct ring_buffer_meta_page {
+#if __BITS_PER_LONG == 64
+	__u64	entries;
+	__u64	overrun;
+#else
+	__u32	entries;
+	__u32	overrun;
+#endif
+	__u32	pages_touched;
+	__u32	reader_page;
+	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
+	__u32	data_page_head;	/* index of data_pages[] */
+	__u32	data_pages[];
+};
+
+#endif /* _UAPI_TRACE_MMAP_H_ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index af50d931b020..08765310380b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -332,6 +332,7 @@ struct buffer_page {
 	local_t		 entries;	/* entries on this page */
 	unsigned long	 real_end;	/* real end of data */
 	struct buffer_data_page *page;	/* Actual data page */
+	u32		 id;		/* ID for external mapping */
 };
 
 /*
@@ -529,6 +530,12 @@ struct ring_buffer_per_cpu {
 	rb_time_t			before_stamp;
 	u64				event_stamp[MAX_NEST];
 	u64				read_stamp;
+
+	int				mapped;
+	struct mutex			mapping_lock;
+	unsigned long			*page_ids;	/* ID to addr */
+	struct ring_buffer_meta_page	*meta_page;
+
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
@@ -1452,12 +1459,37 @@ static inline void rb_inc_page(struct buffer_page **bpage)
 	*bpage = list_entry(p, struct buffer_page, list);
 }
 
+static inline void
+rb_meta_page_head_move(struct ring_buffer_per_cpu *cpu_buffer, unsigned long num)
+{
+	unsigned long head_id;
+
+	if (!READ_ONCE(cpu_buffer->mapped))
+		return;
+
+	head_id = cpu_buffer->meta_page->data_page_head;
+	cpu_buffer->meta_page->data_page_head = (head_id + num) % cpu_buffer->nr_pages;
+}
+
+static inline void
+rb_meta_page_head_swap(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+
+	if (!READ_ONCE(cpu_buffer->mapped))
+		return;
+
+	meta->reader_page = cpu_buffer->head_page->id;
+	meta->data_pages[meta->data_page_head] = cpu_buffer->reader_page->id;
+}
+
 static struct buffer_page *
 rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *head;
 	struct buffer_page *page;
 	struct list_head *list;
+	unsigned long cnt = 0;
 	int i;
 
 	if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
@@ -1479,9 +1511,12 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 		do {
 			if (rb_is_head_page(page, page->list.prev)) {
 				cpu_buffer->head_page = page;
+				rb_meta_page_head_move(cpu_buffer, cnt);
+
 				return page;
 			}
 			rb_inc_page(&page);
+			cnt++;
 		} while (page != head);
 	}
 
@@ -1567,6 +1602,13 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Again, either we update tail_page or an interrupt does */
 		(void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
 	}
+
+	if (READ_ONCE(cpu_buffer->mapped)) {
+		/* Ensure the meta_page is ready */
+		smp_rmb();
+		WRITE_ONCE(cpu_buffer->meta_page->pages_touched,
+			   local_read(&cpu_buffer->pages_touched));
+	}
 }
 
 static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1735,6 +1777,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+	mutex_init(&cpu_buffer->mapping_lock);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -2173,7 +2216,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	/* prevent another thread from changing buffer sizes */
 	mutex_lock(&buffer->mutex);
 
-
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
 		/*
 		 * Don't succeed if resizing is disabled, as a reader might be
@@ -2523,6 +2565,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
 		local_inc(&cpu_buffer->pages_lost);
 
+		if (READ_ONCE(cpu_buffer->mapped)) {
+			/* Ensure the meta_page is ready */
+			smp_rmb();
+			WRITE_ONCE(cpu_buffer->meta_page->overrun,
+				   local_read(&cpu_buffer->overrun));
+		}
+
 		/*
 		 * The entries will be zeroed out when we move the
 		 * tail page.
@@ -3179,6 +3228,14 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	local_inc(&cpu_buffer->entries);
+
+	if (READ_ONCE(cpu_buffer->mapped)) {
+		/* Ensure the meta_page is ready */
+		smp_rmb();
+		WRITE_ONCE(cpu_buffer->meta_page->entries,
+			   local_read(&cpu_buffer->entries));
+	}
+
 	rb_end_commit(cpu_buffer);
 }
 
@@ -3482,7 +3539,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 		return;
 
 	/*
-	 * If this interrupted another event, 
+	 * If this interrupted another event,
 	 */
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
@@ -4643,7 +4700,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 * Now make the new head point back to the reader page.
 	 */
 	rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
+	rb_meta_page_head_swap(cpu_buffer);
 	rb_inc_page(&cpu_buffer->head_page);
+	rb_meta_page_head_move(cpu_buffer, 1);
 
 	local_inc(&cpu_buffer->pages_read);
 
@@ -5285,6 +5344,12 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->lost_events = 0;
 	cpu_buffer->last_overrun = 0;
 
+	if (READ_ONCE(cpu_buffer->mapped)) {
+		WRITE_ONCE(cpu_buffer->meta_page->entries, 0);
+		WRITE_ONCE(cpu_buffer->meta_page->pages_touched, 0);
+		WRITE_ONCE(cpu_buffer->meta_page->overrun, 0);
+	}
+
 	rb_head_page_activate(cpu_buffer);
 }
 
@@ -5489,6 +5554,11 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
+	if (READ_ONCE(cpu_buffer_a->mapped) || READ_ONCE(cpu_buffer_b->mapped)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	/* At least make sure the two buffers are somewhat the same */
 	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
 		goto out;
@@ -5722,7 +5792,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
 	if (read || (len < (commit - read)) ||
-	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
+	    cpu_buffer->reader_page == cpu_buffer->commit_page ||
+	    READ_ONCE(cpu_buffer->mapped)) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 		unsigned int rpos = read;
 		unsigned int pos = 0;
@@ -5839,6 +5910,263 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+#define META_PAGE_MAX_PAGES \
+	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+
+static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	kfree(cpu_buffer->page_ids);
+	cpu_buffer->page_ids = NULL;
+}
+
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (cpu_buffer->meta_page)
+		return 0;
+
+	if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
+		return -E2BIG;
+
+	cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
+	if (!cpu_buffer->meta_page)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	free_page((unsigned long)cpu_buffer->meta_page);
+	cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+				   unsigned long *page_ids)
+{
+	struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+	struct buffer_page *first_page, *bpage;
+	int id = 0;
+
+	page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+	cpu_buffer->reader_page->id = id++;
+
+	first_page = bpage = rb_set_head_page(cpu_buffer);
+	do {
+		if (id > META_PAGE_MAX_PAGES) {
+			WARN_ON(1);
+			break;
+		}
+
+		page_ids[id] = (unsigned long)bpage->page;
+		bpage->id = id;
+		meta->data_pages[id - 1] = id;
+
+		rb_inc_page(&bpage);
+		id++;
+	} while (bpage != first_page);
+
+	/* install page ID to kern VA translation */
+	cpu_buffer->page_ids = page_ids;
+
+	meta->entries = 0;
+	meta->overrun = 0;
+	meta->pages_touched = 0;
+	meta->reader_page = cpu_buffer->reader_page->id;
+	meta->nr_data_pages = cpu_buffer->nr_pages;
+	meta->data_page_head = 0;
+}
+
+static inline struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return ERR_PTR(-EINVAL);
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		mutex_unlock(&cpu_buffer->mapping_lock);
+		return ERR_PTR(-ENODEV);
+	}
+
+	return cpu_buffer;
+}
+
+static inline void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags, *page_ids;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (cpu_buffer->mapped) {
+		WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped + 1);
+		goto unlock;
+	}
+
+	/* prevent another thread from changing buffer sizes */
+	mutex_lock(&buffer->mutex);
+	atomic_inc(&cpu_buffer->resize_disabled);
+	mutex_unlock(&buffer->mutex);
+
+	err = rb_alloc_meta_page(cpu_buffer);
+	if (err) {
+		atomic_dec(&cpu_buffer->resize_disabled);
+		goto unlock;
+	}
+
+	/* page_ids include the reader page while nr_pages does not */
+	page_ids = kzalloc(sizeof(*page_ids) * (cpu_buffer->nr_pages + 1),
+			   GFP_KERNEL);
+	if (!page_ids) {
+		rb_free_meta_page(cpu_buffer);
+		atomic_dec(&cpu_buffer->resize_disabled);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	/*
+	 * Lock all readers to block any page swap until the page IDs are
+	 * assigned.
+	 */
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	rb_setup_ids_meta_page(cpu_buffer, page_ids);
+	/*
+	 * Ensure the writer will observe the meta-page before
+	 * cpu_buffer->mapped.
+	 */
+	smp_wmb();
+	WRITE_ONCE(cpu_buffer->mapped, 1);
+
+	/* Init meta_page values unless the writer did it already */
+	cmpxchg(&cpu_buffer->meta_page->entries, 0,
+		local_read(&cpu_buffer->entries));
+	cmpxchg(&cpu_buffer->meta_page->overrun, 0,
+		local_read(&cpu_buffer->overrun));
+	cmpxchg(&cpu_buffer->meta_page->pages_touched, 0,
+		local_read(&cpu_buffer->pages_touched));
+
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+unlock:
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		err = -ENODEV;
+		goto unlock;
+	}
+
+	WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped - 1);
+	if (!cpu_buffer->mapped) {
+		/* Wait the writer and readers to observe !mapped */
+		synchronize_rcu();
+
+		rb_free_page_ids(cpu_buffer);
+		rb_free_meta_page(cpu_buffer);
+		atomic_dec(&cpu_buffer->resize_disabled);
+	}
+
+unlock:
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+/*
+ *   +--------------+
+ *   |   meta page  |  pgoff=0
+ *   +--------------+
+ *   |  data page1  |  pgoff=1 page_ids=0
+ *   +--------------+
+ *   |  data page2  |  pgoff=2 page_ids=1
+ *         ...
+ */
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+				   unsigned long pgoff)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+
+	if (!pgoff)
+		return virt_to_page(cpu_buffer->meta_page);
+
+	pgoff--;
+	if (pgoff > cpu_buffer->nr_pages)
+		return NULL;
+
+	return virt_to_page(cpu_buffer->page_ids[pgoff]);
+}
+
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct buffer_page *reader;
+	unsigned long flags;
+
+	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+	if (IS_ERR(cpu_buffer))
+		return (int)PTR_ERR(cpu_buffer);
+
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	reader = cpu_buffer->reader_page;
+	reader->read = rb_page_size(reader);
+	if (!rb_per_cpu_empty(cpu_buffer))
+		WARN_ON(!rb_get_reader_page(cpu_buffer));
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	rb_put_mapped_buffer(cpu_buffer);
+
+	return 0;
+}
+
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+
+	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+	if (IS_ERR(cpu_buffer))
+		return PTR_ERR(cpu_buffer);
+
+	/* Update the head page if the writer moved it */
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	rb_set_head_page(cpu_buffer);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	rb_put_mapped_buffer(cpu_buffer);
+
+	return 0;
+}
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
-- 
2.40.0.rc1.284.g88254d51c5-goog


* [PATCH 2/2] tracing: Allow user-space mapping of the ring-buffer
  2023-03-17 14:33 [PATCH 0/2] Introducing trace buffer mapping by user-space Vincent Donnefort
  2023-03-17 14:33 ` [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions Vincent Donnefort
@ 2023-03-17 14:33 ` Vincent Donnefort
  1 sibling, 0 replies; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-17 14:33 UTC (permalink / raw)
  To: rostedt, mhiramat, linux-kernel, linux-trace-kernel
  Cc: kernel-team, Vincent Donnefort

Currently, user-space extracts data from the ring-buffer via splice,
which is handy for storage or network sharing. However, due to splice
limitations, it is impossible to do real-time analysis without a copy.

A solution for that problem is to let user-space map the ring-buffer
directly.

The mapping is exposed via the per-CPU file trace_pipe_raw. The first
page is the meta-page and is followed by each page of the ring-buffer,
ordered by their unique page ID. It is therefore easy to translate a
page ID to an offset in the mapping.

  * Meta-page -- include/uapi/linux/trace_mmap.h for a description
  * Page ID 0
  * Page ID 1
     ...

The mapper must then do what used to be the kernel's job: swap the
reader page with the head. This is done with a newly introduced ioctl:
TRACE_MMAP_IOCTL_GET_READER_PAGE.

The entries, pages_touched and overrun fields are automatically updated
by the writer. Only readers keep the head page field updated. An
additional ioctl, TRACE_MMAP_IOCTL_UPDATE_META_PAGE, lets user-space
request that update, enabling non-consuming reads from userspace.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index b5caed17a066..24bcec754a35 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -21,4 +21,7 @@ struct ring_buffer_meta_page {
 	__u32	data_pages[];
 };
 
+#define TRACE_MMAP_IOCTL_GET_READER_PAGE	_IO('T', 0x1)
+#define TRACE_MMAP_IOCTL_UPDATE_META_PAGE	_IO('T', 0x2)
+
 #endif /* _UAPI_TRACE_MMAP_H_ */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45551c7b4c36..51d06a2a7545 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6399,7 +6399,7 @@ static void tracing_set_nop(struct trace_array *tr)
 {
 	if (tr->current_trace == &nop_trace)
 		return;
-	
+
 	tr->current_trace->enabled--;
 
 	if (tr->current_trace->reset)
@@ -8432,15 +8432,27 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	return ret;
 }
 
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
 static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct ftrace_buffer_info *info = file->private_data;
 	struct trace_iterator *iter = &info->iter;
 
+	switch (cmd) {
+	case TRACE_MMAP_IOCTL_GET_READER_PAGE:
+		return ring_buffer_get_reader_page(iter->array_buffer->buffer,
+						   iter->cpu_file);
+	case TRACE_MMAP_IOCTL_UPDATE_META_PAGE:
+		return ring_buffer_update_meta_page(iter->array_buffer->buffer,
+						    iter->cpu_file);
+	}
+
 	if (cmd)
-		return -ENOIOCTLCMD;
+		return -ENOTTY;
 
+	/*
+	 * An ioctl call with cmd 0 to the ring buffer file will wake up all
+	 * waiters
+	 */
 	mutex_lock(&trace_types_lock);
 
 	iter->wait_index++;
@@ -8453,6 +8465,60 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
 	return 0;
 }
 
+static vm_fault_t tracing_buffers_mmap_fault(struct vm_fault *vmf)
+{
+	struct ftrace_buffer_info *info = vmf->vma->vm_file->private_data;
+	struct trace_iterator *iter = &info->iter;
+	vm_fault_t ret = VM_FAULT_SIGBUS;
+	struct page *page;
+
+	page = ring_buffer_map_fault(iter->array_buffer->buffer, iter->cpu_file,
+				     vmf->pgoff);
+	if (!page)
+		return ret;
+
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = vma->vm_file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file);
+}
+
+static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = vma->vm_file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	WARN_ON(ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file));
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+	.open		= tracing_buffers_mmap_open,
+	.close		= tracing_buffers_mmap_close,
+	.fault		= tracing_buffers_mmap_fault,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	vm_flags_mod(vma, VM_DONTCOPY | VM_DONTDUMP, VM_MAYWRITE);
+	vma->vm_ops = &tracing_buffers_vmops;
+
+	return ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file);
+}
+
 static const struct file_operations tracing_buffers_fops = {
 	.open		= tracing_buffers_open,
 	.read		= tracing_buffers_read,
@@ -8461,6 +8527,7 @@ static const struct file_operations tracing_buffers_fops = {
 	.splice_read	= tracing_buffers_splice_read,
 	.unlocked_ioctl = tracing_buffers_ioctl,
 	.llseek		= no_llseek,
+	.mmap		= tracing_buffers_mmap,
 };
 
 static ssize_t
-- 
2.40.0.rc1.284.g88254d51c5-goog


* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-17 14:33 ` [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions Vincent Donnefort
@ 2023-03-21  1:45   ` Steven Rostedt
  2023-03-21 15:17     ` Vincent Donnefort
  0 siblings, 1 reply; 11+ messages in thread
From: Steven Rostedt @ 2023-03-21  1:45 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Fri, 17 Mar 2023 14:33:09 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> Also, the meta-page being... a single page, this limits at the moment the
> number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> system.

I hate this limitation, so I fixed it ;-)

I added a meta_page_size field to the meta page, and user space can do:

	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
	if (meta == MAP_FAILED)
		pdie("mmap");

	map = meta;
	meta_len = map->meta_page_size;

	if (meta_len > page_size) {
		munmap(meta, page_size);
		meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
		if (meta == MAP_FAILED)
			pdie("mmap");
		map = meta;
	}

This appears to work (but I'm still testing it).

-- Steve

diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
 	__u32	reader_page;
 	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
 	__u32	data_page_head;	/* index of data_pages[] */
+	__u32	meta_page_size;	/* size of the meta page */
 	__u32	data_pages[];
 };
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..77c92e4a7adc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
 	u64				read_stamp;
 
 	int				mapped;
+	int				meta_order;
 	struct mutex			mapping_lock;
 	unsigned long			*page_ids;	/* ID to addr */
 	struct ring_buffer_meta_page	*meta_page;
@@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 #define META_PAGE_MAX_PAGES \
-	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
 
 static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -5908,22 +5909,34 @@ static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
 
 static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	struct page *meta_pages;
+	int pages;
+	int order = 0;
+
 	if (cpu_buffer->meta_page)
 		return 0;
 
-	if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
-		return -E2BIG;
-
-	cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
-	if (!cpu_buffer->meta_page)
+	if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+		/* Calculate how many more pages we need to hold indexes */
+		pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+				     PAGE_SIZE / sizeof(u32));
+		/* Add back the meta_page itself */
+		pages++;
+		order = fls(pages) - 1;
+	}
+	meta_pages = alloc_pages(GFP_USER, order);
+	if (!meta_pages)
 		return -ENOMEM;
 
+	cpu_buffer->meta_page = page_to_virt(meta_pages);
+	cpu_buffer->meta_order = order;
+
 	return 0;
 }
 
 static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	free_page((unsigned long)cpu_buffer->meta_page);
+	free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
 	cpu_buffer->meta_page = NULL;
 }
 
@@ -5932,14 +5945,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
 	struct buffer_page *first_page, *bpage;
+	int data_page_end;
 	int id = 0;
 
 	page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
 	cpu_buffer->reader_page->id = id++;
 
+	/* Calculate the last index of data_pages[] */
+	data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+		offsetof(struct ring_buffer_meta_page, data_pages);
+	data_page_end /= sizeof(u32);
+
 	first_page = bpage = rb_set_head_page(cpu_buffer);
 	do {
-		if (id > META_PAGE_MAX_PAGES) {
+		if (id > data_page_end) {
 			WARN_ON(1);
 			break;
 		}
@@ -5960,6 +5979,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 	meta->pages_touched = 0;
 	meta->reader_page = cpu_buffer->reader_page->id;
 	meta->nr_data_pages = cpu_buffer->nr_pages;
+	meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
 	meta->data_page_head = 0;
 }
 
@@ -6092,10 +6112,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 /*
  *   +--------------+
  *   |   meta page  |  pgoff=0
+ *   |     ...      |
+ *   |              |  pgoff=(1<<cpu_buffer->meta_order - 1)
  *   +--------------+
- *   |  data page1  |  pgoff=1 page_ids=0
+ *   |  data page1  |  page_ids=0
  *   +--------------+
- *   |  data page2  |  pgoff=2 page_ids=1
+ *   |  data page2  |  page_ids=1
  *         ...
  */
 struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6125,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 
-	if (!pgoff)
-		return virt_to_page(cpu_buffer->meta_page);
+	if (pgoff < (1 << cpu_buffer->meta_order) + 1)
+		return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+	pgoff -= (1 << cpu_buffer->meta_order);
 
-	pgoff--;
 	if (pgoff > cpu_buffer->nr_pages)
 		return NULL;
 

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21  1:45   ` Steven Rostedt
@ 2023-03-21 15:17     ` Vincent Donnefort
  2023-03-21 15:40       ` Steven Rostedt
  0 siblings, 1 reply; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-21 15:17 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> On Fri, 17 Mar 2023 14:33:09 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > Also, the meta-page being... a single page, this limits at the moment the
> > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > system.
> 
> I hate this limitation, so I fixed it ;-)

Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?

> 
> I added a meta_page_size field to the meta page, and user space can do:
> 
> 	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> 	if (meta == MAP_FAILED)
> 		pdie("mmap");
> 
> 	map = meta;
> 	meta_len = map->meta_page_size;
> 
> 	if (meta_len > page_size) {
> 		munmap(meta, page_size);
> 		meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> 		if (meta == MAP_FAILED)
> 			pdie("mmap");
> 		map = meta;
> 	}
> 
> This appears to work (but I'm still testing it).
> 
> -- Steve
> 
> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> index 24bcec754a35..12f3f7ee33d9 100644
> --- a/include/uapi/linux/trace_mmap.h
> +++ b/include/uapi/linux/trace_mmap.h
> @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
>  	__u32	reader_page;
>  	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
>  	__u32	data_page_head;	/* index of data_pages[] */
> +	__u32	meta_page_size;	/* size of the meta page */

Do we want a specific field here? That could be deduced from nr_data_pages()
quite easily?


>  	__u32	data_pages[];
>  };
>  
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 10a17e78cfe6..77c92e4a7adc 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
>  	u64				read_stamp;
>  
>  	int				mapped;
> +	int				meta_order;
>  	struct mutex			mapping_lock;
>  	unsigned long			*page_ids;	/* ID to addr */
>  	struct ring_buffer_meta_page	*meta_page;
> @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  EXPORT_SYMBOL_GPL(ring_buffer_read_page);
>  
>  #define META_PAGE_MAX_PAGES \
> -	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> +	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
>

[...]

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21 15:17     ` Vincent Donnefort
@ 2023-03-21 15:40       ` Steven Rostedt
  2023-03-21 16:20         ` Vincent Donnefort
  2023-03-21 16:44         ` Steven Rostedt
  0 siblings, 2 replies; 11+ messages in thread
From: Steven Rostedt @ 2023-03-21 15:40 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Tue, 21 Mar 2023 15:17:15 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > On Fri, 17 Mar 2023 14:33:09 +0000
> > Vincent Donnefort <vdonnefort@google.com> wrote:
> >   
> > > Also, the meta-page being... a single page, this limits at the moment the
> > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > system.  
> > 
> > I hate this limitation, so I fixed it ;-)  
> 
> Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?

Hold off, I found some bugs that I'm fixing ;-)

> 
> > 
> > I added a meta_page_size field to the meta page, and user space can do:
> > 
> > 	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > 	if (meta == MAP_FAILED)
> > 		pdie("mmap");
> > 
> > 	map = meta;
> > 	meta_len = map->meta_page_size;
> > 
> > 	if (meta_len > page_size) {
> > 		munmap(meta, page_size);
> > 		meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > 		if (meta == MAP_FAILED)
> > 			pdie("mmap");
> > 		map = meta;
> > 	}
> > 
> > This appears to work (but I'm still testing it).
> > 
> > -- Steve
> > 
> > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > index 24bcec754a35..12f3f7ee33d9 100644
> > --- a/include/uapi/linux/trace_mmap.h
> > +++ b/include/uapi/linux/trace_mmap.h
> > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> >  	__u32	reader_page;
> >  	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
> >  	__u32	data_page_head;	/* index of data_pages[] */
> > +	__u32	meta_page_size;	/* size of the meta page */  
> 
> Do we want a specific field here? That could be deduced from nr_data_pages()
> quite easily?

I rather not have too much implementation detail knowledge in user space.
It only removes a single entry, and it makes user space easier. In fact,
I'm thinking we should not include "__u32 data_pages[]" but instead add a:
"__u32 data_start" where user space does:

	__u32 *data_pages = (__u32 *)meta_page + meta_page->data_start;

That way we could extend the data provided by the meta_page in the future.

-- Steve


> 
> 
> >  	__u32	data_pages[];
> >  };
> >  
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 10a17e78cfe6..77c92e4a7adc 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
> >  	u64				read_stamp;
> >  
> >  	int				mapped;
> > +	int				meta_order;
> >  	struct mutex			mapping_lock;
> >  	unsigned long			*page_ids;	/* ID to addr */
> >  	struct ring_buffer_meta_page	*meta_page;
> > @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> >  EXPORT_SYMBOL_GPL(ring_buffer_read_page);
> >  
> >  #define META_PAGE_MAX_PAGES \
> > -	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> > +	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
> >  
> 
> [...]


* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21 15:40       ` Steven Rostedt
@ 2023-03-21 16:20         ` Vincent Donnefort
  2023-03-21 16:51           ` Steven Rostedt
  2023-03-21 16:44         ` Steven Rostedt
  1 sibling, 1 reply; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-21 16:20 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Tue, Mar 21, 2023 at 11:40:47AM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 15:17:15 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > > On Fri, 17 Mar 2023 14:33:09 +0000
> > > Vincent Donnefort <vdonnefort@google.com> wrote:
> > >   
> > > > Also, the meta-page being... a single page, this limits at the moment the
> > > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > > system.  
> > > 
> > > I hate this limitation, so I fixed it ;-)  
> > 
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
> 
> Hold off, I found some bugs that I'm fixing ;-)
> 
> > 
> > > 
> > > I added a meta_page_size field to the meta page, and user space can do:
> > > 
> > > 	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > > 	if (meta == MAP_FAILED)
> > > 		pdie("mmap");
> > > 
> > > 	map = meta;
> > > 	meta_len = map->meta_page_size;
> > > 
> > > 	if (meta_len > page_size) {
> > > 		munmap(meta, page_size);
> > > 		meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > > 		if (meta == MAP_FAILED)
> > > 			pdie("mmap");
> > > 		map = meta;
> > > 	}
> > > 
> > > This appears to work (but I'm still testing it).
> > > 
> > > -- Steve
> > > 
> > > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > > index 24bcec754a35..12f3f7ee33d9 100644
> > > --- a/include/uapi/linux/trace_mmap.h
> > > +++ b/include/uapi/linux/trace_mmap.h
> > > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> > >  	__u32	reader_page;
> > >  	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
> > >  	__u32	data_page_head;	/* index of data_pages[] */
> > > +	__u32	meta_page_size;	/* size of the meta page */  
> > 
> > Do we want a specific field here? That could be deduced from nr_data_pages()
> > quite easily?
> 
> I rather not have too much implementation detail knowledge in user space.
> It only removes a single entry, and it makes user space easier. In fact,

Ack.

> I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> "__u32 data_start" where user space does:
> 
> 	__u32 *data_pages = (__u32 *)meta_page + meta_page->data_start;
> 
> That way we could extend the data provided by the meta_page in the future.

That'd be nice. Couldn't we keep both to simplify the code for the kernel side?

> 
> -- Steve
> 

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21 15:40       ` Steven Rostedt
  2023-03-21 16:20         ` Vincent Donnefort
@ 2023-03-21 16:44         ` Steven Rostedt
  2023-03-21 16:50           ` Vincent Donnefort
  1 sibling, 1 reply; 11+ messages in thread
From: Steven Rostedt @ 2023-03-21 16:44 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Tue, 21 Mar 2023 11:40:47 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > 
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?  
> 
> Hold off, I found some bugs that I'm fixing ;-)

OK, you can fold this in. I also fixed an issue with your patch where it
was missing setting a page->mapping and also clearing it.

I haven't updated to replace "__u32 *data_pages[]" with an "__u32 data_start"
But I think that should still be done.

-- Steve

diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
 	__u32	reader_page;
 	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
 	__u32	data_page_head;	/* index of data_pages[] */
+	__u32	meta_page_size;	/* size of the meta page */
 	__u32	data_pages[];
 };
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..d546fdd14fc3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
 	u64				read_stamp;
 
 	int				mapped;
+	int				meta_order;
 	struct mutex			mapping_lock;
 	unsigned long			*page_ids;	/* ID to addr */
 	struct ring_buffer_meta_page	*meta_page;
@@ -5898,32 +5899,63 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 #define META_PAGE_MAX_PAGES \
-	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+	((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
+
+static void unmap_page(unsigned long addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+}
 
 static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	int i;
+
+	for (i = 0; i < cpu_buffer->nr_pages; i++)
+		unmap_page(cpu_buffer->page_ids[i]);
+
 	kfree(cpu_buffer->page_ids);
 	cpu_buffer->page_ids = NULL;
 }
 
 static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	struct page *meta_pages;
+	int pages;
+	int order = 0;
+
 	if (cpu_buffer->meta_page)
 		return 0;
 
-	if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
-		return -E2BIG;
-
-	cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
-	if (!cpu_buffer->meta_page)
+	if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+		/* Calculate how many more pages we need to hold indexes */
+		pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+				     PAGE_SIZE / sizeof(u32));
+		/* Add back the meta_page itself */
+		pages++;
+		order = fls(pages) - 1;
+	}
+	meta_pages = alloc_pages(GFP_USER, order);
+	if (!meta_pages)
 		return -ENOMEM;
 
+	cpu_buffer->meta_page = page_to_virt(meta_pages);
+	cpu_buffer->meta_order = order;
+
 	return 0;
 }
 
 static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	free_page((unsigned long)cpu_buffer->meta_page);
+	unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+	int i;
+
+	for (i = 0; i < (1 << cpu_buffer->meta_order); i++) {
+		unmap_page(addr);
+		addr += PAGE_SIZE;
+	}
+	free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
 	cpu_buffer->meta_page = NULL;
 }
 
@@ -5932,14 +5964,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
 	struct buffer_page *first_page, *bpage;
+	int data_page_end;
 	int id = 0;
 
 	page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
 	cpu_buffer->reader_page->id = id++;
 
+	/* Calculate the last index of data_pages[] */
+	data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+		offsetof(struct ring_buffer_meta_page, data_pages);
+	data_page_end /= sizeof(u32);
+
 	first_page = bpage = rb_set_head_page(cpu_buffer);
 	do {
-		if (id > META_PAGE_MAX_PAGES) {
+		if (id > data_page_end) {
 			WARN_ON(1);
 			break;
 		}
@@ -5960,6 +5998,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 	meta->pages_touched = 0;
 	meta->reader_page = cpu_buffer->reader_page->id;
 	meta->nr_data_pages = cpu_buffer->nr_pages;
+	meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
 	meta->data_page_head = 0;
 }
 
@@ -6092,10 +6131,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 /*
  *   +--------------+
  *   |   meta page  |  pgoff=0
+ *   |     ...      |
+ *   |              |  pgoff=(1<<cpu_buffer->meta_order - 1)
  *   +--------------+
- *   |  data page1  |  pgoff=1 page_ids=0
+ *   |  data page1  |  page_ids=0
  *   +--------------+
- *   |  data page2  |  pgoff=2 page_ids=1
+ *   |  data page2  |  page_ids=1
  *         ...
  */
 struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6144,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 
-	if (!pgoff)
-		return virt_to_page(cpu_buffer->meta_page);
+	if (pgoff < (1 << cpu_buffer->meta_order))
+		return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+	pgoff -= (1 << cpu_buffer->meta_order);
 
-	pgoff--;
 	if (pgoff > cpu_buffer->nr_pages)
 		return NULL;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea48eabce7b7..2f43e4a842e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8479,9 +8479,12 @@ static vm_fault_t tracing_buffers_mmap_fault(struct vm_fault *vmf)
 	if (!page)
 		return ret;
 
-	get_page(page);
 	vmf->page = page;
 
+	get_page(vmf->page);
+	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
 	return 0;
 }
 

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21 16:44         ` Steven Rostedt
@ 2023-03-21 16:50           ` Vincent Donnefort
  0 siblings, 0 replies; 11+ messages in thread
From: Vincent Donnefort @ 2023-03-21 16:50 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Tue, Mar 21, 2023 at 12:44:25PM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 11:40:47 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > > 
> > > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?  
> > 
> > Hold off, I found some bugs that I'm fixing ;-)
> 
> OK, you can fold this in. I also fixed an issue with your patch where it
> was missing setting a page->mapping and also clearing it.
> 
> I haven't updated to replace "__u32 *data_pages[]" with an "__u32 data_start"
> But I think that should still be done.
> 
> -- Steve
>

[...] 

Thanks! I'll prepare a v2 with all that!

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
  2023-03-21 16:20         ` Vincent Donnefort
@ 2023-03-21 16:51           ` Steven Rostedt
  0 siblings, 0 replies; 11+ messages in thread
From: Steven Rostedt @ 2023-03-21 16:51 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: mhiramat, linux-kernel, linux-trace-kernel, kernel-team

On Tue, 21 Mar 2023 16:20:42 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> > > Do we want a specific field here? That could be deduced from nr_data_pages()
> > > quite easily?  
> > 
> > I rather not have too much implementation detail knowledge in user space.
> > It only removes a single entry, and it makes user space easier. In fact,  
> 
> Ack.
> 
> > I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> > "__u32 data_start" where user space does:
> > 
> > 	__u32 *data_pages = (__u32 *)meta_page + meta_page->data_start;
> > 
> > That way we could extend the data provided by the meta_page in the future.  
> 
> That'd be nice. Couldn't we keep both to simplify the code for the kernel side?

I would not expose the data_pages[] to user space, because then they'll use
it, and that *will* become an API.

But we could expose it to the kernel side with;

include/uapi/linux/trace_mmap.h:

struct ring_buffer_meta_page {
#if __BITS_PER_LONG == 64
	__u64	entries;
	__u64	overrun;
#else
	__u32	entries;
	__u32	overrun;
#endif
	__u32	pages_touched;
	__u32	reader_page;
	__u32	nr_data_pages;	/* doesn't take into account the reader_page */
	__u32	data_page_head;	/* index of data_pages[] */
	__u32	meta_page_size;	/* size of the meta page */
	__u32	data_start;	/* offset to where data_pages are */
};

kernel/trace/ring_buffer.c:

struct ring_buffer_meta {
	struct ring_buffer_meta_page	meta;
	u32				data_pages[];
};

Then we can start each function with:

	struct ring_buffer_meta_page *meta = &cpu_buffer->meta_page.meta;
	u32 *data_pages = cpu_buffer->meta_page.data_pages;

-- Steve

* Re: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
@ 2023-03-18  3:00 kernel test robot
  0 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2023-03-18  3:00 UTC (permalink / raw)
  To: oe-kbuild; +Cc: lkp, Dan Carpenter

BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20230317143310.1604700-2-vdonnefort@google.com>
References: <20230317143310.1604700-2-vdonnefort@google.com>
TO: Vincent Donnefort <vdonnefort@google.com>
TO: rostedt@goodmis.org
TO: mhiramat@kernel.org
TO: linux-kernel@vger.kernel.org
TO: linux-trace-kernel@vger.kernel.org
CC: kernel-team@android.com
CC: Vincent Donnefort <vdonnefort@google.com>

Hi Vincent,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on rostedt-trace/for-next v6.3-rc2 next-20230317]
[cannot apply to rostedt-trace/for-next-urgent]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Vincent-Donnefort/ring-buffer-Introducing-ring-buffer-mapping-functions/20230317-223437
patch link:    https://lore.kernel.org/r/20230317143310.1604700-2-vdonnefort%40google.com
patch subject: [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions
:::::: branch date: 12 hours ago
:::::: commit date: 12 hours ago
config: i386-randconfig-m021 (https://download.01.org/0day-ci/archive/20230318/202303181001.bu7VCBhF-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>
| Link: https://lore.kernel.org/r/202303181001.bu7VCBhF-lkp@intel.com/

smatch warnings:
kernel/trace/ring_buffer.c:6137 ring_buffer_get_reader_page() warn: passing a valid pointer to 'PTR_ERR'
kernel/trace/ring_buffer.c:5996 rb_get_mapped_buffer() warn: inconsistent returns '&cpu_buffer->mapping_lock'.
kernel/trace/ring_buffer.c:6158 ring_buffer_update_meta_page() warn: passing a valid pointer to 'PTR_ERR'

vim +/PTR_ERR +6137 kernel/trace/ring_buffer.c

e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5978  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5979  static inline struct ring_buffer_per_cpu *
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5980  rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5981  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5982  	struct ring_buffer_per_cpu *cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5983  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5984  	if (!cpumask_test_cpu(cpu, buffer->cpumask))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5985  		return ERR_PTR(-EINVAL);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5986  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5987  	cpu_buffer = buffer->buffers[cpu];
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5988  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5989  	mutex_lock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5990  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5991  	if (!cpu_buffer->mapped) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5992  		mutex_unlock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5993  		return ERR_PTR(-ENODEV);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5994  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5995  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17 @5996  	return cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5997  }
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5998  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  5999  static inline void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6000  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6001  	mutex_unlock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6002  }
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6003  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6004  int ring_buffer_map(struct trace_buffer *buffer, int cpu)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6005  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6006  	struct ring_buffer_per_cpu *cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6007  	unsigned long flags, *page_ids;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6008  	int err = 0;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6009  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6010  	if (!cpumask_test_cpu(cpu, buffer->cpumask))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6011  		return -EINVAL;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6012  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6013  	cpu_buffer = buffer->buffers[cpu];
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6014  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6015  	mutex_lock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6016  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6017  	if (cpu_buffer->mapped) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6018  		WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped + 1);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6019  		goto unlock;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6020  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6021  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6022  	/* prevent another thread from changing buffer sizes */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6023  	mutex_lock(&buffer->mutex);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6024  	atomic_inc(&cpu_buffer->resize_disabled);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6025  	mutex_unlock(&buffer->mutex);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6026  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6027  	err = rb_alloc_meta_page(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6028  	if (err) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6029  		atomic_dec(&cpu_buffer->resize_disabled);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6030  		goto unlock;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6031  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6032  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6033  	/* page_ids include the reader page while nr_pages does not */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6034  	page_ids = kzalloc(sizeof(*page_ids) * (cpu_buffer->nr_pages + 1),
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6035  			   GFP_KERNEL);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6036  	if (!page_ids) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6037  		rb_free_meta_page(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6038  		atomic_dec(&cpu_buffer->resize_disabled);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6039  		err = -ENOMEM;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6040  		goto unlock;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6041  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6042  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6043  	/*
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6044  	 * Lock all readers to block any page swap until the page IDs are
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6045  	 * assigned.
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6046  	 */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6047  	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6048  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6049  	rb_setup_ids_meta_page(cpu_buffer, page_ids);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6050  	/*
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6051  	 * Ensure the writer will observe the meta-page before
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6052  	 * cpu_buffer->mapped.
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6053  	 */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6054  	smp_wmb();
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6055  	WRITE_ONCE(cpu_buffer->mapped, 1);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6056  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6057  	/* Init meta_page values unless the writer did it already */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6058  	cmpxchg(&cpu_buffer->meta_page->entries, 0,
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6059  		local_read(&cpu_buffer->entries));
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6060  	cmpxchg(&cpu_buffer->meta_page->overrun, 0,
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6061  		local_read(&cpu_buffer->overrun));
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6062  	cmpxchg(&cpu_buffer->meta_page->pages_touched, 0,
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6063  		local_read(&cpu_buffer->pages_touched));
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6064  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6065  	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6066  unlock:
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6067  	mutex_unlock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6068  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6069  	return err;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6070  }
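
For what it is worth, the smp_wmb()/WRITE_ONCE() pair above implies a matching
read side in the writer path, which is not quoted in this report. A hypothetical
pairing, shown only to make the ordering argument concrete and not taken from
the posted series, could look like:

	if (READ_ONCE(cpu_buffer->mapped)) {
		/*
		 * Pairs with the smp_wmb() in ring_buffer_map(): the load of
		 * cpu_buffer->meta_page below cannot be ordered before the
		 * load of cpu_buffer->mapped above.
		 */
		smp_rmb();
		WRITE_ONCE(cpu_buffer->meta_page->entries,
			   local_read(&cpu_buffer->entries));
	}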
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6071  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6072  int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6073  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6074  	struct ring_buffer_per_cpu *cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6075  	int err = 0;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6076  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6077  	if (!cpumask_test_cpu(cpu, buffer->cpumask))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6078  		return -EINVAL;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6079  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6080  	cpu_buffer = buffer->buffers[cpu];
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6081  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6082  	mutex_lock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6083  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6084  	if (!cpu_buffer->mapped) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6085  		err = -ENODEV;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6086  		goto unlock;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6087  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6088  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6089  	WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped - 1);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6090  	if (!cpu_buffer->mapped) {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6091  		/* Wait the writer and readers to observe !mapped */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6092  		synchronize_rcu();
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6093  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6094  		rb_free_page_ids(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6095  		rb_free_meta_page(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6096  		atomic_dec(&cpu_buffer->resize_disabled);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6097  	}
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6098  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6099  unlock:
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6100  	mutex_unlock(&cpu_buffer->mapping_lock);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6101  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6102  	return err;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6103  }
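
The synchronize_rcu() in the teardown path above only makes sense against
read-side critical sections elsewhere in the writer/reader code. Purely as an
illustration of what it would wait for (none of this is quoted from the series,
and meta_page_update() is a made-up helper name):

	rcu_read_lock();
	if (READ_ONCE(cpu_buffer->mapped))
		meta_page_update(cpu_buffer);	/* hypothetical helper */
	rcu_read_unlock();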
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6104  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6105  /*
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6106   *   +--------------+
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6107   *   |   meta page  |  pgoff=0
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6108   *   +--------------+
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6109   *   |  data page1  |  pgoff=1 page_ids=0
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6110   *   +--------------+
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6111   *   |  data page2  |  pgoff=2 page_ids=1
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6112   *         ...
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6113   */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6114  struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6115  				   unsigned long pgoff)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6116  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6117  	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6118  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6119  	if (!pgoff)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6120  		return virt_to_page(cpu_buffer->meta_page);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6121  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6122  	pgoff--;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6123  	if (pgoff > cpu_buffer->nr_pages)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6124  		return NULL;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6125  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6126  	return virt_to_page(cpu_buffer->page_ids[pgoff]);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6127  }
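
The pgoff layout documented above maps one-to-one onto the arithmetic a
user-space reader would perform after mmap(). A minimal, hypothetical user-space
sketch (fd, nr_data_pages and page_id are placeholders, one page per data page
is assumed, and PROT_READ is a guess at the expected protection):

#include <unistd.h>
#include <sys/mman.h>

/* pgoff 0 is the meta page, data page N sits at pgoff N + 1 */
static void *map_cpu_buffer(int fd, unsigned int nr_data_pages)
{
	size_t len = ((size_t)nr_data_pages + 1) * getpagesize();

	return mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
}

static void *data_page(void *base, unsigned int page_id)
{
	/* page_ids lag their pgoff by one because of the meta page */
	return (char *)base + ((size_t)page_id + 1) * getpagesize();
}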
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6128  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6129  int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6130  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6131  	struct ring_buffer_per_cpu *cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6132  	struct buffer_page *reader;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6133  	unsigned long flags;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6134  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6135  	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6136  	if (IS_ERR(cpu_buffer))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17 @6137  		return (int)PTR_ERR(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6138  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6139  	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6140  	reader = cpu_buffer->reader_page;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6141  	reader->read = rb_page_size(reader);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6142  	if (!rb_per_cpu_empty(cpu_buffer))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6143  		WARN_ON(!rb_get_reader_page(cpu_buffer));
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6144  	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6145  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6146  	rb_put_mapped_buffer(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6147  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6148  	return 0;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6149  }
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6150  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6151  int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu)
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6152  {
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6153  	struct ring_buffer_per_cpu *cpu_buffer;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6154  	unsigned long flags;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6155  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6156  	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6157  	if (IS_ERR(cpu_buffer))
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17 @6158  		return PTR_ERR(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6159  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6160  	/* Update the head page if the writer moved it */
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6161  	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6162  	rb_set_head_page(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6163  	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6164  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6165  	rb_put_mapped_buffer(cpu_buffer);
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6166  
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6167  	return 0;
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6168  }
e6f6ebfdb93b0a Vincent Donnefort 2023-03-17  6169  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

end of thread, other threads:[~2023-03-21 16:52 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-17 14:33 [PATCH 0/2] Introducing trace buffer mapping by user-space Vincent Donnefort
2023-03-17 14:33 ` [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions Vincent Donnefort
2023-03-21  1:45   ` Steven Rostedt
2023-03-21 15:17     ` Vincent Donnefort
2023-03-21 15:40       ` Steven Rostedt
2023-03-21 16:20         ` Vincent Donnefort
2023-03-21 16:51           ` Steven Rostedt
2023-03-21 16:44         ` Steven Rostedt
2023-03-21 16:50           ` Vincent Donnefort
2023-03-17 14:33 ` [PATCH 2/2] tracing: Allow user-space mapping of the ring-buffer Vincent Donnefort
2023-03-18  3:00 [PATCH 1/2] ring-buffer: Introducing ring-buffer mapping functions kernel test robot
