linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Masami Hiramatsu <mhiramat@kernel.org>
To: Beau Belgrave <beaub@linux.microsoft.com>
Cc: rostedt@goodmis.org, linux-trace-devel@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v7 09/13] user_events: Optimize writing events by only copying data once
Date: Fri, 10 Dec 2021 23:51:10 +0900	[thread overview]
Message-ID: <20211210235110.f674dd81e27bdedb231826a2@kernel.org> (raw)
In-Reply-To: <20211209223210.1818-10-beaub@linux.microsoft.com>

Hi Beau,

On Thu,  9 Dec 2021 14:32:06 -0800
Beau Belgrave <beaub@linux.microsoft.com> wrote:

> Pass iterator through to probes to allow copying data directly to the
> probe buffers instead of taking multiple copies. Enables eBPF user and
> raw iterator types out to programs for no-copy scenarios.
> 
> Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
> ---
>  kernel/trace/trace_events_user.c | 102 ++++++++++++++++++++++---------
>  1 file changed, 74 insertions(+), 28 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 807db0af74fb..1d29f6ec907d 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -41,6 +41,10 @@
>  #define MAX_FIELD_ARRAY_SIZE (2 * PAGE_SIZE)
>  #define MAX_FIELD_ARG_NAME 256
>  
> +#define MAX_BPF_COPY_SIZE PAGE_SIZE
> +#define MAX_STACK_BPF_DATA 512
> +#define copy_nofault copy_from_iter_nocache
> +
>  static char *register_page_data;
>  
>  static DEFINE_MUTEX(reg_mutex);
> @@ -78,8 +82,7 @@ struct user_event_refs {
>  	struct user_event *events[];
>  };
>  
> -typedef void (*user_event_func_t) (struct user_event *user,
> -				   void *data, u32 datalen,
> +typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
>  				   void *tpdata);
>  
>  static int user_event_parse(char *name, char *args, char *flags,
> @@ -515,7 +518,7 @@ static struct user_event *find_user_event(char *name, u32 *outkey)
>  /*
>   * Writes the user supplied payload out to a trace file.
>   */
> -static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
> +static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
>  			      void *tpdata)
>  {
>  	struct trace_event_file *file;
> @@ -531,41 +534,85 @@ static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
>  
>  	/* Allocates and fills trace_entry, + 1 of this is data payload */
>  	entry = trace_event_buffer_reserve(&event_buffer, file,
> -					   sizeof(*entry) + datalen);
> +					   sizeof(*entry) + i->count);
>  
>  	if (unlikely(!entry))
>  		return;
>  
> -	memcpy(entry + 1, data, datalen);
> +	if (unlikely(!copy_nofault(entry + 1, i->count, i))) {
> +		__trace_event_discard_commit(event_buffer.buffer,
> +					     event_buffer.event);
> +		return;
> +	}
>  
>  	trace_event_buffer_commit(&event_buffer);
>  }
>  
>  #ifdef CONFIG_PERF_EVENTS
> +static void user_event_bpf(struct user_event *user, struct iov_iter *i)
> +{
> +	struct user_bpf_context context;
> +	struct user_bpf_iter bpf_i;
> +	char fast_data[MAX_STACK_BPF_DATA];
> +	void *temp = NULL;
> +
> +	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
> +		/* Raw iterator */
> +		context.data_type = USER_BPF_DATA_ITER;
> +		context.data_len = i->count;
> +		context.iter = &bpf_i;
> +
> +		bpf_i.iov_offset = i->iov_offset;
> +		bpf_i.iov = i->iov;
> +		bpf_i.nr_segs = i->nr_segs;
> +	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
> +		/* Single buffer from user */
> +		context.data_type = USER_BPF_DATA_USER;
> +		context.data_len = i->count;
> +		context.udata = i->iov->iov_base + i->iov_offset;
> +	} else {
> +		/* Multi buffer from user */
> +		struct iov_iter copy = *i;
> +		size_t copy_size = min(i->count, (size_t)MAX_BPF_COPY_SIZE);
> +
> +		context.data_type = USER_BPF_DATA_KERNEL;
> +		context.kdata = fast_data;
> +
> +		if (unlikely(copy_size > sizeof(fast_data))) {
> +			temp = kmalloc(copy_size, GFP_NOWAIT);
> +
> +			if (temp)
> +				context.kdata = temp;
> +			else
> +				copy_size = sizeof(fast_data);
> +		}
> +
> +		context.data_len = copy_nofault(context.kdata,
> +						copy_size, &copy);
> +	}
> +
> +	trace_call_bpf(&user->call, &context);
> +
> +	kfree(temp);
> +}
> +
>  /*
>   * Writes the user supplied payload out to perf ring buffer or eBPF program.
>   */
> -static void user_event_perf(struct user_event *user, void *data, u32 datalen,
> +static void user_event_perf(struct user_event *user, struct iov_iter *i,
>  			    void *tpdata)
>  {
>  	struct hlist_head *perf_head;
>  
> -	if (bpf_prog_array_valid(&user->call)) {
> -		struct user_bpf_context context = {0};
> -
> -		context.data_len = datalen;
> -		context.data_type = USER_BPF_DATA_KERNEL;
> -		context.kdata = data;
> -
> -		trace_call_bpf(&user->call, &context);
> -	}
> +	if (bpf_prog_array_valid(&user->call))
> +		user_event_bpf(user, i);
>  
>  	perf_head = this_cpu_ptr(user->call.perf_events);
>  
>  	if (perf_head && !hlist_empty(perf_head)) {
>  		struct trace_entry *perf_entry;
>  		struct pt_regs *regs;
> -		size_t size = sizeof(*perf_entry) + datalen;
> +		size_t size = sizeof(*perf_entry) + i->count;
>  		int context;
>  
>  		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
> @@ -576,7 +623,10 @@ static void user_event_perf(struct user_event *user, void *data, u32 datalen,
>  
>  		perf_fetch_caller_regs(regs);
>  
> -		memcpy(perf_entry + 1, data, datalen);
> +		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) {
> +			perf_swevent_put_recursion_context(context);
> +			return;
> +		}
>  
>  		perf_trace_buf_submit(perf_entry, size, context,
>  				      user->call.event.type, 1, regs,
> @@ -1009,32 +1059,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
>  	if (likely(atomic_read(&tp->key.enabled) > 0)) {
>  		struct tracepoint_func *probe_func_ptr;
>  		user_event_func_t probe_func;
> +		struct iov_iter copy;
>  		void *tpdata;
> -		void *kdata;
> -		u32 datalen;
>  
> -		kdata = kmalloc(i->count, GFP_KERNEL);
> -
> -		if (unlikely(!kdata))
> -			return -ENOMEM;
> -
> -		datalen = copy_from_iter(kdata, i->count, i);
> +		if (unlikely(iov_iter_fault_in_readable(i, i->count)))
> +			return -EFAULT;
>  
>  		rcu_read_lock_sched();
> +		pagefault_disable();

Since the pagefault_disable() may have unexpected side effect,
I think it should be used really limited area, e.g. around
actual memory access function.
Can we move this around the copy_nofault()?

Thank you,
>  
>  		probe_func_ptr = rcu_dereference_sched(tp->funcs);
>  
>  		if (probe_func_ptr) {
>  			do {
> +				copy = *i;
>  				probe_func = probe_func_ptr->func;
>  				tpdata = probe_func_ptr->data;
> -				probe_func(user, kdata, datalen, tpdata);
> +				probe_func(user, &copy, tpdata);
>  			} while ((++probe_func_ptr)->func);
>  		}
>  
> +		pagefault_enable();
>  		rcu_read_unlock_sched();
> -
> -		kfree(kdata);
>  	}
>  
>  	return ret;
> -- 
> 2.17.1
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>

  reply	other threads:[~2021-12-10 14:51 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-09 22:31 [PATCH v7 00/13] user_events: Enable user processes to create and write to trace events Beau Belgrave
2021-12-09 22:31 ` [PATCH v7 01/13] user_events: Add UABI header for user access to user_events Beau Belgrave
2021-12-10 13:30   ` Masami Hiramatsu
2021-12-10 17:29     ` Beau Belgrave
2021-12-09 22:31 ` [PATCH v7 02/13] user_events: Add minimal support for trace_event into ftrace Beau Belgrave
2021-12-10 10:43   ` Masami Hiramatsu
2021-12-10 17:43     ` Steven Rostedt
2021-12-13  0:09       ` Masami Hiramatsu
2021-12-13 16:48         ` Steven Rostedt
2021-12-10 18:03     ` Beau Belgrave
2021-12-13  4:24       ` Masami Hiramatsu
2021-12-13 17:58         ` Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 03/13] user_events: Add print_fmt generation support for basic types Beau Belgrave
2021-12-10  2:50   ` Masami Hiramatsu
2021-12-09 22:32 ` [PATCH v7 04/13] user_events: Handle matching arguments from dyn_events Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 05/13] user_events: Add basic perf and eBPF support Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 06/13] user_events: Add self-test for ftrace integration Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 07/13] user_events: Add self-test for dynamic_events integration Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 08/13] user_events: Add self-test for perf_event integration Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 09/13] user_events: Optimize writing events by only copying data once Beau Belgrave
2021-12-10 14:51   ` Masami Hiramatsu [this message]
2021-12-10 18:36     ` Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 10/13] user_events: Add documentation file Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 11/13] user_events: Add sample code for typical usage Beau Belgrave
2021-12-09 22:32 ` [PATCH v7 12/13] user_events: Validate user payloads for size and null termination Beau Belgrave
2021-12-10 14:46   ` Masami Hiramatsu
2021-12-09 22:32 ` [PATCH v7 13/13] user_events: Use __get_rel_str for relative string fields Beau Belgrave
2021-12-10  1:23   ` Masami Hiramatsu
2021-12-10 18:45     ` Beau Belgrave
2021-12-12 15:12       ` Masami Hiramatsu
2021-12-13 18:47         ` Beau Belgrave
2021-12-14  6:30           ` Masami Hiramatsu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211210235110.f674dd81e27bdedb231826a2@kernel.org \
    --to=mhiramat@kernel.org \
    --cc=beaub@linux.microsoft.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-devel@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).