From: Song Liu <songliubraving@fb.com>
To: Namhyung Kim <namhyung@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>,
	Jiri Olsa <jolsa@redhat.com>, Ingo Molnar <mingo@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Andi Kleen <ak@linux.intel.com>, Ian Rogers <irogers@google.com>,
	Stephane Eranian <eranian@google.com>
Subject: Re: [PATCH 4/4] perf stat: Enable BPF counter with --for-each-cgroup
Date: Wed, 30 Jun 2021 18:47:16 +0000
Message-ID: <43811928-C46C-45CE-AB5A-4DE84DCDB1AF@fb.com>
In-Reply-To: <20210625071826.608504-5-namhyung@kernel.org>



> On Jun 25, 2021, at 12:18 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> Recently bperf was added to use BPF to count perf events for various
> purposes.  This is an extension of that approach, targeting cgroup
> usage.
> 
> Unlike the other bperf modes, it doesn't share events with other
> processes, but it avoids opening unnecessary events (and the overhead
> of multiplexing) for each monitored cgroup within the perf session.
> 
> When --for-each-cgroup is used with --bpf-counters, it opens a
> cgroup-switches event per cpu internally and attaches a new BPF
> program that reads the given perf_events and aggregates the results
> per cgroup.  The program is only invoked when a task switches to a
> task in a different cgroup.
> 
> Cc: Song Liu <songliubraving@fb.com>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> tools/perf/Makefile.perf                    |  17 +-
> tools/perf/util/Build                       |   1 +
> tools/perf/util/bpf_counter.c               |   5 +
> tools/perf/util/bpf_counter_cgroup.c        | 299 ++++++++++++++++++++
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 191 +++++++++++++
> tools/perf/util/cgroup.c                    |   2 +
> tools/perf/util/cgroup.h                    |   1 +
> 7 files changed, 515 insertions(+), 1 deletion(-)
> create mode 100644 tools/perf/util/bpf_counter_cgroup.c
> create mode 100644 tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
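
For context, with this patch the feature would be invoked like below (the
cgroup names foo and bar and the event list are just made-up examples):

  # perf stat -a --bpf-counters --for-each-cgroup foo,bar \
	-e cycles,instructions sleep 1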

[...]

> diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
> new file mode 100644
> index 000000000000..327f97a23a84
> --- /dev/null
> +++ b/tools/perf/util/bpf_counter_cgroup.c
> @@ -0,0 +1,299 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* Copyright (c) 2019 Facebook */

I am not sure whether this ^^^ copyright line is accurate.

> +/* Copyright (c) 2021 Google */
> +
> +#include <assert.h>
> +#include <limits.h>
> +#include <unistd.h>
> +#include <sys/file.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
> +#include <linux/err.h>
> +#include <linux/zalloc.h>
> +#include <linux/perf_event.h>
> +#include <api/fs/fs.h>
> +#include <perf/bpf_perf.h>
> +
> +#include "affinity.h"
> +#include "bpf_counter.h"
> +#include "cgroup.h"
> +#include "counts.h"
> +#include "debug.h"
> +#include "evsel.h"
> +#include "evlist.h"
> +#include "target.h"
> +#include "cpumap.h"
> +#include "thread_map.h"
> +
> +#include "bpf_skel/bperf_cgroup.skel.h"
> +
> +static struct perf_event_attr cgrp_switch_attr = {
> +	.type = PERF_TYPE_SOFTWARE,
> +	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
> +	.size = sizeof(cgrp_switch_attr),
> +	.sample_period = 1,
> +	.disabled = 1,
> +};
> +
> +static struct evsel *cgrp_switch;
> +static struct bperf_cgroup_bpf *skel;
> +
> +#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
> +
> +static int bperf_load_program(struct evlist *evlist)
> +{
> +	struct bpf_link *link;
> +	struct evsel *evsel;
> +	struct cgroup *cgrp, *leader_cgrp;
> +	__u32 i, cpu;
> +	int nr_cpus = evlist->core.all_cpus->nr;
> +	int total_cpus = cpu__max_cpu();
> +	int map_size, map_fd;
> +	int prog_fd, err;
> +
> +	skel = bperf_cgroup_bpf__open();
> +	if (!skel) {
> +		pr_err("Failed to open cgroup skeleton\n");
> +		return -1;
> +	}
> +
> +	skel->rodata->num_cpus = total_cpus;
> +	skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;
> +
> +	BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);
> +
> +	/* we need one copy of events per cpu for reading */
> +	map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
> +	bpf_map__resize(skel->maps.events, map_size);
> +	bpf_map__resize(skel->maps.cgrp_idx, nr_cgroups);
> +	/* previous result is saved in a per-cpu array */
> +	map_size = evlist->core.nr_entries / nr_cgroups;
> +	bpf_map__resize(skel->maps.prev_readings, map_size);
> +	/* cgroup result needs all events (per-cpu) */
> +	map_size = evlist->core.nr_entries;
> +	bpf_map__resize(skel->maps.cgrp_readings, map_size);
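
Sanity-checking the sizing math with a made-up example: with total_cpus = 4,
two cgroups and six evlist entries (i.e. three events per cgroup), events
gets 4 * 6 / 2 = 12 slots, prev_readings gets 6 / 2 = 3, and cgrp_readings
gets all 6. That matches my reading of the maps.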
> +
> +	set_max_rlimit();
> +
> +	err = bperf_cgroup_bpf__load(skel);
> +	if (err) {
> +		pr_err("Failed to load cgroup skeleton\n");
> +		goto out;
> +	}
> +
> +	if (cgroup_is_v2("perf_event") > 0)
> +		skel->bss->use_cgroup_v2 = 1;
> +
> +	err = -1;
> +
> +	cgrp_switch = evsel__new(&cgrp_switch_attr);
> +	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
> +		pr_err("Failed to open cgroup switches event\n");
> +		goto out;
> +	}
> +
> +	for (i = 0; i < nr_cpus; i++) {
> +		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
> +						      FD(cgrp_switch, i));
> +		if (IS_ERR(link)) {
> +			pr_err("Failed to attach cgroup program\n");
> +			err = PTR_ERR(link);
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * Update cgrp_idx map from cgroup-id to event index.
> +	 */
> +	cgrp = NULL;
> +	i = 0;
> +
> +	evlist__for_each_entry(evlist, evsel) {
> +		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
> +			leader_cgrp = evsel->cgrp;
> +			evsel->cgrp = NULL;
> +
> +			/* open single copy of the events w/o cgroup */
> +			err = evsel__open_per_cpu(evsel, evlist->core.all_cpus, -1);
> +			if (err) {
> +				pr_err("Failed to open first cgroup events\n");
> +				goto out;
> +			}
> +
> +			map_fd = bpf_map__fd(skel->maps.events);
> +			for (cpu = 0; cpu < nr_cpus; cpu++) {
> +				int fd = FD(evsel, cpu);
> +				__u32 idx = evsel->idx * total_cpus +
> +					evlist->core.all_cpus->map[cpu];
> +
> +				err = bpf_map_update_elem(map_fd, &idx, &fd,
> +							  BPF_ANY);
> +				if (err < 0) {
> +					pr_err("Failed to update perf_event fd\n");
> +					goto out;
> +				}
> +			}
> +
> +			evsel->cgrp = leader_cgrp;
> +		}
> +		evsel->supported = true;
> +
> +		if (evsel->cgrp == cgrp)
> +			continue;
> +
> +		cgrp = evsel->cgrp;
> +
> +		if (read_cgroup_id(cgrp) < 0) {
> +			pr_err("Failed to get cgroup id\n");
> +			err = -1;
> +			goto out;
> +		}
> +
> +		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
> +		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
> +		if (err < 0) {
> +			pr_err("Failed to update cgroup index map\n");
> +			goto out;
> +		}
> +
> +		i++;
> +	}
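
Just to confirm my understanding: with -e cycles,instructions and
--for-each-cgroup A,B (hypothetical names), the evlist is ordered
cycles:A, instructions:A, cycles:B, instructions:B. Only the first pair
is opened as real (cgroup-less) events and stored in the events map, and
cgrp_idx ends up with A -> 0 and B -> 1.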
> +
> +	/*
> +	 * bperf uses BPF_PROG_TEST_RUN to get accurate readings. Check
> +	 * whether the kernel supports it.
> +	 */
> +	prog_fd = bpf_program__fd(skel->progs.trigger_read);
> +	err = bperf_trigger_reading(prog_fd, 0);
> +	if (err) {
> +		pr_debug("The kernel does not support test_run for raw_tp BPF programs.\n"
> +			 "Therefore, --for-each-cgroup might show inaccurate readings\n");

I think this should be a warning, and we should set err = 0 to continue? 
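
Something like this (untested sketch):

	if (err) {
		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
		err = 0;
	}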

> +	}
> +
> +out:
> +	return err;
> +}
> +

[...]

> +
> +/*
> + * Trigger the leader prog on each cpu, so that the cgrp_readings map
> + * gets the latest results.
> + */
> +static int bperf_cgrp__sync_counters(struct evlist *evlist)
> +{
> +	int i, cpu;
> +	int nr_cpus = evlist->core.all_cpus->nr;
> +	int prog_fd = bpf_program__fd(skel->progs.trigger_read);
> +
> +	for (i = 0; i < nr_cpus; i++) {
> +		cpu = evlist->core.all_cpus->map[i];
> +		bperf_trigger_reading(prog_fd, cpu);
> +	}
> +
> +	return 0;
> +}
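
For reference, bperf_trigger_reading() (moved into a common header by
patch 3/4) is roughly:

static inline int bperf_trigger_reading(int prog_fd, int cpu)
{
	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
			    .ctx_in = NULL,
			    .ctx_size_in = 0,
			    .flags = BPF_F_TEST_RUN_ON_CPU,
			    .cpu = cpu,
			    .retval = 0,
	);

	/* run the prog once on the given cpu via BPF_PROG_TEST_RUN */
	return bpf_prog_test_run_opts(prog_fd, &opts);
}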
> +
> +static int bperf_cgrp__enable(struct evsel *evsel)
> +{

Do we need to call bperf_cgrp__sync_counters() before setting enabled to 1?
If we don't, we may include counts accumulated before the events were
enabled, no? (See the sketch after the quoted function.)

> +	skel->bss->enabled = 1;
> +	return 0;
> +}
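
i.e. something like this (just a sketch, assuming evsel->evlist points back
to the owning evlist, and that the BPF program refreshes prev_readings even
while disabled):

static int bperf_cgrp__enable(struct evsel *evsel)
{
	/* refresh prev_readings so the disabled period is not counted */
	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 1;
	return 0;
}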

[...]

